
Writing a large hdf5 dataset using h5py


You can infer the column dtypes by reading a small chunk of rows from the start of the text file. Once you have these, you can create a resizable HDF5 dataset and iteratively write chunks of rows from the text file to it.
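
For example, a minimal sketch of the dtype-inference step might look like this (assuming a comma-delimited file with a header row; the path is just a placeholder):

import numpy as np

# Peek at the first few rows only. With dtype=None, genfromtxt infers the
# column dtypes, and names=True picks up the field names from the header row.
sample = np.genfromtxt('/tmp/test.csv', delimiter=',', names=True,
                       dtype=None, max_rows=5)
print(sample.dtype)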

Here's a generator that yields successive chunks of rows from a text file as numpy arrays:

import numpy as np
import warnings


def iter_genfromtxt(path, chunksize=100, **kwargs):
    """Yields consecutive chunks of rows from a text file as numpy arrays.

    Args:
      path: Path to the text file.
      chunksize: Maximum number of rows to yield at a time.
      **kwargs: Additional keyword arguments are passed to `np.genfromtxt`,
        with the exception of `skip_footer` which is unsupported.

    Yields:
      A sequence of `np.ndarray`s with a maximum row dimension of `chunksize`.
    """
    names = kwargs.pop('names', None)
    max_rows = kwargs.pop('max_rows', None)
    skip_header = kwargs.pop('skip_header', kwargs.pop('skiprows', 0))
    if kwargs.pop('skip_footer', None) is not None:
        warnings.warn('`skip_footer` will be ignored')

    with open(path, 'rb') as f:
        # The first chunk is handled separately, since we may wish to skip rows,
        # read column headers etc.
        chunk = np.genfromtxt(f, max_rows=chunksize, skip_header=skip_header,
                              names=names, **kwargs)

        # Ensure that subsequent chunks have consistent dtypes and field names
        kwargs.update({'dtype': chunk.dtype})

        while len(chunk):
            yield chunk[:max_rows]
            if max_rows is not None:
                max_rows -= len(chunk)
                if max_rows <= 0:
                    # `raise StopIteration` inside a generator is an error
                    # since PEP 479 (Python 3.7+); return instead
                    return
            chunk = np.genfromtxt(f, max_rows=chunksize, **kwargs)
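
As a quick sanity check (a sketch assuming the example CSV shown below), the generator can be iterated directly:

for chunk in iter_genfromtxt('/tmp/test.csv', chunksize=5, delimiter=',',
                             names=True, dtype=None):
    # Each chunk is a structured array of up to `chunksize` rows
    print(chunk.shape, chunk.dtype.names)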

Now suppose we have a .csv file containing:

strings,ints,floats
a,1,0.1256290043
b,2,0.0071402451
c,3,0.2551627907
d,4,0.7958570533
e,5,0.8968247722
f,6,0.7291124437
g,7,0.4196829806
h,8,0.398944394
i,9,0.8718244087
j,10,0.67605461
k,11,0.7105670336
l,12,0.6341504091
m,13,0.1324232855
n,14,0.7062503808
o,15,0.1915132527
p,16,0.4140093777
q,17,0.1458217602
r,18,0.1183596433
s,19,0.0014556247
t,20,0.1649811301

We can read this data in chunks of 5 rows at a time and write the resulting arrays to a resizable dataset:

import h5py

# Initialize the generator
gen = iter_genfromtxt('/tmp/test.csv', chunksize=5, delimiter=',', names=True,
                      dtype=None)

# Read the first chunk to get the column dtypes
chunk = next(gen)
dtype = chunk.dtype
row_count = chunk.shape[0]

with h5py.File('/tmp/test.h5', 'w') as f:

    # Initialize a resizable dataset to hold the output
    maxshape = (None,) + chunk.shape[1:]
    dset = f.create_dataset('data', shape=chunk.shape, maxshape=maxshape,
                            chunks=chunk.shape, dtype=chunk.dtype)

    # Write the first chunk of rows
    dset[:] = chunk

    for chunk in gen:

        # Resize the dataset to accommodate the next chunk of rows
        dset.resize(row_count + chunk.shape[0], axis=0)

        # Write the next chunk
        dset[row_count:] = chunk

        # Increment the row count
        row_count += chunk.shape[0]
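
A note on the design: passing maxshape=(None,) + chunk.shape[1:] marks the first axis as unlimited, which is what allows dset.resize to grow the dataset later. Resizable datasets must use chunked storage in HDF5, which is why chunks is specified explicitly in create_dataset.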

Output:

with h5py.File('/tmp/test.h5', 'r') as f:
    print(repr(f['data'][:]))

# array([(b'a', 1, 0.1256290043), (b'b', 2, 0.0071402451),
#        (b'c', 3, 0.2551627907), (b'd', 4, 0.7958570533),
#        (b'e', 5, 0.8968247722), (b'f', 6, 0.7291124437),
#        (b'g', 7, 0.4196829806), (b'h', 8, 0.398944394),
#        (b'i', 9, 0.8718244087), (b'j', 10, 0.67605461),
#        (b'k', 11, 0.7105670336), (b'l', 12, 0.6341504091),
#        (b'm', 13, 0.1324232855), (b'n', 14, 0.7062503808),
#        (b'o', 15, 0.1915132527), (b'p', 16, 0.4140093777),
#        (b'q', 17, 0.1458217602), (b'r', 18, 0.1183596433),
#        (b's', 19, 0.0014556247), (b't', 20, 0.1649811301)],
#       dtype=[('strings', 'S1'), ('ints', '<i8'), ('floats', '<f8')])

For your dataset you will probably want to use a larger chunksize.
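
For example (a sketch only; the path and chunk size are placeholders you would tune for your data):

gen = iter_genfromtxt('/path/to/large_file.txt', chunksize=100000,
                      delimiter=',', names=True, dtype=None)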