This article collects typical usage examples of the Python class fuel.datasets.hdf5.H5PYDataset. If you are wondering what H5PYDataset does, how to use it, or what working code looks like, the hand-picked examples below should help.
Fifteen code examples of the H5PYDataset class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
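All fifteen examples follow the same workflow: write each data source into an HDF5 file, label its axes, attach a 'split' attribute built with H5PYDataset.create_split_array, then load a split by name and iterate over it with a Fuel DataStream. A minimal sketch of that round trip (the file name, shapes, and batch size are placeholders chosen for illustration):

import h5py
import numpy
from fuel.datasets.hdf5 import H5PYDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

# Write a toy dataset: 100 examples, 80 for training and 20 for testing.
with h5py.File('toy.hdf5', mode='w') as h5file:
    features = h5file.create_dataset('features', (100, 5), dtype='float32')
    features[...] = numpy.random.normal(size=(100, 5)).astype('float32')
    features.dims[0].label = 'batch'
    features.dims[1].label = 'feature'
    split_dict = {'train': {'features': (0, 80)},
                  'test': {'features': (80, 100)}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

# Load the training split and iterate over it in minibatches.
train_set = H5PYDataset('toy.hdf5', which_sets=('train',))
scheme = SequentialScheme(examples=train_set.num_examples, batch_size=16)
stream = DataStream(train_set, iteration_scheme=scheme)
for (batch,) in stream.get_epoch_iterator():
    pass  # each batch is a (16, 5) float32 array

The examples vary this pattern: variable-length sources (Examples 1, 5, 11), windowed audio sequences (Examples 2, 14), and index-based splits built from HDF5 references (Example 8).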
Example 1: setUp
def setUp(self):
    self.features = numpy.arange(3600, dtype='uint16').reshape((100, 36))
    self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
    h5file = h5py.File(
        'file.hdf5', mode='w', driver='core', backing_store=False)
    h5file['features'] = self.features
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'] = self.targets
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'
    split_dict = {'train': {'features': (0, 20, None), 'targets': (0, 20)},
                  'test': {'features': (20, 30), 'targets': (20, 30)},
                  'unlabeled': {'features': (30, 100, None, '.')}}
    h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    self.h5file = h5file
    vlen_h5file = h5py.File(
        'test_vl.hdf5', mode='w', driver='core', backing_store=False)
    self.vlen_features = [
        numpy.arange(12, dtype='uint8').reshape((3, 2, 2)),
        numpy.arange(48, dtype='uint8').reshape((3, 4, 4)),
        numpy.arange(60, dtype='uint8').reshape((3, 5, 4)),
        numpy.arange(18, dtype='uint8').reshape((3, 2, 3))]
    self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1))
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    features = vlen_h5file.create_dataset('features', (4,), dtype=dtype)
    features[...] = [d.flatten() for d in self.vlen_features]
    features.dims[0].label = 'batch'
    features_shapes = vlen_h5file.create_dataset(
        'features_shapes', (4, 3), dtype='uint8')
    features_shapes[...] = numpy.array(
        [d.shape for d in self.vlen_features])
    features.dims.create_scale(features_shapes, 'shapes')
    features.dims[0].attach_scale(features_shapes)
    features_shape_labels = vlen_h5file.create_dataset(
        'features_shape_labels', (3,), dtype='S7')
    features_shape_labels[...] = [
        'channel'.encode('utf8'), 'height'.encode('utf8'),
        'width'.encode('utf8')]
    features.dims.create_scale(features_shape_labels, 'shape_labels')
    features.dims[0].attach_scale(features_shape_labels)
    targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8')
    targets[...] = self.vlen_targets
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'
    split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}}
    vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    self.vlen_h5file = vlen_h5file
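For context on the split tuples used in this fixture: as I read Fuel's create_split_array convention, each per-source tuple is (start, stop), optionally followed by an indices entry (None when the subset is just the contiguous [start, stop) range) and a comment string ('.' appears to stand for an empty comment). A condensed restatement of the splits above, not additional API:

from fuel.datasets.hdf5 import H5PYDataset

split_dict = {'train': {'features': (0, 20)},                  # start, stop
              'test': {'features': (20, 30, None)},            # + indices (None = contiguous range)
              'unlabeled': {'features': (30, 100, None, '.')}} # + comment placeholder
split_array = H5PYDataset.create_split_array(split_dict)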
Example 2: build_raw_hdf5_dataset
def build_raw_hdf5_dataset(wav_name, hdf5_name, window_size):
    [rate, signal] = wav.read(wav_name)
    num_steps = signal.shape[0]
    num_seqs = num_steps - window_size
    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)
    signal = signal.reshape(num_steps, 1)
    with h5py.File(output_path, mode='w') as h5file:
        input_feature = h5file.create_dataset(name='input_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        print ' num of sequences : {}'.format(num_seqs)
        for s in xrange(num_seqs):
            input_feature[s] = signal[s:s+window_size]
            target_feature[s] = signal[(s+1):(s+1)+window_size]
        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'
        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'
        split_dict = {'train': {'input_feature': (0, num_seqs),
                                'target_feature': (0, num_seqs)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
    return num_seqs
Example 3: test_h5py_dataset_split
def test_h5py_dataset_split():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 8)},
                      'test': {'features': (8, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        train_handle = train_set.open()
        test_handle = test_set.open()
        assert_equal(
            train_set.get_data(state=train_handle, request=slice(0, 8))[0],
            numpy.arange(50).reshape((10, 5))[:8])
        assert_equal(
            test_set.get_data(state=test_handle, request=slice(0, 2))[0],
            numpy.arange(50).reshape((10, 5))[8:])
        train_set.close(train_handle)
        test_set.close(test_handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Example 4: save_h5py
def save_h5py(tn, start, stop):
    cf = train_features[start:stop]
    ct = train_targets[start:stop]
    np.save(pjoin(numpy_path, prefix+tn+'_features.npy'), cf)
    np.save(pjoin(numpy_path, prefix+tn+'_targets.npy'), ct)
    h5 = h5py.File(pjoin(fuel_path, prefix+tn+'.hdf5'), mode='w')
    h5_features = h5.create_dataset(
        'features', (cf.shape[0], cf.shape[1]*mult), dtype='float32')
    lenf = stop - start
    with ProgressBar(maxval=lenf) as progbar:
        for i in range(lenf):
            arr = []
            for j in range(-concat[0], concat[0]+1, concat[1]):
                arr.extend(cf[(i-j) % lenf])
            h5_features[i] = np.asarray(arr)
            progbar.update(i)
    h5_targets = h5.create_dataset(
        'targets', ct.shape, dtype='uint16')
    h5_targets[...] = ct
    h5_features.dims[0].label = 'batch'
    h5_features.dims[1].label = 'feature'
    h5_targets.dims[0].label = 'batch'
    h5_targets.dims[1].label = 'index'
    split_dict = {
        tn: {'features': (0, stop-start), 'targets': (0, stop-start)},
        #'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))},
    }
    h5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    h5.flush()
    h5.close()
Example 5: CreateHDF5
def CreateHDF5():
    sizes = numpy.random.randint(3, 9, size=(100,))
    train_image_features = [
        numpy.random.randint(256, size=(3, size, size)).astype('uint8')
        for size in sizes[:90]]
    test_image_features = [
        numpy.random.randint(256, size=(3, size, size)).astype('uint8')
        for size in sizes[90:]]
    train_vector_features = numpy.random.normal(size=(90, 10)).astype('float32')
    test_vector_features = numpy.random.normal(size=(10, 10)).astype('float32')
    train_targets = numpy.random.randint(10, size=(90, 1)).astype('uint8')
    test_targets = numpy.random.randint(10, size=(10, 1)).astype('uint8')
    f = h5py.File('dataset.hdf5', mode='w')
    vector_features = f.create_dataset(
        'vector_features', (100, 10), dtype='float32')
    targets = f.create_dataset(
        'targets', (100, 1), dtype='uint8')
    vector_features[...] = numpy.vstack(
        [train_vector_features, test_vector_features])
    targets[...] = numpy.vstack([train_targets, test_targets])
    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'
    all_image_features = train_image_features + test_image_features
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    image_features = f.create_dataset('image_features', (100,), dtype=dtype)
    image_features[...] = [image.flatten() for image in all_image_features]
    image_features.dims[0].label = 'batch'
    image_features_shapes = f.create_dataset(
        'image_features_shapes', (100, 3), dtype='int32')
    image_features_shapes[...] = numpy.array(
        [image.shape for image in all_image_features])
    image_features.dims.create_scale(image_features_shapes, 'shapes')
    image_features.dims[0].attach_scale(image_features_shapes)
    image_features_shape_labels = f.create_dataset(
        'image_features_shape_labels', (3,), dtype='S7')
    image_features_shape_labels[...] = [
        'channel'.encode('utf8'), 'height'.encode('utf8'),
        'width'.encode('utf8')]
    image_features.dims.create_scale(
        image_features_shape_labels, 'shape_labels')
    image_features.dims[0].attach_scale(image_features_shape_labels)
    split_dict = {
        'train': {'vector_features': (0, 90), 'image_features': (0, 90),
                  'targets': (0, 90)},
        'test': {'vector_features': (90, 100), 'image_features': (90, 100),
                 'targets': (90, 100)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
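Because 'image_features' is a variable-length (vlen) source, Fuel relies on the 'shapes' and 'shape_labels' dimension scales attached above to restore each flattened image to its (channel, height, width) shape when the data are read back. A loading sketch, assuming the 'dataset.hdf5' file written by CreateHDF5():

from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset('dataset.hdf5', which_sets=('train',),
                        sources=('image_features', 'targets'))
handle = train_set.open()
images, labels = train_set.get_data(state=handle, request=slice(0, 4))
# 'images' holds four uint8 arrays, each reshaped back to
# (3, height, width) via the 'image_features_shapes' dimension scale.
train_set.close(handle)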
Example 6: createH5Dataset
def createH5Dataset(hdf5_out, corpus_path, sequence_length):
    with open(corpus_path) as f:
        corpus = f.read().split(",")
    (indices, vocab) = pd.factorize(list(corpus))
    instances_num = len(corpus) // (sequence_length + 1)
    f = h5py.File(hdf5_out, mode='w')
    train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    for j in range(instances_num):
        for i in range(sequence_length):
            train_data_x[j][i] = indices[i + j * (sequence_length + 1)]
            train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1]
    char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8')
    char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8')
    char_in[...] = train_data_x
    char_out[...] = train_data_y
    split_dict = {
        'train': {'inchar': (0, instances_num), 'outchar': (0, instances_num)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.attrs["vocab"] = json.dumps(list(vocab))
    f.flush()
    f.close()
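A hedged usage sketch for a file produced by createH5Dataset; the output name is a placeholder. The vocabulary can be recovered from the 'vocab' attribute and the two character sources loaded as the 'train' split:

import json

import h5py
from fuel.datasets.hdf5 import H5PYDataset

# 'corpus.hdf5' stands for the hdf5_out path passed to createH5Dataset above.
with h5py.File('corpus.hdf5', 'r') as f:
    vocab = json.loads(f.attrs['vocab'])  # position i holds the token for index i

train_set = H5PYDataset('corpus.hdf5', which_sets=('train',),
                        sources=('inchar', 'outchar'))
print(train_set.num_examples)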
Example 7: data
def data():
    try:
        hf["target"].shape
    except:
        hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]
    print "number of samples in dataset : %i" % num_samples
    split_dict = {
        'train': {'input': (2000, num_samples), 'target': (2000, num_samples)},
        'test': {'input': (0, 1000), 'target': (0, 1000)},
        'val': {'input': (1000, 2000), 'target': (1000, 2000)}
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val',))
    batch_size = 128
    # TODO: use ShuffledScheme instead? Seems slower, might have screwed up the chunksize in the HDF5 files?
    tr_scheme = SequentialScheme(examples=train_set.num_examples, batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)
    val_scheme = SequentialScheme(examples=val_set.num_examples, batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)
    test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
Example 8: add_sets
def add_sets(args):
    with h5py.File(args.h5file, "a") as h5file:
        sources = []
        for dataset in h5file:
            if dataset.endswith("_indices") or dataset.endswith("_shapes") or dataset.endswith("_shape_labels"):
                continue
            sources.append(dataset)
        uttid2idx = {uttid: idx for (idx, uttid) in enumerate(h5file["uttids"])}
        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split("=")
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])
            indices_name = "{}_indices".format(name)
            if indices_name in h5file:
                del h5file[indices_name]
            #
            # Note: ideally, we would sort the indices and do:
            #     h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref = h5file[indices_name].ref
            split_dict[name] = {source: (-1, -1, indices_ref) for source in sources}
        h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
Example 9: test_value_error_on_unequal_sources
def test_value_error_on_unequal_sources(self):
    def get_subsets():
        return H5PYDataset(self.h5file, which_sets=('train',)).subsets
    split_dict = {'train': {'features': (0, 20), 'targets': (0, 15)},
                  'test': {'features': (20, 30), 'targets': (20, 30)},
                  'unlabeled': {'features': (30, 100, None, '.')}}
    self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    assert_raises(ValueError, get_subsets)
Example 10: save_hd5py
def save_hd5py(dataset, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    images = f.create_dataset('images', dataset.shape, dtype='uint8')
    images[...] = dataset
    split_dict = dict((k, {'images': v}) for k, v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
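A possible call, matching the signature above; the array, file name, and ranges are illustrative, not from the original project:

import numpy

# 100 random 3x32x32 images, split 90/10 into contiguous train/test ranges.
images = numpy.random.randint(256, size=(100, 3, 32, 32)).astype('uint8')
save_hd5py(images, 'images.hdf5', {'train': (0, 90), 'test': (90, 100)})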
Example 11: biblefile_to_hdf5
def biblefile_to_hdf5(open_file):  # TODO REMOVE LINES WITH THE BOOK OF BLABLA
    """Everything in one function because we have variable-length sequences, so no intermediate arrays..."""
    char_to_ind = {"<S>": 0, "</S>": 1}
    current_char_ind = 2  # starts at 2 because 0, 1 are reserved for the start/end-of-sequence characters
    all_verses = []
    # TODO I still don't know what the readout initial_output really does; maybe we need to put <S> into every sequence
    current_verse = []
    for line in open_file:
        # first we need to check if a new verse begins somewhere in the line (not just at the beginning...)
        verse_marker_pos = find_verse_marker(line)
        if len(line.split()) > 0 and verse_marker_pos > -1:
            # if so, save the verse up to the verse marker and start a new one from the rest of the line
            current_verse += list(line[:verse_marker_pos])
            # also replace all characters by integers, creating more mappings if necessary
            for (ind, char) in enumerate(current_verse):
                if char not in char_to_ind:
                    char_to_ind[char] = current_char_ind
                    current_char_ind += 1
                current_verse[ind] = char_to_ind[char]
            current_verse.append(1)  # for the sequence generator we need to explicitly append this end-of-sequence char
            all_verses.append(numpy.array(current_verse, dtype="int32"))
            current_verse = list(line[verse_marker_pos:])
        # otherwise, just put everything into the current verse
        else:
            current_verse += list(line)
    all_verses = numpy.array(all_verses)  # I think this conversion is necessary for the indexing below?
    # at this point we have all our verses =) now we build our .hdf5 dataset
    # make a little validation set
    val_indices = numpy.random.choice(a=len(all_verses), replace=False, size=1500)
    test_set = list(all_verses[val_indices])
    train_set = list(numpy.delete(all_verses, val_indices, 0))
    # if you don't get what's happening here, check the Fuel tutorial on variable-length data (only the 1D part)
    f = h5py.File(name="bible.hdf5", mode="w")
    dtype_varlen_int = h5py.special_dtype(vlen=numpy.dtype("int32"))
    character_seqs = f.create_dataset("character_seqs", (len(all_verses),), dtype=dtype_varlen_int)
    character_seqs[...] = train_set + test_set
    split_dict = {"train": {"character_seqs": (0, len(train_set))},
                  "valid": {"character_seqs": (len(train_set), len(all_verses))}}
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    # we also save current_char_ind (equal to the dimensionality of our one-hot character vectors) to a file
    numpy.save("onehot_size.npy", current_char_ind)
    # also the char-to-index dict
    cPickle.dump(char_to_ind, open("char_to_ind.pkl", mode="w"))
    # make a quick and dirty reverse dict (actually a list) to map from indices to characters, so we can get
    # readable output later
    ind_to_char = [""] * len(char_to_ind)
    ind_to_char[0] = "<S>"
    ind_to_char[1] = "</S>"
    for char in char_to_ind:
        ind_to_char[char_to_ind[char]] = char
    cPickle.dump(ind_to_char, open("ind_to_char.pkl", mode="w"))
Example 12: build_hdf5_dataset
def build_hdf5_dataset(input_filename, output_filename, batch_size=64):
    """
    Builds an HDF5 dataset from the input one. The output file will have
    'train', 'valid', and 'test' splits.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")
    data = input_file["features"][:]
    data_length = data.shape[1]
    #print "Sample from data: {}".format(data[70])
    #if not data_length % batch_size == 0:
    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    batch_index_train = int(0.9 * train_valid_length / float(batch_size))
    batch_index_valid = int(train_valid_length / float(batch_size))
    batch_index_test = int(data_length / float(batch_size))
    print "batch indices in order : {}".format((batch_index_train,
                                                batch_index_valid,
                                                batch_index_test))
    assert(train_valid_length == batch_index_valid * batch_size)
    data = data.reshape(data_length)[:batch_index_test*batch_size]
    data = data.reshape(batch_index_test, batch_size, 1)
    print data.shape
    print ("values lost: {}").format(data_length - data.size)
    test_length = data_length - train_valid_length
    features = output_file.create_dataset(
        name='features',
        shape=data.shape,
        dtype='int16',
        data=data)
    features.dims[0].label = 'batch'
    features.dims[1].label = 'time'
    features.dims[2].label = 'feature'
    split_dict = {
        'train': {
            'features': (0, batch_index_train)},
        'valid': {
            'features': (batch_index_train + 1, batch_index_valid)},
        'test': {
            'features': (batch_index_valid + 1, batch_index_test)}
    }
    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Example 13: save_hd5py
def save_hd5py(dataset_dict, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.iteritems():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.iterkeys()))
                      for k, v in indices_dict.iteritems())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Example 14: build_raw_interval_hdf5_dataset
def build_raw_interval_hdf5_dataset(youtube_id, hdf5_name, interval_size, window_size):
    data_stream = YouTubeAudio(youtube_id).get_example_stream()
    data_stream = Window(offset=interval_size,
                         source_window=interval_size*window_size,
                         target_window=interval_size*window_size,
                         overlapping=True,
                         data_stream=data_stream)
    data_iterator = data_stream.get_epoch_iterator()
    num_sequences = 0
    for data in data_iterator:
        num_sequences = num_sequences + 1
    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)
    print 'total num sequences : ', num_sequences
    with h5py.File(output_path, mode='w') as h5file:
        input_feature = h5file.create_dataset(name='input_feature', shape=(num_sequences, window_size, interval_size), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_sequences, window_size, interval_size), dtype='int16')
        data_iterator = data_stream.get_epoch_iterator()
        # for each batch
        for s_idx, sequence_data in enumerate(data_iterator):
            # get data
            source_data = sequence_data[0]
            target_data = sequence_data[1]
            # save data
            input_feature[s_idx] = source_data.reshape(window_size, interval_size)
            target_feature[s_idx] = target_data.reshape(window_size, interval_size)
        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'
        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'
        num_trains = int(num_sequences*0.8)
        split_dict = {'train': {'input_feature': (0, num_trains),
                                'target_feature': (0, num_trains)},
                      'valid': {'input_feature': (num_trains, num_sequences),
                                'target_feature': (num_trains, num_sequences)},
                      }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
    return num_sequences
Example 15: build_hdf5_dataset_single_dim
def build_hdf5_dataset_single_dim(input_filename, output_filename):
    """
    Builds an HDF5 dataset from the input one. The output file will have
    'train', 'valid', and 'test' splits.
    This function stores the data as a single-dimension dataset.
    Adapted to monk_music.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")
    data = input_file["features"][:]
    data_length = data.shape[1]
    #if not data_length % batch_size == 0:
    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    index_train = int(0.9 * train_valid_length)
    index_valid = int(train_valid_length)
    index_test = int(data_length)
    print "batch indices in order : {}".format((index_train,
                                                index_valid,
                                                index_test))
    data = data.reshape((data_length))
    print "Train example: {}".format(data[index_train-100:index_train])
    print "Valid example: {}".format(data[index_valid-100:index_valid])
    print "Test example: {}".format(data[index_test-100:index_test])
    features = output_file.create_dataset(
        name='features',
        shape=data.shape,
        dtype='int16',
        data=data)
    #features.dims[0].label = 'batch'
    #features.dims[0].label = 'time'
    features.dims[0].label = 'feature'
    split_dict = {
        'train': {
            'features': (0, index_train)},
        'valid': {
            'features': (index_train + 1, index_valid)},
        'test': {
            'features': (index_valid + 1, index_test)}
    }
    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()