Python H5PYDataset.create_split_array Method Code Examples

This article collects typical usage examples of the Python method fuel.datasets.hdf5.H5PYDataset.create_split_array, drawn from open-source projects. If you are unsure what H5PYDataset.create_split_array does or how to use it, the curated examples below should help. You can also explore further usage examples of the enclosing class, fuel.datasets.hdf5.H5PYDataset.


The following presents 15 code examples of H5PYDataset.create_split_array, ordered by popularity.
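Before diving into the examples, here is the pattern they all share: write one or more HDF5 datasets, describe each split as a plain dict mapping source names to row ranges, and serialize that dict into the file's 'split' attribute with create_split_array. A minimal sketch, closely modeled on Example 3 below (assuming a recent fuel where H5PYDataset takes which_sets; the filename demo.hdf5 is illustrative):

import h5py
import numpy
from fuel.datasets.hdf5 import H5PYDataset

with h5py.File('demo.hdf5', mode='w') as f:
    features = f.create_dataset('features', (10, 5), dtype='float32')
    features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
    # Each split maps source names to (start, stop) row ranges.
    split_dict = {'train': {'features': (0, 8)},
                  'test': {'features': (8, 10)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

train_set = H5PYDataset('demo.hdf5', which_sets=('train',))
print(train_set.num_examples)  # 8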

Example 1: setUp

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
    def setUp(self):
        self.features = numpy.arange(3600, dtype='uint16').reshape((100, 36))
        self.targets = numpy.arange(30, dtype='uint8').reshape((30, 1))
        h5file = h5py.File(
            'file.hdf5', mode='w', driver='core', backing_store=False)
        h5file['features'] = self.features
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'feature'
        h5file['targets'] = self.targets
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 20, None), 'targets': (0, 20)},
                      'test': {'features': (20, 30), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100, None, '.')}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.h5file = h5file

        vlen_h5file = h5py.File(
            'test_vl.hdf5', mode='w', driver='core', backing_store=False)
        self.vlen_features = [
            numpy.arange(12, dtype='uint8').reshape((3, 2, 2)),
            numpy.arange(48, dtype='uint8').reshape((3, 4, 4)),
            numpy.arange(60, dtype='uint8').reshape((3, 5, 4)),
            numpy.arange(18, dtype='uint8').reshape((3, 2, 3))]
        self.vlen_targets = numpy.arange(4, dtype='uint8').reshape((4, 1))
        dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
        features = vlen_h5file.create_dataset('features', (4,), dtype=dtype)
        features[...] = [d.flatten() for d in self.vlen_features]
        features.dims[0].label = 'batch'
        features_shapes = vlen_h5file.create_dataset(
            'features_shapes', (4, 3), dtype='uint8')
        features_shapes[...] = numpy.array(
            [d.shape for d in self.vlen_features])
        features.dims.create_scale(features_shapes, 'shapes')
        features.dims[0].attach_scale(features_shapes)
        features_shape_labels = vlen_h5file.create_dataset(
            'features_shape_labels', (3,), dtype='S7')
        features_shape_labels[...] = [
            'channel'.encode('utf8'), 'height'.encode('utf8'),
            'width'.encode('utf8')]
        features.dims.create_scale(features_shape_labels, 'shape_labels')
        features.dims[0].attach_scale(features_shape_labels)
        targets = vlen_h5file.create_dataset('targets', (4, 1), dtype='uint8')
        targets[...] = self.vlen_targets
        targets.dims[0].label = 'batch'
        targets.dims[1].label = 'index'
        split_dict = {'train': {'features': (0, 4), 'targets': (0, 4)}}
        vlen_h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        self.vlen_h5file = vlen_h5file
Developer: Commonlibs | Project: fuel | Lines: 51 | Source: test_hdf5.py
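Example 1 exercises the different per-source tuple forms that create_split_array accepts. A short sketch of the forms side by side (the third slot may also hold an HDF5 reference to an index dataset instead of None, as Example 8 below shows):

split_dict = {
    'train': {'features': (0, 20)},                  # (start, stop)
    'test': {'features': (20, 30, None)},            # (start, stop, indices); None means use the range
    'unlabeled': {'features': (30, 100, None, '.')}  # (start, stop, indices, comment)
}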

Example 2: build_raw_hdf5_dataset

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def build_raw_hdf5_dataset(wav_name, hdf5_name, window_size):
    [rate, signal] = wav.read(wav_name)
    num_steps   = signal.shape[0]
    num_seqs    = num_steps-window_size
    output_path = '{}.hdf5'.format(hdf5_name)

    signal = signal.reshape(num_steps,1)
    with h5py.File(output_path, mode='w') as h5file:
        input_feature  = h5file.create_dataset(name='input_feature' , shape=(num_seqs, window_size, 1), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_seqs, window_size, 1), dtype='int16')
        print(' num of sequences : {}'.format(num_seqs))
        for s in range(num_seqs):
            input_feature[s]  = signal[s:s+window_size]
            target_feature[s] = signal[(s+1):(s+1)+window_size]

        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'

        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'

        split_dict = {'train': {'input_feature' : ( 0,  num_seqs),
                                'target_feature': ( 0,  num_seqs)}}

        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
        h5file.close()

    return num_seqs
Developer: taesupkim | Project: ift6266h16 | Lines: 36 | Source: build_audio_batch_dataset.py

Example 3: test_h5py_dataset_split

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def test_h5py_dataset_split():
    try:
        h5file = h5py.File(name='tmp.hdf5', mode="w")
        features = h5file.create_dataset('features', (10, 5), dtype='float32')
        features[...] = numpy.arange(50, dtype='float32').reshape((10, 5))
        split_dict = {'train': {'features': (0, 8)},
                      'test': {'features': (8, 10)}}
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5file.flush()
        h5file.close()
        train_set = H5PYDataset(path='tmp.hdf5', which_set='train')
        test_set = H5PYDataset(path='tmp.hdf5', which_set='test')
        train_handle = train_set.open()
        test_handle = test_set.open()
        assert_equal(
            train_set.get_data(state=train_handle, request=slice(0, 8))[0],
            numpy.arange(50).reshape((10, 5))[:8])
        assert_equal(
            test_set.get_data(state=test_handle, request=slice(0, 2))[0],
            numpy.arange(50).reshape((10, 5))[8:])
        train_set.close(train_handle)
        test_set.close(test_handle)
    finally:
        if os.path.exists('tmp.hdf5'):
            os.remove('tmp.hdf5')
Developer: kastnerkyle | Project: fuel | Lines: 27 | Source: test_hdf5.py

Example 4: save_h5py

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
    def save_h5py(tn, start, stop):
        cf = train_features[start:stop]
        ct = train_targets[start:stop]
        np.save(pjoin(numpy_path, prefix+tn+'_features.npy'), cf)
        np.save(pjoin(numpy_path, prefix+tn+'_targets.npy'), ct)
        h5 = h5py.File(pjoin(fuel_path, prefix+tn+'.hdf5'), mode='w')
        h5_features = h5.create_dataset(
            'features', (cf.shape[0], cf.shape[1]*mult), dtype='float32')
        lenf = stop - start
        with ProgressBar(maxval=lenf) as progbar:
            for i in range(lenf):
                arr = []
                for j in range(-concat[0], concat[0]+1, concat[1]):
                    arr.extend(cf[(i-j)%lenf])
                h5_features[i] = np.asarray(arr)
                progbar.update(i)

        h5_targets = h5.create_dataset(
            'targets', ct.shape, dtype='uint16')
        h5_targets[...] = ct
        h5_features.dims[0].label = 'batch'
        h5_features.dims[1].label = 'feature'
        h5_targets.dims[0].label = 'batch'
        h5_targets.dims[1].label = 'index'

        split_dict = {
            tn: {'features': (0, stop-start), 'targets': (0, stop-start)},
            #'validate': {'features': (tr_n, len(fet)), 'targets': (tr_n, len(fet))},
        }
        h5.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        h5.flush()
        h5.close()
Developer: AlphaLambdaMuPi | Project: DLAlpha | Lines: 35 | Source: init_fuel.py

Example 5: CreateHDF5

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def CreateHDF5():
    sizes = numpy.random.randint(3,9, size=(100,))
    train_image_features = [
            numpy.random.randint(256, size=(3, size, size)).astype('uint8')
            for size in sizes[:90]]
    test_image_features = [
            numpy.random.randint(256, size=(3, size, size)).astype('uint8')
            for size in sizes[90:]]

    train_vector_features = numpy.random.normal(size=(90,10)).astype('float32')
    test_vector_features = numpy.random.normal(size=(10,10)).astype('float32')
    train_targets = numpy.random.randint(10, size=(90,1)).astype('uint8')
    test_targets = numpy.random.randint(10, size=(10,1)).astype('uint8')

    f = h5py.File('dataset.hdf5', mode='w')
    vector_features = f.create_dataset(
         'vector_features', (100, 10), dtype='float32')
    targets = f.create_dataset(
         'targets', (100, 1), dtype='uint8')

    vector_features[...] = numpy.vstack(
         [train_vector_features, test_vector_features])
    targets[...] = numpy.vstack([train_targets, test_targets])


    vector_features.dims[0].label = 'batch'
    vector_features.dims[1].label = 'feature'
    targets.dims[0].label = 'batch'
    targets.dims[1].label = 'index'

    all_image_features = train_image_features + test_image_features
    dtype = h5py.special_dtype(vlen=numpy.dtype('uint8'))
    image_features = f.create_dataset('image_features', (100,), dtype=dtype)
    image_features[...] = [image.flatten() for image in all_image_features]
    image_features.dims[0].label='batch'

    image_features_shapes = f.create_dataset(
         'image_features_shapes', (100, 3), dtype='int32')
    image_features_shapes[...] = numpy.array(
         [image.shape for image in all_image_features])
    image_features.dims.create_scale(image_features_shapes, 'shapes')
    image_features.dims[0].attach_scale(image_features_shapes)

    image_features_shape_labels = f.create_dataset(
         'image_features_shape_labels', (3,), dtype='S7')
    image_features_shape_labels[...] = [
         'channel'.encode('utf8'), 'height'.encode('utf8'),
         'width'.encode('utf8')]
    image_features.dims.create_scale(
         image_features_shape_labels, 'shape_labels')
    image_features.dims[0].attach_scale(image_features_shape_labels)

    split_dict = {
         'train': {'vector_features': (0, 90), 'image_features': (0, 90),
                   'targets': (0, 90)},
         'test': {'vector_features': (90, 100), 'image_features': (90, 100),
                  'targets': (90, 100)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Developer: pjadzinsky | Project: Fuel-Test-Variable-Length | Lines: 62 | Source: Variable_Length_Data.py
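Reading a file like the one Example 5 builds is symmetric: fuel uses the 'shapes' dimension scale to restore each flattened image to its original (channel, height, width) shape. A hedged sketch of the read side (assuming the 'dataset.hdf5' produced above and an in-memory load):

from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset('dataset.hdf5', which_sets=('train',),
                        sources=('image_features',), load_in_memory=True)
images, = train_set.data_sources
print(images[0].shape, images[1].shape)  # per-image shapes differ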

Example 6: createH5Dataset

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def createH5Dataset(hdf5_out, corpus_path, sequence_length):
    with open(corpus_path) as f:
        corpus = f.read().split(",")

    (indices, vocab) = pd.factorize(list(corpus))

    instances_num = len(corpus) // (sequence_length + 1)

    f = h5py.File(hdf5_out, mode='w')

    train_data_x = np.zeros((instances_num, sequence_length), dtype=np.uint8)
    train_data_y = np.zeros((instances_num, sequence_length), dtype=np.uint8)

    for j in range(instances_num):
        for i in range(sequence_length):
            train_data_x[j][i] = indices[i + j * (sequence_length + 1)]
            train_data_y[j][i] = indices[i + j * (sequence_length + 1) + 1]

    char_in = f.create_dataset('inchar', train_data_x.shape, dtype='uint8')
    char_out = f.create_dataset('outchar', train_data_y.shape, dtype='uint8')

    char_in[...] = train_data_x
    char_out[...] = train_data_y

    split_dict = {
        'train': {'inchar': (0, instances_num), 'outchar': (0, instances_num)}}

    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)

    f.attrs["vocab"] = json.dumps(list(vocab))

    f.flush()
    f.close()
Developer: sharpfun | Project: NeverEndingMusic | Lines: 35 | Source: dataset.py
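On the read side, the vocabulary that createH5Dataset stored in the file attributes can be recovered with json.loads; a small sketch (hdf5_out names the file written above):

import json
import h5py

with h5py.File(hdf5_out, 'r') as f:
    vocab = json.loads(f.attrs['vocab'])
ind_to_char = dict(enumerate(vocab))  # index -> character lookup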

Example 7: data

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def data():

    hf = h5py.File('faces.hdf5', 'r+')
    num_samples = hf["input"].shape[0]

    print("number of samples in dataset : %i" % num_samples)

    split_dict = {
         'train': {'input': (2000, num_samples), 'target': (2000, num_samples)},
         'test': {'input': (0, 1000), 'target': (0, 1000)},
         'val': {'input': (1000, 2000), 'target': (1000, 2000)}
    }
    hf.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    train_set = H5PYDataset('faces.hdf5', which_sets=('train',))
    test_set = H5PYDataset('faces.hdf5', which_sets=('test',))
    val_set = H5PYDataset('faces.hdf5', which_sets=('val',))

    batch_size = 128

#TODO : use shuffledscheme instead?  Seems slower, might have screwed up the chunksize in the HDF5 files?

    tr_scheme = SequentialScheme(examples=train_set.num_examples, batch_size=batch_size)
    tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)

    val_scheme = SequentialScheme(examples=val_set.num_examples, batch_size=batch_size)
    val_stream = DataStream(val_set, iteration_scheme=val_scheme)

    test_scheme = SequentialScheme(examples=test_set.num_examples, batch_size=batch_size)
    test_stream = DataStream(test_set, iteration_scheme=test_scheme)
    hf.close()
    return num_samples, train_set, test_set, val_set, tr_scheme, tr_stream, val_scheme, val_stream, test_scheme, test_stream
Developer: ccienfall | Project: Face-Verification | Lines: 36 | Source: data.py
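The TODO in Example 7 asks about shuffling; fuel's ShuffledScheme is a drop-in replacement for SequentialScheme that visits example indices in random order each epoch, usually at some cost in HDF5 read locality. A sketch under that assumption:

from fuel.schemes import ShuffledScheme
from fuel.streams import DataStream

tr_scheme = ShuffledScheme(examples=train_set.num_examples, batch_size=128)
tr_stream = DataStream(train_set, iteration_scheme=tr_scheme)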

Example 8: add_sets

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def add_sets(args):
    with h5py.File(args.h5file, "a") as h5file:
        sources = []
        for dataset in h5file:
            if dataset.endswith("_indices") or dataset.endswith("_shapes") or dataset.endswith("_shape_labels"):
                continue
            sources.append(dataset)

        uttid2idx = {uttid: idx for (idx, uttid) in enumerate(h5file["uttids"])}

        split_dict = {}
        for subset in args.sets:
            name, uttids_fname = subset.split("=")
            idxs = []
            with open(uttids_fname) as uf:
                for l in uf:
                    uttid = l.strip().split()[0]
                    idxs.append(uttid2idx[uttid])

            indices_name = "{}_indices".format(name)

            if indices_name in h5file:
                del h5file[indices_name]

            #
            # Note: ideally, we would sort the indices and do:
            # h5file[indices_name] = numpy.array(sorted(idxs))
            # but this would cause incompatibility with Kaldi, which keeps utterances sorted by uttid!
            #
            h5file[indices_name] = numpy.array(idxs)
            indices_ref = h5file[indices_name].ref
            split_dict[name] = {source: (-1, -1, indices_ref) for source in sources}

        h5file.attrs["split"] = H5PYDataset.create_split_array(split_dict)
Developer: ZhangAustin | Project: attention-lvcsr | Lines: 36 | Source: kaldi2fuel.py
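Example 8 relies on the reference form of the split tuple: when the third slot holds an HDF5 object reference, H5PYDataset indexes the source through that index array and the (-1, -1) start/stop values are ignored, which is how a non-contiguous (here, Kaldi-ordered) subset is expressed. A condensed sketch (h5file is assumed to be an open, writable h5py.File with a 'features' source):

import numpy
from fuel.datasets.hdf5 import H5PYDataset

h5file['train_indices'] = numpy.array([7, 2, 9, 4])  # any rows, any order
split_dict = {'train': {'features': (-1, -1, h5file['train_indices'].ref)}}
h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)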

Example 9: test_value_error_on_unequal_sources

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
    def test_value_error_on_unequal_sources(self):
        def get_subsets():
            return H5PYDataset(self.h5file, which_sets=('train',)).subsets
        split_dict = {'train': {'features': (0, 20), 'targets': (0, 15)},
                      'test': {'features': (20, 30), 'targets': (20, 30)},
                      'unlabeled': {'features': (30, 100, None, '.')}}
        self.h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        assert_raises(ValueError, get_subsets)
Developer: Commonlibs | Project: fuel | Lines: 10 | Source: test_hdf5.py

Example 10: save_hd5py

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def save_hd5py(dataset, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    images = f.create_dataset('images', dataset.shape, dtype='uint8')
    images[...] = dataset
    split_dict = dict((k, {'images': v}) for k, v in indices_dict.items())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Developer: StevenLOL | Project: video_predict | Lines: 10 | Source: moving_mnist.py

Example 11: biblefile_to_hdf5

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def biblefile_to_hdf5(open_file):  # TODO REMOVE LINES WITH THE BOOK OF BLABLA
    """Everything in one function because we have variable-length sequences, so no intermediate arrays..."""
    char_to_ind = {"<S>": 0, "</S>": 1}
    current_char_ind = 2  # starts at 2 because 0, 1 are reserved for "end/start-of-sequence" character
    all_verses = []
    # TODO I still don't know what the readout initial_output really does; maybe we need to put <S> into every sequence
    current_verse = []
    for line in open_file:
        # first we need to check if a new verse begins somewhere in the line (not just beginning...)
        verse_marker_pos = find_verse_marker(line)
        if len(line.split()) > 0 and verse_marker_pos > -1:
            # if so, save the verse up to the verse marker and start a new one from the rest of the line
            current_verse += list(line[:verse_marker_pos])
            # also replace all characters by integers, creating more mappings if necessary
            for (ind, char) in enumerate(current_verse):
                if char not in char_to_ind:
                    char_to_ind[char] = current_char_ind
                    current_char_ind += 1
                current_verse[ind] = char_to_ind[char]
            current_verse.append(1)  # for sequence generator we need to explicitly append this end-of-sequence char
            all_verses.append(numpy.array(current_verse, dtype="int32"))
            current_verse = list(line[verse_marker_pos:])
        # otherwise, just put everything into the current verse
        else:
            current_verse += list(line)
    all_verses = numpy.array(all_verses, dtype=object)  # object array: needed for the fancy indexing below (newer numpy refuses ragged arrays otherwise)

    # at this point we have all our verses =) now we build our .hdf5 dataset
    # make a little validation set
    val_indices = numpy.random.choice(a=len(all_verses), replace=False, size=1500)
    test_set = list(all_verses[val_indices])
    train_set = list(numpy.delete(all_verses, val_indices, 0))

    # if you don't get what's happening here, check the Fuel tutorial on variable-length data (only the 1D part)
    f = h5py.File(name="bible.hdf5", mode="w")
    dtype_varlen_int = h5py.special_dtype(vlen=numpy.dtype("int32"))
    character_seqs = f.create_dataset("character_seqs", (len(all_verses),), dtype=dtype_varlen_int)
    character_seqs[...] = train_set + test_set

    split_dict = {"train": {"character_seqs": (0, len(train_set))},
                  "valid": {"character_seqs": (len(train_set), len(all_verses))}}
    f.attrs["split"] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()

    # we also save the current_char_ind (equal to dimensionality of our one-hot character vectors) to a file
    numpy.save("onehot_size.npy", current_char_ind)
    # also the character-to-index dict
    pickle.dump(char_to_ind, open("char_to_ind.pkl", mode="wb"))
    # make a quick dirty reverse dict (actually a list) to map from indices to characters, so we can get readable output
    # later
    ind_to_char = [""]*len(char_to_ind)
    ind_to_char[0] = "<S>"
    ind_to_char[1] = "</S>"
    for char in char_to_ind:
        ind_to_char[char_to_ind[char]] = char
    pickle.dump(ind_to_char, open("ind_to_char.pkl", mode="wb"))
Developer: Copper-Head | Project: the-three-stooges | Lines: 59 | Source: bible_processing.py

Example 12: build_hdf5_dataset

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def build_hdf5_dataset(input_filename, output_filename,batch_size=64):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1]

    #print "Sample from data: {}".format(data[70])
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    batch_index_train = int(0.9 * train_valid_length / float(batch_size))
    batch_index_valid = int(train_valid_length / float(batch_size))
    batch_index_test = int(data_length / float(batch_size))

    print "batch indices in order : {}".format((batch_index_train,
                                                batch_index_valid,
                                                batch_index_test))

    assert(train_valid_length == batch_index_valid * batch_size)

    data = data.reshape(data_length)[:batch_index_test*batch_size]
    data = data.reshape(batch_index_test,batch_size,1)
    print(data.shape)

    print("values lost: {}".format(data_length - data.size))
    test_length = data_length - train_valid_length

    features = output_file.create_dataset(
        name='features' ,
        shape=data.shape,
        dtype='int16',
        data=data)

    features.dims[0].label = 'batch'
    features.dims[1].label = 'time'
    features.dims[2].label = 'feature'

    split_dict = {
        'train': {
            'features' : (0, batch_index_train)},
        'valid': {
            'features' : (batch_index_train + 1, batch_index_valid)},
        'test': {
            'features' : (batch_index_valid + 1,batch_index_test)}
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Developer: TiSU32 | Project: vocal_synthesis | Lines: 58 | Source: split_dataset.py

Example 13: save_hd5py

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def save_hd5py(dataset_dict, destfile, indices_dict):
    f = h5py.File(destfile, mode='w')
    for name, dataset in dataset_dict.items():
        dat = f.create_dataset(name, dataset.shape, dtype=str(dataset.dtype))
        dat[...] = dataset
    split_dict = dict((k, dict((name, v) for name in dataset_dict.keys()))
                      for k, v in indices_dict.items())
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
Developer: StevenLOL | Project: video_predict | Lines: 12 | Source: utils.py

Example 14: build_raw_interval_hdf5_dataset

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def build_raw_interval_hdf5_dataset(youtube_id, hdf5_name, interval_size, window_size):
    data_stream = YouTubeAudio(youtube_id).get_example_stream()

    data_stream = Window(offset=interval_size,
                         source_window=interval_size*window_size,
                         target_window=interval_size*window_size,
                         overlapping=True,
                         data_stream=data_stream)

    data_iterator = data_stream.get_epoch_iterator()

    num_sequences = 0
    for data in data_iterator:
        num_sequences = num_sequences + 1

    output_path = '{}.hdf5'.format(hdf5_name)
    output_path = os.path.join(output_path)
    print('total num sequences :', num_sequences)
    with h5py.File(output_path, mode='w') as h5file:
        input_feature  = h5file.create_dataset(name='input_feature' , shape=(num_sequences, window_size, interval_size), dtype='int16')
        target_feature = h5file.create_dataset(name='target_feature', shape=(num_sequences, window_size, interval_size), dtype='int16')

        data_iterator = data_stream.get_epoch_iterator()
        # for each batch
        for s_idx, sequence_data in enumerate(data_iterator):
            # get data
            source_data = sequence_data[0]
            target_data = sequence_data[1]

            # save data
            input_feature[s_idx]  = source_data.reshape(window_size, interval_size)
            target_feature[s_idx]  = target_data.reshape(window_size, interval_size)

        # label each dataset axis
        input_feature.dims[0].label = 'batch'
        input_feature.dims[1].label = 'time'
        input_feature.dims[2].label = 'feature'

        target_feature.dims[0].label = 'batch'
        target_feature.dims[1].label = 'time'
        target_feature.dims[2].label = 'feature'

        num_trains = int(num_sequences*0.8)

        split_dict = {'train': {'input_feature' : ( 0,  num_trains),
                                'target_feature': ( 0,  num_trains)},
                      'valid': {'input_feature' : ( num_trains,  num_sequences),
                                'target_feature': ( num_trains,  num_sequences)},
                      }
        h5file.attrs['split'] = H5PYDataset.create_split_array(split_dict)

        h5file.flush()
        h5file.close()

    return num_sequences
Developer: taesupkim | Project: ift6266h16 | Lines: 57 | Source: build_audio_interval_dataset.py

Example 15: build_hdf5_dataset_single_dim

# Required import: from fuel.datasets.hdf5 import H5PYDataset [as alias]
# Or: from fuel.datasets.hdf5.H5PYDataset import create_split_array [as alias]
def build_hdf5_dataset_single_dim(input_filename, output_filename):
    """
    Builds a hdf5 dataset given the input one. The output one will have
    training, valid, and test as sources.
    This function outputs a single dimension for the datasets.
    Adapted to monk_music
    """
    input_file = h5py.File(input_filename, "r")
    output_file = h5py.File(output_filename, "w")

    data = input_file["features"][:]
    data_length = data.shape[1]
    #if not data_length % batch_size == 0:

    # split 0.9 0.1 0.1
    train_valid_length = 160000000
    index_train = int(0.9 * train_valid_length)
    index_valid = int(train_valid_length)
    index_test = int(data_length)

    print "batch indices in order : {}".format((index_train,
                                                index_valid,
                                                index_test))

    data = data.reshape((data_length))

    print "Train example: {}".format(data[index_train-100:index_train])
    print "Valid example: {}".format(data[index_valid-100:index_valid])
    print "Test example: {}".format(data[index_test-100:index_test])


    features = output_file.create_dataset(
        name='features' ,
        shape=data.shape,
        dtype='int16',
        data=data)

    #features.dims[0].label = 'batch'
    #features.dims[0].label = 'time'
    features.dims[0].label = 'feature'

    split_dict = {
        'train': {
            'features' : (0,index_train)},
        'valid': {
            'features' : (index_train + 1,index_valid)},
        'test': {
            'features' : (index_valid + 1,index_test)}
    }

    output_file.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    input_file.close()
    output_file.flush()
    output_file.close()
Developer: TiSU32 | Project: vocal_synthesis | Lines: 56 | Source: split_dataset.py


Note: The fuel.datasets.hdf5.H5PYDataset.create_split_array examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other open-source code and documentation platforms; the snippets were selected from open-source projects contributed by their respective authors, and copyright in the source code remains with the original authors. Consult each project's license before redistributing or reusing the code; do not republish without permission.