This article collects typical usage examples of the Python class cntk.io.MinibatchSource. If you are unsure what MinibatchSource is for or how to use it, the curated examples below should help.
The following presents 15 code examples of the MinibatchSource class, sorted by popularity by default.
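All fifteen examples follow the same basic pattern: wrap one or more deserializers in a MinibatchSource, look up its streams, and pull batches with next_minibatch until an empty result signals that the data is exhausted. Below is a minimal sketch of that pattern; the file name train.ctf and the fields S0/S1 are hypothetical placeholders, not taken from any example on this page.

from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs

# 'train.ctf' and the fields 'S0'/'S1' are illustrative placeholders.
mb_source = MinibatchSource(
    CTFDeserializer('train.ctf', StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
    randomize=False, max_sweeps=1)

while True:
    mb = mb_source.next_minibatch(64)          # at most 64 samples per call
    if not mb:                                 # empty dict: source exhausted
        break
    features = mb[mb_source.streams.features]  # a MinibatchData object
    labels = mb[mb_source.streams.labels]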
Example 1: test_large_minibatch
def test_large_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_2)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)
    features = mb[features_si]
    labels = mb[labels_si]

    # Actually, the minibatch spans over multiple sweeps,
    # not sure if this is an artificial situation, but
    # maybe instead of a boolean flag we should indicate
    # the largest sweep index the data was taken from.
    assert features.end_of_sweep
    assert labels.end_of_sweep

    assert features.num_samples == 1000 - 1000 % 7
    assert labels.num_samples == 5 * (1000 // 7)

    assert mb[features_si].num_sequences == (1000 // 7)
    assert mb[labels_si].num_sequences == (1000 // 7)
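The arithmetic behind the assertions: each sequence in MBDATA_DENSE_2 evidently contributes 7 feature samples and 5 label samples, so a request for 1000 samples returns 1000 // 7 = 142 complete sequences, that is 994 feature samples and 710 label samples.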
Example 2: test_MinibatchData_and_Value_as_input
def test_MinibatchData_and_Value_as_input(tmpdir):
    mbdata = r'''0 |S0 100'''

    tmpfile = str(tmpdir/'mbtest.txt')
    with open(tmpfile, 'w') as f:
        f.write(mbdata)

    defs = StreamDefs(f1=StreamDef(field='S0', shape=1))

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
                                randomize=False)

    f1_si = mb_source.stream_info('f1')

    mb = mb_source.next_minibatch(1)

    f1 = input_variable(shape=(1,),
                        needs_gradient=True,
                        name='f')
    res = f1 * 2

    assert res.eval({f1: mb[f1_si]}) == [[200]]

    # Test MinibatchData
    assert res.eval(mb[f1_si]) == [[200]]

    # Test Value
    assert res.eval(mb[f1_si].data) == [[200]]

    # Test NumPy (converted back from MinibatchData)
    assert res.eval(mb[f1_si].value) == [[200]]
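The assertions show that eval() accepts the same minibatch content in four forms: via an explicit input map, as a MinibatchData object, as its underlying Value (.data), and as a NumPy array (.value).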
Example 3: test_eval_sparse_dense
def test_eval_sparse_dense(tmpdir, device_id):
    from cntk import Axis
    from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs
    from cntk.device import cpu, gpu, set_default_device
    from cntk.ops import input_variable, times
    from scipy.sparse import csr_matrix
    # np, one_hot and cntk_device are provided by the surrounding test module.

    input_vocab_dim = label_vocab_dim = 69

    ctf_data = '''\
0 |S0 3:1 |# <s> |S1 3:1 |# <s>
0 |S0 4:1 |# A |S1 32:1 |# ~AH
0 |S0 5:1 |# B |S1 36:1 |# ~B
0 |S0 4:1 |# A |S1 31:1 |# ~AE
0 |S0 7:1 |# D |S1 38:1 |# ~D
0 |S0 12:1 |# I |S1 47:1 |# ~IY
0 |S0 1:1 |# </s> |S1 1:1 |# </s>
2 |S0 60:1 |# <s> |S1 3:1 |# <s>
2 |S0 61:1 |# A |S1 32:1 |# ~AH
'''
    ctf_file = str(tmpdir/'2seqtest.txt')
    with open(ctf_file, 'w') as f:
        f.write(ctf_data)

    mbs = MinibatchSource(CTFDeserializer(ctf_file, StreamDefs(
        features=StreamDef(field='S0', shape=input_vocab_dim, is_sparse=True),
        labels=StreamDef(field='S1', shape=label_vocab_dim, is_sparse=True)
    )), randomize=False, epoch_size=2)

    batch_axis = Axis.default_batch_axis()
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')

    input_dynamic_axes = [batch_axis, input_seq_axis]
    raw_input = input_variable(
        shape=input_vocab_dim, dynamic_axes=input_dynamic_axes,
        name='raw_input', is_sparse=True)

    mb_valid = mbs.next_minibatch(minibatch_size_in_samples=100,
                                  input_map={raw_input: mbs.streams.features})

    z = times(raw_input, np.eye(input_vocab_dim))
    e_reader = z.eval(mb_valid)

    # CSR with the raw_input encoding in ctf_data
    one_hot_data = [
        [3, 4, 5, 4, 7, 12, 1],
        [60, 61]
    ]
    data = [csr_matrix(np.eye(input_vocab_dim, dtype=np.float32)[d]) for d in
            one_hot_data]
    e_csr = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_csr)])

    # One-hot with the raw_input encoding in ctf_data
    data = one_hot(one_hot_data, num_classes=input_vocab_dim)
    e_hot = z.eval({raw_input: data}, device=cntk_device(device_id))
    assert np.all([np.allclose(a, b) for a, b in zip(e_reader, e_hot)])
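The test feeds the same two sequences in three encodings (CTF reader output, SciPy CSR matrices, and one-hot index lists) and checks that multiplying by an identity matrix yields identical results for each.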
Example 4: test_user_deserializer_sequence_mode
def test_user_deserializer_sequence_mode():
    import scipy.sparse as sp
    # GenDeserializer is a user-defined deserializer from the surrounding test module.

    streams = [StreamInformation('x', 0, 'dense', np.float32, (2, 3)),
               StreamInformation('y', 1, 'sparse', np.float32, (3,))]

    def run_minibatch_source(minibatch_source, num_chunks, num_sequences_per_value):
        sequence_x_values = np.zeros(num_chunks, dtype=np.int32)
        sequence_y_values = np.zeros(num_chunks, dtype=np.int32)
        mb_count = 0
        while True:
            if mb_count % 10 == 1:  # perform checkpointing
                checkpoint_state = minibatch_source.get_checkpoint_state()
                for i in range(3):
                    minibatch_source.next_minibatch(20)
                minibatch_source.restore_from_checkpoint(checkpoint_state)
                mb_count += 1
                continue

            mb = minibatch_source.next_minibatch(20)
            mb_count += 1
            if not mb:
                break

            for sequence in mb[minibatch_source.streams.x].asarray():
                sequence_x_values[int(sequence[0][0][0])] += 1

            for sequence in mb[minibatch_source.streams.y].as_sequences(C.sequence.input_variable((3,), True)):
                sequence_y_values[int(sequence.toarray()[0][0])] += 1
            mb = None

        expected_values = np.full(num_chunks, fill_value=num_sequences_per_value, dtype=np.int32)
        assert (sequence_x_values == expected_values).all()
        assert (sequence_y_values == expected_values).all()

    # Big chunks
    d = GenDeserializer(stream_infos=streams, num_chunks=15,
                        num_sequences=100, max_sequence_len=10)
    mbs = MinibatchSource([d], randomize=False, max_sweeps=2)
    state = mbs.get_checkpoint_state()
    mbs.restore_from_checkpoint(state)
    run_minibatch_source(mbs, num_chunks=15, num_sequences_per_value=200)

    # Randomized
    mbs = MinibatchSource([d], randomize=True, max_sweeps=2, randomization_window_in_chunks=5)
    state = mbs.get_checkpoint_state()
    mbs.restore_from_checkpoint(state)
    run_minibatch_source(mbs, num_chunks=15, num_sequences_per_value=200)

    # Small chunks of 1
    d = GenDeserializer(stream_infos=streams, num_chunks=15,
                        num_sequences=1, max_sequence_len=10)
    mbs = MinibatchSource([d], randomize=False, max_sweeps=3)
    run_minibatch_source(mbs, num_chunks=15, num_sequences_per_value=3)

    # Randomized
    mbs = MinibatchSource([d], randomize=True, max_sweeps=3, randomization_window_in_chunks=5)
    run_minibatch_source(mbs, num_chunks=15, num_sequences_per_value=3)
Example 5: test_max_samples
def test_max_samples(tmpdir):
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_samples=1)

    input_map = {'features': mb_source['features']}
    mb = mb_source.next_minibatch(10, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(10, input_map)

    assert not mb
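Note that max_samples caps the total number of samples the source will ever deliver: although 10 samples are requested, only 1 is returned, and the next call yields an empty minibatch.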
Example 6: test_htk_deserializers
def test_htk_deserializers():
    mbsize = 640
    epoch_size = 1000 * mbsize
    lr = [0.001]

    feature_dim = 33
    num_classes = 132
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"
    labels_file = "glob_0000.mlf"
    label_mapping_file = "state.list"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFDeserializer(label_mapping_file, StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=labels_file)))

    reader = MinibatchSource([fd, ld])

    features = C.input_variable(((2 * context + 1) * feature_dim))
    labels = C.input_variable((num_classes))

    model = Sequential([For(range(3), lambda: Recurrence(LSTM(256))),
                        Dense(num_classes)])
    z = model(features)
    ce = C.cross_entropy_with_softmax(z, labels)
    errs = C.classification_error(z, labels)

    learner = C.adam_sgd(z.parameters,
                         lr=C.learning_rate_schedule(lr, C.UnitType.sample, epoch_size),
                         momentum=C.momentum_as_time_constant_schedule(1000),
                         low_memory=True,
                         gradient_clipping_threshold_per_sample=15,
                         gradient_clipping_with_truncation=True)
    trainer = C.Trainer(z, (ce, errs), learner)

    input_map = {features: reader.streams.amazing_features,
                 labels: reader.streams.awesome_labels}

    pp = C.ProgressPrinter(freq=0)
    # just run and verify it doesn't crash
    for i in range(3):
        mb_data = reader.next_minibatch(mbsize, input_map=input_map)
        trainer.train_minibatch(mb_data)
        pp.update_with_trainer(trainer, with_metric=True)
    assert True
    os.chdir(abs_path)
Example 7: test_text_format
def test_text_format(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_SPARSE)

    input_dim = 1000
    num_output_classes = 5

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='x', shape=input_dim, is_sparse=True),
        labels=StreamDef(field='y', shape=num_output_classes, is_sparse=False)
    )), randomize=False)

    assert isinstance(mb_source, MinibatchSource)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(7)

    features = mb[features_si]
    # 2 sequences, max seq len 4, 1000 dim
    assert features.shape == (2, 4, input_dim)
    assert features.end_of_sweep
    assert features.num_sequences == 2
    assert features.num_samples == 7
    assert features.is_sparse

    labels = mb[labels_si]
    # 2 sequences, max seq len 1, 5 dim
    assert labels.shape == (2, 1, num_output_classes)
    assert labels.end_of_sweep
    assert labels.num_sequences == 2
    assert labels.num_samples == 2
    assert not labels.is_sparse

    label_data = labels.asarray()
    assert np.allclose(label_data,
                       np.asarray([
                           [[1., 0., 0., 0., 0.]],
                           [[0., 1., 0., 0., 0.]]
                       ]))

    mb = mb_source.next_minibatch(1)
    features = mb[features_si]
    labels = mb[labels_si]

    assert not features.end_of_sweep
    assert not labels.end_of_sweep
    assert features.num_samples < 7
    assert labels.num_samples == 1
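Dense minibatch arrays are laid out as (number of sequences, longest sequence length, sample dimension), with shorter sequences padded to the longest one in the batch; that is why features.shape is (2, 4, 1000) even though only 7 samples (4 + 3) are present.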
Example 8: test_crop_dimensionality
def test_crop_dimensionality(tmpdir):
    import io
    from PIL import Image
    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    with open(file_mapping_path, 'w') as file_mapping:
        for i in range(5):
            data = np.random.randint(0, 2**8, (20, 40, 3))
            image = Image.fromarray(data.astype('uint8'), "RGB")
            buf = io.BytesIO()
            image.save(buf, format='PNG')
            assert image.width == 40 and image.height == 20

            label = str(i)
            # save to mapping + png file
            file_name = label + '.png'
            with open(str(tmpdir / file_name), 'wb') as f:
                f.write(buf.getvalue())
            file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms1 = [
        xforms.scale(width=40, height=20, channels=3),
        xforms.crop(crop_type='randomside',
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    transforms2 = [
        xforms.crop(crop_type='randomside',
                    crop_size=(20, 10), side_ratio=(0.2, 0.5),
                    jitter_type='uniratio')]

    d1 = ImageDeserializer(file_mapping_path,
                           StreamDefs(
                               images1=StreamDef(field='image', transforms=transforms1),
                               labels1=StreamDef(field='label', shape=10)))

    d2 = ImageDeserializer(file_mapping_path,
                           StreamDefs(
                               images2=StreamDef(field='image', transforms=transforms2),
                               labels2=StreamDef(field='label', shape=10)))

    mbs = MinibatchSource([d1, d2])
    for j in range(5):
        mb = mbs.next_minibatch(1)
        images1 = mb[mbs.streams.images1].asarray()
        images2 = mb[mbs.streams.images2].asarray()
        assert images1.shape == (1, 1, 3, 10, 20)
        assert (images1 == images2).all()
Example 9: test_multiple_streams_in_htk
def test_multiple_streams_in_htk():
    feature_dim = 33
    context = 2

    os.chdir(data_path)

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim, context=(context, context), scp=features_file),
        amazing_features2=StreamDef(shape=feature_dim, context=(context, context), scp=features_file)))

    mbs = MinibatchSource([fd])
    mb = mbs.next_minibatch(1)
    assert (mb[mbs.streams.amazing_features].asarray() ==
            mb[mbs.streams.amazing_features2].asarray()).all()
    os.chdir(abs_path)
Example 10: test_base64_is_equal_image
def test_base64_is_equal_image(tmpdir):
    import io
    import base64
    from PIL import Image
    np.random.seed(1)

    file_mapping_path = str(tmpdir / 'file_mapping.txt')
    base64_mapping_path = str(tmpdir / 'base64_mapping.txt')

    with open(file_mapping_path, 'w') as file_mapping:
        with open(base64_mapping_path, 'w') as base64_mapping:
            for i in range(10):
                data = np.random.randint(0, 2**8, (5, 7, 3))
                image = Image.fromarray(data.astype('uint8'), "RGB")
                buf = io.BytesIO()
                image.save(buf, format='PNG')
                assert image.width == 7 and image.height == 5

                label = str(i)

                # save to base 64 mapping file
                encoded = base64.b64encode(buf.getvalue()).decode('ascii')
                base64_mapping.write('%s\t%s\n' % (label, encoded))

                # save to mapping + png file
                file_name = label + '.png'
                with open(str(tmpdir / file_name), 'wb') as f:
                    f.write(buf.getvalue())
                file_mapping.write('.../%s\t%s\n' % (file_name, label))

    transforms = [xforms.scale(width=7, height=5, channels=3)]
    b64_deserializer = Base64ImageDeserializer(base64_mapping_path,
        StreamDefs(
            images1=StreamDef(field='image', transforms=transforms),
            labels1=StreamDef(field='label', shape=10)))

    file_image_deserializer = ImageDeserializer(file_mapping_path,
        StreamDefs(
            images2=StreamDef(field='image', transforms=transforms),
            labels2=StreamDef(field='label', shape=10)))

    mb_source = MinibatchSource([b64_deserializer, file_image_deserializer])
    for j in range(20):
        mb = mb_source.next_minibatch(1)

        images1_stream = mb_source.streams['images1']
        images1 = mb[images1_stream].asarray()
        images2_stream = mb_source.streams['images2']
        images2 = mb[images2_stream].asarray()
        assert (images1 == images2).all()
Example 11: test_full_sweep_minibatch
def test_full_sweep_minibatch(tmpdir):
    tmpfile = _write_data(tmpdir, MBDATA_DENSE_1)

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=1),
        labels=StreamDef(field='S1', shape=1))),
        randomization_window_in_chunks=0, max_sweeps=1)

    features_si = mb_source.stream_info('features')
    labels_si = mb_source.stream_info('labels')

    mb = mb_source.next_minibatch(1000)

    assert mb[features_si].num_sequences == 2
    assert mb[labels_si].num_sequences == 2

    features = mb[features_si]
    assert features.end_of_sweep
    assert len(features.as_sequences()) == 2
    expected_features = \
        [
            [[0], [1], [2], [3]],
            [[4], [5], [6]]
        ]

    for res, exp in zip(features.as_sequences(), expected_features):
        assert np.allclose(res, exp)

    assert np.allclose(features.data.mask,
                       [[2, 1, 1, 1],
                        [2, 1, 1, 0]])

    labels = mb[labels_si]
    assert labels.end_of_sweep
    assert len(labels.as_sequences()) == 2
    expected_labels = \
        [
            [[0], [1], [3]],
            [[1], [2]]
        ]
    for res, exp in zip(labels.as_sequences(), expected_labels):
        assert np.allclose(res, exp)

    assert np.allclose(labels.data.mask,
                       [[2, 1, 1],
                        [2, 1, 0]])
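In the mask returned by .data.mask, 2 marks the first step of a sequence, 1 a valid continuation step, and 0 a padded position past the end of a shorter sequence.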
Example 12: test_mlf_binary_files
def test_mlf_binary_files():
    os.chdir(data_path)

    feature_dim = 33
    num_classes = 132
    context = 2

    features_file = "glob_0000.scp"

    fd = HTKFeatureDeserializer(StreamDefs(
        amazing_features=StreamDef(shape=feature_dim, context=(context, context), scp=features_file)))

    ld = HTKMLFBinaryDeserializer(StreamDefs(
        awesome_labels=StreamDef(shape=num_classes, mlf=e2e_data_path + "mlf2.bin")))

    # Make sure we can read at least one minibatch.
    mbsource = MinibatchSource([fd, ld])
    mbsource.next_minibatch(1)

    os.chdir(abs_path)
Example 13: test_prefetch_with_unpacking
def test_prefetch_with_unpacking(tmpdir):
    data = r'''0 |S0 1 1 1 1 |S1 1000
1 |S0 2 2 2 2 |S1 100
2 |S0 3 3 3 3 |S1 100
3 |S0 1 1 1 1 |S1 10
4 |S0 2 2 2 2 |S1 1
5 |S0 3 3 3 3 |S1 2000
6 |S0 1 1 1 1 |S1 200
7 |S0 2 2 2 2 |S1 200
8 |S0 3 3 3 3 |S1 20
9 |S0 1 1 1 1 |S1 2
'''
    import time
    tmpfile = _write_data(tmpdir, data)
    input_dim = 4
    num_output_classes = 1

    mb_source = MinibatchSource(CTFDeserializer(tmpfile, StreamDefs(
        features=StreamDef(field='S0', shape=input_dim, is_sparse=False),
        labels=StreamDef(field='S1', shape=num_output_classes, is_sparse=False)
    )), randomize=False, max_samples=FULL_DATA_SWEEP)

    input_map = {'S0': mb_source.streams.features,
                 'S1': mb_source.streams.labels}
    empty = False
    mb_size = 3
    # A resize is triggered on the last minibatch, because
    # 10 % 3 = 1 sample remains for the final call.
    while not empty:
        mb = mb_source.next_minibatch(mb_size, input_map=input_map)
        time.sleep(1)  # make sure the prefetch kicks in
        if mb:
            # Force unpacking to check that we do
            # not break prefetch
            actual_size = mb['S0'].shape[0]
            assert (mb['S0'].asarray() == np.array([[[1, 1, 1, 1]],
                                                    [[2, 2, 2, 2]],
                                                    [[3, 3, 3, 3]]],
                                                   dtype=np.float32)[0:actual_size]).all()
        else:
            empty = True
Example 14: test_max_samples_over_several_sweeps
def test_max_samples_over_several_sweeps(tmpdir):
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_samples=11)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 1
    assert not mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)
    assert not mb
Example 15: test_max_sweeps
def test_max_sweeps(tmpdir):
    # set max sweeps to 3 (12 samples altogether).
    mb_source = MinibatchSource(
        create_ctf_deserializer(tmpdir), max_sweeps=3)

    input_map = {'features': mb_source['features']}

    for i in range(2):
        mb = mb_source.next_minibatch(5, input_map)

        assert 'features' in mb
        assert mb['features'].num_samples == 5
        assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(5, input_map)

    assert 'features' in mb
    assert mb['features'].num_samples == 2
    assert mb['features'].end_of_sweep

    mb = mb_source.next_minibatch(1, input_map)
    assert not mb
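Since each sweep of this test data holds 4 samples, max_sweeps=3 allows exactly 12 samples, delivered here as 5 + 5 + 2, with end_of_sweep reported on every minibatch that crosses or completes a sweep boundary. Contrast this with Example 14, where max_samples=11 cuts the third sweep one sample short, so the final non-empty minibatch does not report end_of_sweep.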