This article collects typical usage examples of allennlp.data.dataset.Batch in Python. If you have been wondering what dataset.Batch does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore the containing module, allennlp.data.dataset, for further usage.
The following presents 15 code examples of dataset.Batch, sorted by popularity by default.
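Before turning to the individual examples, here is a minimal sketch of the pattern that nearly all of them share: wrap a list of Instance objects in a Batch, index it against a Vocabulary, and convert it to a padded tensor dictionary. This sketch assumes the AllenNLP 0.x API (where Batch lives in allennlp.data.dataset); the token text and field names are invented for illustration.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token(t) for t in ["a", "simple", "example"]]
instance = Instance({"tokens": TextField(tokens, {"tokens": SingleIdTokenIndexer()})})

batch = Batch([instance])                      # wrap Instances in a Batch
vocab = Vocabulary.from_instances([instance])  # build a vocabulary from the same data
batch.index_instances(vocab)                   # map tokens to integer ids
tensors = batch.as_tensor_dict()               # pad and convert to tensors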
Example 1: preprocess
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def preprocess(self, token_batch):
seq_lens = [len(sequence) for sequence in token_batch if sequence]
if not seq_lens:
return []
max_len = min(max(seq_lens), self.max_len)
batches = []
for indexer in self.indexers:
batch = []
for sequence in token_batch:
tokens = sequence[:max_len]
tokens = [Token(token) for token in ['$START'] + tokens]
batch.append(Instance({'tokens': TextField(tokens, indexer)}))
batch = Batch(batch)
batch.index_instances(self.vocab)
batches.append(batch)
return batches
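A hypothetical call to this method might look like the following; preprocessor stands in for whatever object defines preprocess() (something holding the indexers, vocab, and max_len attributes used above), and the sentences are invented for illustration.

token_batch = [["She", "are", "happy"], ["He", "go", "home", "now"]]
batches = preprocessor.preprocess(token_batch)   # one indexed Batch per indexer
for batch in batches:
    tensors = batch.as_tensor_dict()             # padded tensors ready for the model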
Example 2: test_saving_and_loading_works_with_byte_encoding
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def test_saving_and_loading_works_with_byte_encoding(self):
# We're going to set a vocabulary from a TextField using byte encoding, index it, save the
# vocab, load the vocab, then index the text field again, and make sure we get the same
# result.
tokenizer = CharacterTokenizer(byte_encoding=u'utf-8')
token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
tokens = [Token(t) for t in [u"Øyvind", u"für", u"汉字"]]
text_field = TextField(tokens, {u"characters": token_indexer})
dataset = Batch([Instance({u"sentence": text_field})])
vocab = Vocabulary.from_instances(dataset)
text_field.index(vocab)
indexed_tokens = deepcopy(text_field._indexed_tokens) # pylint: disable=protected-access
vocab_dir = self.TEST_DIR / u'vocab_save'
vocab.save_to_files(vocab_dir)
vocab2 = Vocabulary.from_files(vocab_dir)
text_field2 = TextField(tokens, {u"characters": token_indexer})
text_field2.index(vocab2)
indexed_tokens2 = deepcopy(text_field2._indexed_tokens) # pylint: disable=protected-access
assert indexed_tokens == indexed_tokens2
Example 3: test_from_params_extend_config
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def test_from_params_extend_config(self):
vocab_dir = self.TEST_DIR / u'vocab_save'
original_vocab = Vocabulary(non_padded_namespaces=[u"tokens"])
original_vocab.add_token_to_namespace(u"a", namespace=u"tokens")
original_vocab.save_to_files(vocab_dir)
text_field = TextField([Token(t) for t in [u"a", u"b"]],
{u"tokens": SingleIdTokenIndexer(u"tokens")})
instances = Batch([Instance({u"text": text_field})])
# If you ask to extend vocab from `directory_path`, instances must be passed
# in Vocabulary constructor, or else there is nothing to extend to.
params = Params({u"directory_path": vocab_dir, u"extend": True})
with pytest.raises(ConfigurationError):
_ = Vocabulary.from_params(params)
# If you ask to extend vocab, `directory_path` key must be present in params,
# or else there is nothing to extend from.
params = Params({u"extend": True})
with pytest.raises(ConfigurationError):
_ = Vocabulary.from_params(params, instances)
Example 4: set_up_model
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def set_up_model(self, param_file, dataset_file):
# pylint: disable=attribute-defined-outside-init
self.param_file = param_file
params = Params.from_file(self.param_file)
reader = DatasetReader.from_params(params[u'dataset_reader'])
instances = reader.read(dataset_file)
# Use parameters for vocabulary if they are present in the config file, so that choices like
# "non_padded_namespaces", "min_count" etc. can be set if needed.
if u'vocabulary' in params:
vocab_params = params[u'vocabulary']
vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
else:
vocab = Vocabulary.from_instances(instances)
self.vocab = vocab
self.instances = instances
self.model = Model.from_params(vocab=self.vocab, params=params[u'model'])
# TODO(joelgrus) get rid of these
# (a lot of the model tests use them, so they'll have to be changed)
self.dataset = Batch(self.instances)
self.dataset.index_instances(self.vocab)
Example 5: _create_batches
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
for batch in self.iterator._create_batches(instances, shuffle):
# split after shuffling so smaller batches are kept together
batch_instances = batch.instances
# split if needed
batch_length = -1
for instance in batch_instances:
instance.index_fields(self.vocab)
field_lengths = instance.get_padding_lengths()
batch_length = max(batch_length, field_lengths['tokens']['num_tokens'])
# get the required batch size
index = bisect.bisect_left(self._schedule_lengths, batch_length)
if index == len(self._schedule_lengths):
# this batch exceeds the maximum allowed, just skip it
continue
batch_size = self._schedule_batch_sizes[index]
start = 0
while start < len(batch_instances):
end = start + batch_size
yield Batch(batch_instances[start:end])
start = end
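The bisect lookup above is the core of the length-based schedule: a batch's maximum token length selects the allowed batch size, and batches longer than every schedule entry are skipped. The standalone snippet below illustrates that lookup with made-up schedule values.

import bisect

schedule_lengths = [64, 128, 256]    # upper length bound of each bucket
schedule_batch_sizes = [32, 16, 8]   # batch size allowed for each bucket

for batch_length in (50, 200, 300):
    index = bisect.bisect_left(schedule_lengths, batch_length)
    if index == len(schedule_lengths):
        print(batch_length, "-> skipped, exceeds the maximum allowed length")
    else:
        print(batch_length, "->", schedule_batch_sizes[index])
# prints: 50 -> 32, 200 -> 8, 300 -> skipped, exceeds the maximum allowed length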
Example 6: set_up_model
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def set_up_model(self, param_file, dataset_file):
# pylint: disable=attribute-defined-outside-init
self.param_file = param_file
params = Params.from_file(self.param_file)
reader = DatasetReader.from_params(params['dataset_reader'])
# The dataset reader might be lazy, but a lazy list here breaks some of our tests.
instances = list(reader.read(str(dataset_file)))
# Use parameters for vocabulary if they are present in the config file, so that choices like
# "non_padded_namespaces", "min_count" etc. can be set if needed.
if 'vocabulary' in params:
vocab_params = params['vocabulary']
vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
else:
vocab = Vocabulary.from_instances(instances)
self.vocab = vocab
self.instances = instances
self.model = Model.from_params(vocab=self.vocab, params=params['model'])
# TODO(joelgrus) get rid of these
# (a lot of the model tests use them, so they'll have to be changed)
self.dataset = Batch(self.instances)
self.dataset.index_instances(self.vocab)
Example 7: predict_batch_instance
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
model = self._model
with torch.no_grad():
cuda_device = model._get_prediction_device()
dataset = Batch(instances)
dataset.index_instances(model.vocab)
model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
outputs = model.decode(model(**model_input))
return sanitize(outputs)
Example 8: collate_fn
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def collate_fn(data, to_gpu=False):
"""Creates mini-batch tensors
"""
images, instances = zip(*data)
images = torch.stack(images, 0)
batch = Batch(instances)
td = batch.as_tensor_dict()
if 'question' in td:
td['question_mask'] = get_text_field_mask(td['question'], num_wrapping_dims=1)
td['question_tags'][td['question_mask'] == 0] = -2 # Padding
td['answer_mask'] = get_text_field_mask(td['answers'], num_wrapping_dims=1)
td['answer_tags'][td['answer_mask'] == 0] = -2
td['box_mask'] = torch.all(td['boxes'] >= 0, -1).long()
td['images'] = images
# Deprecated
# if to_gpu:
# for k in td:
# if k != 'metadata':
# td[k] = {k2: v.cuda(non_blocking=True) for k2, v in td[k].items()} if isinstance(td[k], dict) else td[k].cuda(
# non_blocking=True)
# # No nested dicts
# for k in sorted(td.keys()):
# if isinstance(td[k], dict):
# for k2 in sorted(td[k].keys()):
# td['{}_{}'.format(k, k2)] = td[k].pop(k2)
# td.pop(k)
return td
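Because collate_fn takes a list of (image, Instance) pairs and returns a tensor dictionary, it plugs directly into a PyTorch DataLoader. The snippet below is a hypothetical usage sketch: vcr_dataset stands in for a Dataset whose __getitem__ returns such a pair.

from functools import partial
from torch.utils.data import DataLoader

loader = DataLoader(vcr_dataset, batch_size=32, shuffle=True,
                    collate_fn=partial(collate_fn, to_gpu=False))
for tensor_dict in loader:
    images = tensor_dict['images']   # stacked image tensors
    # the remaining keys are the padded AllenNLP fields plus the computed masks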
Example 9: forward_on_instances
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def forward_on_instances(self,
instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
"""
Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
arrays using this model's :class:`Vocabulary`, passes those arrays through
:func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
and returns the result. Before returning the result, we convert any
``torch.Tensors`` into numpy arrays and separate the
batched output into a list of individual dicts per instance. Note that typically
this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
:func:`forward_on_instance`.
Parameters
----------
instances : List[Instance], required
The instances to run the model on.
Returns
-------
A list of the model's output for each instance.
"""
with torch.no_grad():
dataset = Batch(instances)
dataset.index_instances(self.vocab)
model_input = dataset.as_tensor_dict()
outputs = self.decode(self(**model_input))
instance_separated_output = []
metadata = [x.fields["metadata"].metadata for x in dataset.instances]
for res in export_output_data_arc_multi_choice_json(metadata, outputs):
instance_separated_output.append(res)
return instance_separated_output
Example 10: _sentences_to_ids
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _sentences_to_ids(sentences):
indexer = ELMoTokenCharactersIndexer()
# For each sentence, first create a TextField, then create an instance
instances = []
for sentence in sentences:
tokens = [Token(token) for token in sentence]
field = TextField(tokens, {'character_ids': indexer})
instance = Instance({'elmo': field})
instances.append(instance)
dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()['elmo']['character_ids']
Example 11: batch_to_ids
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def batch_to_ids(batch):
u"""
Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
(len(batch), max sentence length, max word length).
Parameters
----------
batch : ``List[List[str]]``, required
A list of tokenized sentences.
Returns
-------
A tensor of padded character ids.
"""
instances = []
indexer = ELMoTokenCharactersIndexer()
for sentence in batch:
tokens = [Token(token) for token in sentence]
field = TextField(tokens,
{u'character_ids': indexer})
instance = Instance({u"elmo": field})
instances.append(instance)
dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
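A quick usage sketch of batch_to_ids; the shape noted in the comment assumes ELMo's default 50-character-per-token representation.

sentences = [["First", "sentence", "."], ["Another", "one"]]
character_ids = batch_to_ids(sentences)
print(character_ids.shape)   # e.g. torch.Size([2, 3, 50]): (batch, max sentence length, max word length)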
Example 12: _create_batches
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _create_batches(self, instances, shuffle):
# First break the dataset into memory-sized lists:
for instance_list in self._memory_sized_lists(instances):
if shuffle:
random.shuffle(instance_list)
iterator = iter(instance_list)
# Then break each memory-sized list into batches.
for batch_instances in lazy_groups_of(iterator, self._batch_size):
for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
batch = Batch(possibly_smaller_batches)
yield batch
Example 13: _create_batches
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _create_batches(self, instances, shuffle):
for instance_list in self._memory_sized_lists(instances):
instance_list = sort_by_padding(instance_list,
self._sorting_keys,
self.vocab,
self._padding_noise)
batches = []
for batch_instances in lazy_groups_of(iter(instance_list), self._batch_size):
for possibly_smaller_batches in self._ensure_batch_is_sufficiently_small(batch_instances):
batches.append(Batch(possibly_smaller_batches))
move_to_front = self._biggest_batch_first and len(batches) > 1
if move_to_front:
# We'll actually pop the last _two_ batches, because the last one might not be full.
last_batch = batches.pop()
penultimate_batch = batches.pop()
if shuffle:
random.shuffle(batches)
else:
logger.warning(u"shuffle parameter is set to False,"
u" while bucket iterators by definition change the order of your data.")
if move_to_front:
batches.insert(0, penultimate_batch)
batches.insert(0, last_batch)
for batch in batches:
    yield batch
Example 14: _sentences_to_ids
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def _sentences_to_ids(self, sentences):
indexer = ELMoTokenCharactersIndexer()
# For each sentence, first create a TextField, then create an instance
instances = []
for sentence in sentences:
tokens = [Token(token) for token in sentence]
field = TextField(tokens, {u'character_ids': indexer})
instance = Instance({u'elmo': field})
instances.append(instance)
dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()[u'elmo'][u'character_ids']
Example 15: test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly
# Required import: from allennlp.data import dataset [as alias]
# Or: from allennlp.data.dataset import Batch [as alias]
def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
dataset = Batch(self.instances)
dataset.index_instances(self.vocab)
training_tensors = dataset.as_tensor_dict()
output_dict = self.model(**training_tensors)
tags = output_dict[u'tags']
assert len(tags) == 2
assert len(tags[0]) == 7
assert len(tags[1]) == 7
for example_tags in tags:
for tag_id in example_tags:
tag = self.model.vocab.get_token_from_index(tag_id, namespace=u"labels")
assert tag in set([u'O', u'I-ORG', u'I-PER', u'I-LOC'])