本文整理汇总了 Python 中 allennlp.common.util.lazy_groups_of 方法的典型用法代码示例。如果您正苦于以下问题:Python util.lazy_groups_of 方法的具体用法?Python util.lazy_groups_of 怎么用?Python util.lazy_groups_of 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 allennlp.common.util 的用法示例。
在下文中一共展示了util.lazy_groups_of方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: run
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def run(self) -> None:
    """
    Run prediction over the whole input and report one result per input.

    Inputs are consumed lazily in groups of ``self._batch_size``. When a
    dataset reader is configured, the inputs are the instances produced by
    ``self._get_instance_data()``; otherwise they are the JSON objects
    produced by ``self._get_json_data()``. Every (input, prediction) pair is
    handed to ``self._maybe_print_to_console_and_file`` together with a
    running index that keeps counting across batches.

    The output file, if there is one, is closed when the run ends, whether
    it ends normally or because a prediction raised.
    """
    has_reader = self._dataset_reader is not None
    index = 0
    try:
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                for model_input_instance, result in zip(batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(index, result, str(model_input_instance))
                    index += 1
        else:
            for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                for model_input_json, result in zip(batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(
                        index, result, json.dumps(model_input_json)
                    )
                    index += 1
    finally:
        # Close in ``finally`` so a failing batch does not leak the handle
        # (and so buffered output written so far is flushed to disk).
        if self._output_file is not None:
            self._output_file.close()
示例2: embed_sentences
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def embed_sentences(self,
                    sentences,
                    batch_size=DEFAULT_BATCH_SIZE):
    u"""
    Lazily computes the ELMo embeddings for an iterable of sentences.

    Please note that ELMo has internal state and will give different results
    for the same input. See the comment under the class definition.

    Parameters
    ----------
    sentences : ``Iterable[List[str]]``, required
        An iterable of tokenized sentences.
    batch_size : ``int``, optional (default = ``DEFAULT_BATCH_SIZE``)
        The number of sentences handed to ``embed_batch`` at once.

    Returns
    -------
    A generator. It yields, one at a time, every item produced by
    ``self.embed_batch`` for each group of sentences, so the n-th yielded
    item corresponds to the n-th input sentence.
    """
    sentence_groups = lazy_groups_of(iter(sentences), batch_size)
    for sentence_group in sentence_groups:
        group_embeddings = self.embed_batch(sentence_group)
        for embedding in group_embeddings:
            yield embedding
示例3: __iter__
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def __iter__(self) -> Iterable[List[int]]:
    """
    Yield batches of dataset indices in a random batch order.

    The indices come from ``self._argsort_by_padding(self.data_source)`` and
    are cut into consecutive groups of ``self.batch_size``. When
    ``self.drop_last`` is set, a group shorter than ``self.batch_size`` is
    discarded. Only the order of the batches is shuffled; the order of the
    indices inside each batch is left as produced.
    """
    ordered_indices, _ = self._argsort_by_padding(self.data_source)
    candidates = (list(chunk) for chunk in lazy_groups_of(ordered_indices, self.batch_size))
    kept_batches = [
        candidate
        for candidate in candidates
        if not self.drop_last or len(candidate) >= self.batch_size
    ]
    random.shuffle(kept_batches)
    for kept in kept_batches:
        yield kept
示例4: test_lazy_groups_of
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def test_lazy_groups_of(self):
    """
    ``lazy_groups_of`` must emit full groups, then a short final group,
    and then be exhausted.
    """
    source = iter([1, 2, 3, 4, 5, 6, 7])
    grouped = util.lazy_groups_of(source, group_size=3)
    # Seven items in groups of three: two full groups and a remainder of one.
    for expected_group in ([1, 2, 3], [4, 5, 6], [7]):
        assert next(grouped) == expected_group
    with pytest.raises(StopIteration):
        next(grouped)
示例5: run
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def run(self):
    """
    Run prediction over the whole input and report every result.

    Inputs are consumed lazily in groups of ``self._batch_size``. With a
    dataset reader configured, instances from ``self._get_instance_data()``
    are predicted and only the result is reported. Without one, JSON objects
    from ``self._get_json_data()`` are predicted and each result is reported
    together with its JSON-serialized input.

    The output file, if there is one, is closed when the run ends, whether
    it ends normally or because a prediction raised.
    """
    has_reader = self._dataset_reader is not None
    try:
        if has_reader:
            for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                for result in self._predict_instances(batch):
                    self._maybe_print_to_console_and_file(result)
        else:
            for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                for model_input, result in izip(batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(result, json.dumps(model_input))
    finally:
        # Close in ``finally`` so a failing batch does not leak the handle.
        if self._output_file is not None:
            self._output_file.close()
示例6: _memory_sized_lists
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def _memory_sized_lists(self,
                        instances):
    u"""
    Breaks the dataset into "memory-sized" lists of instances,
    which it yields up one at a time until it gets through a full epoch.

    For example, if the dataset is already an in-memory list, and each epoch
    represents one pass through the dataset, it just yields back the dataset.
    Whereas if the dataset is lazily read from disk and we've specified to
    load 1000 instances at a time, then it yields lists of 1000 instances each.

    Parameters
    ----------
    instances : iterable of instances
        The dataset; ``is_lazy`` decides whether it is treated as lazy.

    Returns
    -------
    A generator of lists of instances.
    """
    lazy = is_lazy(instances)

    # Get an iterator over the next epoch worth of instances.
    iterator = self._take_instances(instances, self._instances_per_epoch)

    # We have four different cases to deal with:

    # With lazy instances and no guidance about how many to load into memory,
    # we just load ``batch_size`` instances at a time:
    if lazy and self._max_instances_in_memory is None:
        # A plain ``for`` loop ends cleanly when the groups run out. Calling
        # ``.next()`` in ``while True`` only works on Python 2 and relies on
        # StopIteration escaping the generator, which PEP 479 turns into a
        # RuntimeError.
        for instance_list in lazy_groups_of(iterator, self._batch_size):
            yield instance_list

    # If we specified max instances in memory, lazy or not, we just
    # load ``max_instances_in_memory`` instances at a time:
    elif self._max_instances_in_memory is not None:
        for instance_list in lazy_groups_of(iterator, self._max_instances_in_memory):
            yield instance_list

    # If we have non-lazy instances, and we want all instances each epoch,
    # then we just yield back the list of instances:
    elif self._instances_per_epoch is None:
        yield ensure_list(instances)

    # In the final case we have non-lazy instances, we want a specific number
    # of instances each epoch, and we didn't specify how many instances to load
    # into memory. So we convert the whole iterator to a list:
    else:
        yield list(iterator)
示例7: _create_batches
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def _create_batches(self, instances, shuffle):
    """
    Yield ``Batch`` objects built from ``instances``.

    The dataset is first split into memory-sized lists by
    ``self._memory_sized_lists``. Each list is shuffled in place when
    ``shuffle`` is true, cut into groups of ``self._batch_size``, and every
    group is passed through ``self._ensure_batch_is_sufficiently_small``,
    which may split it further, before being wrapped in a ``Batch``.
    """
    for memory_chunk in self._memory_sized_lists(instances):
        if shuffle:
            random.shuffle(memory_chunk)
        # Cut the (possibly shuffled) chunk into batch-sized groups.
        grouped = lazy_groups_of(iter(memory_chunk), self._batch_size)
        for group in grouped:
            for small_enough in self._ensure_batch_is_sufficiently_small(group):
                yield Batch(small_enough)
示例8: run
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def run(self) -> None:
    """
    Load all inputs, extend the model vocabulary with them, then predict.

    Instead of reading the data lazily, every input is loaded up front so
    that the model's vocabulary (and its embedders, using the optional
    JSON mapping in ``self._embedding_sources_mapping``) can be extended
    before any prediction is made. Predictions are then made in groups of
    ``self._batch_size`` and each (input, prediction) pair is handed to
    ``self._maybe_print_to_console_and_file`` with a running index.

    The output file, if there is one, is closed when the run ends, whether
    it ends normally or because loading, vocabulary extension or a
    prediction raised.
    """
    has_reader = self._dataset_reader is not None
    index = 0
    try:
        if has_reader:
            instances = list(self._get_instance_data())
        else:
            jsons = list(self._get_json_data())
            # Named ``json_dict`` so the loop variable does not shadow the
            # ``json`` module used just below.
            instances = [self._predictor._json_to_instance(json_dict) for json_dict in jsons]

        embedding_sources: Dict[str, str] = (json.loads(self._embedding_sources_mapping)
                                             if self._embedding_sources_mapping else {})
        self._predictor._model.vocab.extend_from_instances(Params({}), instances=instances)
        self._predictor._model.extend_embedder_vocab(embedding_sources)

        if has_reader:
            for batch in lazy_groups_of(iter(instances), self._batch_size):
                for model_input_instance, result in zip(batch, self._predict_instances(batch)):
                    self._maybe_print_to_console_and_file(index, result, str(model_input_instance))
                    index += 1
        else:
            for batch_json in lazy_groups_of(iter(jsons), self._batch_size):
                for model_input_json, result in zip(batch_json, self._predict_json(batch_json)):
                    self._maybe_print_to_console_and_file(index, result, json.dumps(model_input_json))
                    index += 1
    finally:
        # Close in ``finally`` so a failure anywhere above does not leak the handle.
        if self._output_file is not None:
            self._output_file.close()
示例9: _create_batches
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
    """
    Yield batches in which instances sharing an ``instance_id`` stay adjacent.

    For every memory-sized list from ``self._memory_sized_lists``, the
    instances are grouped into "bags" keyed by
    ``instance['metadata']['instance_id']``. The bags are laid out one after
    another in a random order, cut into groups of ``self._batch_size``,
    passed through ``self._ensure_batch_is_sufficiently_small`` and wrapped
    in ``Batch``. When ``shuffle`` is true the instances inside each bag are
    shuffled as well. With ``self._biggest_batch_first`` set and more than one
    batch, the last two batches are moved to the front.

    Progress messages are printed to standard output along the way.
    """
    def bag_key(instance):
        return instance['metadata']['instance_id']

    for memory_chunk in self._memory_sized_lists(instances):
        # groupby only merges neighbours, hence the sort on the same key.
        bags = {
            bag_id: list(members)
            for bag_id, members in groupby(sorted(memory_chunk, key=bag_key), key=bag_key)
        }

        # NOTE(review): the bag order is shuffled even when ``shuffle`` is
        # False -- confirm this is intended.
        bag_order = list(bags)
        random.shuffle(bag_order)

        print('creating new instances')
        regrouped = []
        for bag_id in bag_order:
            members = bags[bag_id]
            if shuffle:
                random.shuffle(members)
            regrouped.extend(members)

        print('creating batches')
        batches = [
            Batch(piece)
            for group in lazy_groups_of(iter(regrouped), self._batch_size)
            for piece in self._ensure_batch_is_sufficiently_small(group)
        ]
        print('num batches:', len(batches))

        if self._biggest_batch_first and len(batches) > 1:
            # Move the last _two_ batches, because the very last one might
            # not be full. Afterwards the old last batch is first and the
            # old penultimate batch is second.
            last_batch = batches.pop()
            penultimate_batch = batches.pop()
            batches.insert(0, penultimate_batch)
            batches.insert(0, last_batch)

        print('yielding from batches')
        yield from batches
示例10: _validation_loss
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def _validation_loss(self) -> Tuple[float, int]:
    """
    Run one pass over the validation data and accumulate the loss.

    Returns
    -------
    A tuple ``(val_loss, batches_this_epoch)``: the sum (not the mean) of the
    losses of all batch groups, and the number of batch groups for which
    ``self.batch_loss`` returned a loss.
    """
    logger.info("Validating")

    self.model.eval()

    # While validating, use the shadow (moving-average) parameter values.
    if self._moving_average is not None:
        self._moving_average.assign_average_value()

    # Fall back to the training iterator when no dedicated one is set.
    val_iterator = self.iterator if self._validation_iterator is None else self._validation_iterator

    num_gpus = len(self._cuda_devices)

    # One batch per device is consumed at a time, hence the grouping and
    # the division of the expected batch count by ``num_gpus``.
    raw_batches = val_iterator(self._validation_data, num_epochs=1, shuffle=False)
    grouped_batches = lazy_groups_of(raw_batches, num_gpus)
    expected_groups = math.ceil(
        val_iterator.get_num_batches(self._validation_data) / num_gpus
    )
    progress = Tqdm.tqdm(grouped_batches, total=expected_groups)

    batches_this_epoch = 0
    val_loss = 0
    for batch_group in progress:
        loss = self.batch_loss(batch_group, for_training=False)
        if loss is not None:
            # A validation loss is optional, so ``loss`` may be None. The
            # counter is only used as the divisor for the loss, so it is
            # safe to count just the groups that produced one. Revisit this
            # if the counter ever gets another use.
            batches_this_epoch += 1
            val_loss += loss.detach().cpu().numpy()

        # Show the latest metrics in the progress bar description.
        val_metrics = training_util.get_metrics(self.model, val_loss, batches_this_epoch)
        progress.set_description(
            training_util.description_from_metrics(val_metrics), refresh=False
        )

    # Put the original (non-averaged) parameter values back.
    if self._moving_average is not None:
        self._moving_average.restore()

    return val_loss, batches_this_epoch
示例11: embed_sentences
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def embed_sentences(self,
                    sentences: Iterable[List[str]],
                    add_bos: bool = False,
                    add_eos: bool = False,
                    initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
                    batch_size: int = DEFAULT_BATCH_SIZE) -> \
        List[Tuple[numpy.ndarray, Tuple[torch.Tensor, torch.Tensor]]]:
    """
    Computes the forward-only ELMo embeddings for an iterable of sentences.
    See the comment under the class definition.

    Parameters
    ----------
    sentences : ``Iterable[List[str]]``, required
        An iterable of tokenized sentences.
    add_bos : ``bool``, optional (default = False)
        Whether to add a begin-of-sentence token; passed on to ``self.forward``.
    add_eos : ``bool``, optional (default = False)
        Whether to add an end-of-sentence token; passed on to ``self.forward``.
    initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional (default = None)
        A tuple (state, memory) with the initial hidden state and memory of
        the LSTM; passed unchanged to ``self.forward`` for every group.
        As documented by the original author, the expected shapes are
        (num_layers, batch_size, 1 * hidden_size) and
        (num_layers, batch_size, 1 * cell_size), or
        (num_layers, 1 * hidden_size) and (num_layers, 1 * cell_size) when
        the whole batch shares one initial state.
    batch_size : ``int``, optional (default = ``DEFAULT_BATCH_SIZE``)
        The number of sentences processed at once.

    Returns
    -------
    A list with one ``(embedding, (state, memory))`` tuple per input
    sentence, in input order. The states are the final states for that
    sentence with the batch dimension (dim 1) squeezed out.
    """
    results = []
    print('Embedding sentences into forward ELMo vectors ---')
    for sentence_group in lazy_groups_of(iter(sentences), batch_size):
        elmo_embeddings, final_states = self.forward(sentence_group, add_bos, add_eos, initial_state)

        # ``final_states`` is a tuple of two tensors. Split each along
        # dim 1 into one slice per sentence and drop that dimension.
        per_sentence_states = []
        for state_index in range(2):
            stacked = final_states[state_index]
            slices = stacked.chunk(stacked.size(1), dim=1)
            per_sentence_states.append([torch.squeeze(piece, dim=1) for piece in slices])

        # Turn ([states...], [memories...]) into [(state, memory), ...].
        paired_states = list(zip(*per_sentence_states))
        assert len(elmo_embeddings) == len(paired_states), 'length of embeddings and final states mismatch'

        results.extend(zip(elmo_embeddings, paired_states))
    return results
示例12: create_cached_cnn_embeddings
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
    """
    Precompute and cache context-independent word vectors for ``tokens``.

    Only the token embedder (the character convolutions and highway layers
    of ELMo, per the original author) is run, producing uncontextual word
    vectors. Later forward passes can then look word ids up in an embedding
    instead of computing them on the fly with the CNN encoder.

    This method sets 3 attributes:

    _word_embedding : ``Embedding``
        An embedding holding one vector per entry of ``tokens``.
    _bos_embedding : ``torch.Tensor``
        The vector for the BOS token.
    _eos_embedding : ``torch.Tensor``
        The vector for the EOS token.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        A list of tokens to precompute character convolutions for.
    """
    # BOS and EOS go first so that their vectors end up in rows 0 and 1.
    vocabulary = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
    num_entries = len(vocabulary)

    tokens_per_row = 32
    rows_per_batch = 32
    device = get_device_of(next(self.parameters()))

    # Lay the vocabulary out as rows of tokens, then batch the rows.
    token_rows = lazy_groups_of(iter(vocabulary), tokens_per_row)

    flat_chunks = []
    for row_group in lazy_groups_of(token_rows, rows_per_batch):
        # Per the original author's note: shape (batch_size, timesteps, 50).
        char_ids = batch_to_ids(row_group)
        # NOTE: This device check covers the case where the model has
        # already been placed on a device. When called from the constructor
        # this will probably run on the CPU, which is acceptable because it
        # is only a few convolutions.
        if device >= 0:
            char_ids = char_ids.cuda(device)

        encoded = self._token_embedder(char_ids, add_bos=False, add_eos=False)
        unbounded, _ = remove_sentence_boundaries(
            encoded["token_embedding"], encoded["mask"], rmv_bos=False, rmv_eos=False
        )
        # Flatten to one row per token position.
        flat_chunks.append(unbounded.view(-1, unbounded.size(-1)))

    # The batches may carry trailing rows that come from padding, so clip
    # to the number of real entries.
    stacked = torch.cat(flat_chunks, 0)[:num_entries, :]
    word_rows = stacked[2:num_entries, :]
    vocab_size, embedding_dim = list(word_rows.size())

    from allennlp.modules.token_embedders import Embedding  # type: ignore

    self._bos_embedding = stacked[0, :]
    self._eos_embedding = stacked[1, :]
    self._word_embedding = Embedding(vocab_size,  # type: ignore
                                     embedding_dim,
                                     weight=word_rows.data,
                                     trainable=self._requires_grad,
                                     padding_index=0)
示例13: create_cached_cnn_embeddings
# 需要导入模块: from allennlp.common import util [as 别名]
# 或者: from allennlp.common.util import lazy_groups_of [as 别名]
def create_cached_cnn_embeddings(self, tokens):
    u"""
    Precomputes and caches context-independent word vectors for ``tokens``.

    Only the token embedder (the character convolutions and highway layers
    of ELMo, per the original author) is run, which yields uncontextual
    word vectors. On subsequent forward passes the word ids can be looked up
    from an embedding rather than being computed on the fly via the CNN
    encoder.

    This method sets 3 attributes:

    _word_embedding : ``Embedding``
        An embedding holding one vector per entry of ``tokens``.
    _bos_embedding : ``torch.Tensor``
        The vector for the BOS token.
    _eos_embedding : ``torch.Tensor``
        The vector for the EOS token.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        A list of tokens to precompute character convolutions for.
    """
    # Prepend BOS and EOS so their vectors land in rows 0 and 1.
    all_tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
    total = len(all_tokens)

    row_length = 32
    rows_in_batch = 32
    rows = lazy_groups_of(iter(all_tokens), row_length)
    device = get_device_of(next(self.parameters()))

    pieces = []
    for batch_rows in lazy_groups_of(rows, rows_in_batch):
        # Per the original author's note: shape (batch_size, timesteps, 50).
        ids = batch_to_ids(batch_rows)
        # NOTE: This device check covers the case where the model has
        # already been placed on a device. When called from the constructor
        # this will probably run on the CPU, which is acceptable because it
        # is only a few convolutions.
        if device >= 0:
            ids = ids.cuda(device)

        embedder_output = self._token_embedder(ids)
        stripped, _ = remove_sentence_boundaries(
            embedder_output[u"token_embedding"], embedder_output[u"mask"]
        )
        # Flatten to one row per token position.
        pieces.append(stripped.view(-1, stripped.size(-1)))

    # Padding inside the batches can leave trailing rows, so clip to the
    # number of real entries.
    clipped = torch.cat(pieces, 0)[:total, :]
    words_only = clipped[2:total, :]
    vocab_size, embedding_dim = list(words_only.size())

    from allennlp.modules.token_embedders import Embedding  # type: ignore

    self._bos_embedding = clipped[0, :]
    self._eos_embedding = clipped[1, :]
    self._word_embedding = Embedding(vocab_size,  # type: ignore
                                     embedding_dim,
                                     weight=words_only.data,
                                     trainable=self._requires_grad,
                                     padding_index=0)