This page collects typical usage examples of the Python attribute tensor2tensor.data_generators.text_encoder.EOS_ID. If you have been wondering what text_encoder.EOS_ID does, how to use it, or what it looks like in practice, the examples curated here may help. You can also explore the module that defines the attribute, tensor2tensor.data_generators.text_encoder.
Below are 15 code examples of text_encoder.EOS_ID, sorted by popularity by default.
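For reference before the examples: in text_encoder, the reserved tokens are "<pad>" and "<EOS>", so PAD_ID is 0, EOS_ID is 1, and NUM_RESERVED_TOKENS is 2. A minimal check:

from tensor2tensor.data_generators import text_encoder

# EOS_ID is the id of the reserved "<EOS>" token; PAD_ID ("<pad>") is 0.
assert text_encoder.PAD_ID == 0
assert text_encoder.EOS_ID == 1
assert text_encoder.NUM_RESERVED_TOKENS == 2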
Example 1: generate_encoded_samples
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  """Generates samples and encodes them with the problem's vocabularies.

  Args:
    data_dir: data directory
    tmp_dir: temp directory
    dataset_split: dataset split

  Yields:
    A dict of encoded features.
  """
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  label_encoder = self.get_labels_encoder(data_dir)
  for sample in generator:
    inputs = encoder.encode(sample['inputs'])
    inputs.append(text_encoder.EOS_ID)
    context = encoder.encode(sample['context'])
    context.append(text_encoder.EOS_ID)
    targets = label_encoder.encode(sample['targets'])
    sample['targets'] = targets
    yield {'inputs': inputs, 'context': context, 'targets': targets}
Example 2: generate_data
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def generate_data(self, data_dir, _, task_id=-1):

  def generator_eos(nbr_symbols, max_length, nbr_cases):
    """Shift ids by NUM_RESERVED_TOKENS and append the EOS token."""
    for case in self.generator(nbr_symbols, max_length, nbr_cases):
      new_case = {}
      for feature in case:
        new_case[feature] = [
            i + text_encoder.NUM_RESERVED_TOKENS for i in case[feature]
        ] + [text_encoder.EOS_ID]
      yield new_case

  utils.generate_dataset_and_shuffle(
      generator_eos(self.num_symbols, self.train_length, self.train_size),
      self.training_filepaths(data_dir, self.num_shards, shuffled=True),
      generator_eos(self.num_symbols, self.dev_length, self.dev_size),
      self.dev_filepaths(data_dir, 1, shuffled=True),
      shuffle=False)
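To make the shift concrete, here is a standalone sketch of the same transformation applied to a hypothetical raw case (symbol ids starting at 0):

from tensor2tensor.data_generators import text_encoder

def shift_and_append_eos(case):
  # Shift raw symbols past the reserved ids (PAD, EOS), then append EOS.
  return {k: [i + text_encoder.NUM_RESERVED_TOKENS for i in v]
             + [text_encoder.EOS_ID]
          for k, v in case.items()}

print(shift_and_append_eos({"inputs": [0, 1], "targets": [1, 0]}))
# {'inputs': [2, 3, 1], 'targets': [3, 2, 1]}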
Example 3: generate_encoded_samples
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  """Generates samples and encodes them with the problem's vocabularies.

  Args:
    data_dir: data directory
    tmp_dir: temp directory
    dataset_split: dataset split

  Yields:
    A dict of encoded features.
  """
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  txt_encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  label_encoder = self.get_labels_encoder(data_dir)
  for sample in generator:
    inputs = txt_encoder.encode(sample["inputs"])
    inputs.append(text_encoder.EOS_ID)
    targets = label_encoder.encode(sample["label"])
    yield {"inputs": inputs, "targets": targets}
Example 4: generate_encoded_samples
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  """Generates samples and encodes them with the problem's vocabularies.

  Args:
    data_dir: data directory
    tmp_dir: temp directory
    dataset_split: dataset split

  Yields:
    A dict of encoded features.
  """
  generator = self.generate_samples(data_dir, tmp_dir, dataset_split)
  encoder = self.get_or_create_vocab(data_dir, tmp_dir)
  label_encoder = self.get_labels_encoder(data_dir)
  for sample in generator:
    inputs = encoder.encode(sample["inputs"])
    inputs.append(text_encoder.EOS_ID)
    context = encoder.encode(sample["context"])
    context.append(text_encoder.EOS_ID)
    targets = label_encoder.encode(sample["targets"])
    sample["targets"] = targets
    yield {"inputs": inputs, "context": context, "targets": targets}
Example 5: text2text_generate_encoded_oovs
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def text2text_generate_encoded_oovs(self,
                                    sample_generator,
                                    vocab,
                                    targets_vocab=None,
                                    has_inputs=True):
  """Encode Text2Text samples from the generator with the vocab."""
  targets_vocab = targets_vocab or vocab
  for sample in sample_generator:
    if has_inputs:
      (sample["inputs"], sample["inputs_extend"], source_oovs,
       _) = vocab.encode(sample["inputs"])
      sample["inputs"].append(text_encoder.EOS_ID)
      sample["inputs_extend"].append(text_encoder.EOS_ID)
    # Need to pass the source OOV tokens to the target encoder.
    sample["targets"], sample["targets_extend"] = targets_vocab.encode_target(
        sample["targets"], source_oovs)
    sample["targets"].append(text_encoder.EOS_ID)
    sample["targets_extend"].append(text_encoder.EOS_ID)
    yield sample
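Note that vocab here is assumed to be a pointer-generator style encoder whose encode returns (ids, extended_ids, oovs, ...); this is not part of the stock text_encoder classes. The usual extended-vocabulary convention, purely as an illustration:

# Illustration only: in copy/pointer setups the i-th source OOV is
# typically assigned the extended id vocab_size + i. These names are
# hypothetical, not part of text_encoder.
vocab_size = 50000
source_oovs = ["tensor2tensor", "nmt"]
extended_id = vocab_size + source_oovs.index("nmt")  # 50001
# Both the plain and extended sequences still end with text_encoder.EOS_ID.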
Example 6: consume
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def consume(self, word):
  """Append ``word`` to the current history."""
  if word == utils.EOS_ID:
    return
  pos = ((word // EditT2TPredictor.POS_FACTOR)
         % (EditT2TPredictor.MAX_SEQ_LEN + 1))
  token = word % EditT2TPredictor.POS_FACTOR
  # TODO(fstahlberg): Do not hard code the following section
  op = word // 100000000
  if op == 1:  # Insertion
    self.trg_sentence = self._ins_op(pos, token)
  elif op == 2:  # Substitution
    self.trg_sentence = self._sub_op(pos, token)
  elif op == 3:  # Deletion
    self.trg_sentence = self._del_op(pos)
  else:
    logging.warn("Invalid edit descriptor %d. Ignoring..." % word)
  self._update_cur_score()
  self.cache.add(self.trg_sentence, utils.NEG_INF)
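The packed edit descriptor can be decoded by hand. A sketch with illustrative constants (the real POS_FACTOR and MAX_SEQ_LEN live on EditT2TPredictor and may differ):

POS_FACTOR = 10**6  # assumed for illustration
MAX_SEQ_LEN = 99    # assumed for illustration

word = 203000042                                # a packed descriptor
op = word // 100000000                          # 2 -> substitution
pos = (word // POS_FACTOR) % (MAX_SEQ_LEN + 1)  # 3
token = word % POS_FACTOR                       # 42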
Example 7: _decode_batch_input_fn
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def _decode_batch_input_fn(num_decode_batches, sorted_inputs, vocabulary,
                           batch_size, max_input_size):
  """Generator to produce batches of inputs."""
  tf.logging.info(" batch %d" % num_decode_batches)
  # First reverse all the input sentences so that if you're going to get OOMs,
  # you'll see it in the first batch.
  sorted_inputs.reverse()
  for b in range(num_decode_batches):
    tf.logging.info("Decoding batch %d" % b)
    batch_length = 0
    batch_inputs = []
    for inputs in sorted_inputs[b * batch_size:(b + 1) * batch_size]:
      input_ids = vocabulary.encode(inputs)
      if max_input_size > 0:
        # Subtract 1 for the EOS_ID.
        input_ids = input_ids[:max_input_size - 1]
      input_ids.append(text_encoder.EOS_ID)
      batch_inputs.append(input_ids)
      if len(input_ids) > batch_length:
        batch_length = len(input_ids)
    final_batch_inputs = []
    for input_ids in batch_inputs:
      assert len(input_ids) <= batch_length
      x = input_ids + [0] * (batch_length - len(input_ids))
      final_batch_inputs.append(x)
    yield {
        "inputs": np.array(final_batch_inputs).astype(np.int32),
    }
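Within each batch, shorter sequences are right-padded with 0 (PAD_ID) up to the longest EOS-terminated sequence, e.g.:

import numpy as np

batch_inputs = [[5, 6, 1], [7, 1]]  # already EOS-terminated (EOS_ID == 1)
batch_length = max(len(x) for x in batch_inputs)
padded = [x + [0] * (batch_length - len(x)) for x in batch_inputs]
print(np.array(padded).astype(np.int32))
# [[5 6 1]
#  [7 1 0]]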
Example 8: _save_until_eos
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def _save_until_eos(ids, skip=False):
  """Strips everything after the first <EOS> token, which is normally 1."""
  ids = ids.flatten()
  if skip:
    return ids
  try:
    index = list(ids).index(text_encoder.EOS_ID)
    return ids[0:index]
  except ValueError:
    # No EOS_ID: return the array as-is.
    return ids
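Usage sketch:

import numpy as np
from tensor2tensor.data_generators import text_encoder

ids = np.array([[5, 6, text_encoder.EOS_ID, 7]])
print(_save_until_eos(ids))             # [5 6]
print(_save_until_eos(ids, skip=True))  # [5 6 1 7]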
Example 9: _truncate_to_lead_section
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def _truncate_to_lead_section(example):
  wiki = example["targets"]
  lead_boundary = example["section_boundaries"][0]
  # Concat a new EOS to the lead since the original one gets truncated.
  lead = tf.concat((wiki[:lead_boundary], [text_encoder.EOS_ID]), 0)
  return lead
Example 10: _default_hparams
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def _default_hparams():
  """A set of basic model hyperparameters."""
  return tf.contrib.training.HParams(
      # Use this parameter to get comparable perplexity numbers with different
      # tokenizations. This value should be set to the ratio of the number of
      # tokens in the test set according to the tokenization used, to the
      # number of tokens in the test set in the "official" tokenization. For
      # example, if we are using a word-piece based model and we want to
      # compute per-word perplexity, then we set loss_multiplier to the number
      # of wordpieces per word in the test set.
      loss_multiplier=1.0,
      # Use this parameter to allow for larger sequences in the batch. Without
      # the use of this parameter, the size of the inner two dimensions will
      # be used to judge the sequence length.
      batch_size_multiplier=1,
      # During inference for autoregressive problems, if the batch_size is 1,
      # inference will stop when the model predicts a text_encoder.EOS_ID
      # token.
      stop_at_eos=False,
      # Modalities used to map from input features to a space compatible with
      # the chosen model architecture. One modality spec (which is a 2-tuple,
      # (modality_full_name, vocab_size)) per feature key. modality_full_name
      # is a string type:name, e.g. class_label:class_label_2d. Leaving off
      # the name uses the default modality for that type (e.g. class_label ==
      # class_label:default).
      input_modality={},
      # Modality used to map from hidden representation to the target space.
      # Specified as a modality spec, a 2-tuple described above.
      target_modality=None,
      # Identifiers used to tell the model which input/target space will be
      # expected. For example, it can tell that we expect French as characters
      # as output, or Spanish as sound. Spaces are defined as constants in the
      # SpaceID class.
      input_space_id=SpaceID.GENERIC,
      target_space_id=SpaceID.GENERIC)
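The returned object is a TF 1.x tf.contrib.training.HParams; fields are read as attributes and can be overridden, for example:

hp = _default_hparams()
print(hp.stop_at_eos)  # False
# Make batch-size-1 inference stop at text_encoder.EOS_ID:
hp.set_hparam("stop_at_eos", True)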
Example 11: encode
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def encode(self, s):
  return super(ByteTextEncoderWithEos, self).encode(s) + [text_encoder.EOS_ID]
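In tensor2tensor this method belongs to a ByteTextEncoder subclass; a minimal sketch of the enclosing class:

from tensor2tensor.data_generators import text_encoder

class ByteTextEncoderWithEos(text_encoder.ByteTextEncoder):
  """Encodes each byte of the input string, then appends EOS_ID."""

  def encode(self, s):
    return super(ByteTextEncoderWithEos, self).encode(s) + [text_encoder.EOS_ID]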
Example 12: generate_encoded_samples
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  generator = super(
      QuestionAndContext2TextProblem, self).generate_encoded_samples(
          data_dir, tmp_dir, dataset_split)
  vocab = self.feature_encoders(data_dir)["context"]
  for sample in generator:
    context = vocab.encode(sample["context"])
    context.append(text_encoder.EOS_ID)
    sample["context"] = context
    yield sample
Example 13: text2text_generate_encoded
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def text2text_generate_encoded(sample_generator,
                               vocab,
                               targets_vocab=None,
                               has_inputs=True):
  """Encode Text2Text samples from the generator with the vocab."""
  targets_vocab = targets_vocab or vocab
  for sample in sample_generator:
    if has_inputs:
      sample["inputs"] = vocab.encode(sample["inputs"])
      sample["inputs"].append(text_encoder.EOS_ID)
    sample["targets"] = targets_vocab.encode(sample["targets"])
    sample["targets"].append(text_encoder.EOS_ID)
    yield sample
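Usage sketch with a hypothetical one-sample generator; vocab is assumed to be any t2t TextEncoder (e.g. a SubwordTextEncoder loaded from a vocab file):

def samples():
  yield {"inputs": "hello world", "targets": "bonjour le monde"}

for encoded in text2text_generate_encoded(samples(), vocab):
  # Both feature lists now end with EOS_ID (== 1).
  print(encoded["inputs"][-1], encoded["targets"][-1])  # 1 1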
Example 14: to_example_dict
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def to_example_dict(encoder, inputs, mask, outputs):
  """Convert single h5 record to an example dict."""
  # Inputs
  bases = []
  input_ids = []
  last_idx = -1
  for row in np.argwhere(inputs):
    idx, base_id = row
    idx, base_id = int(idx), int(base_id)
    assert idx > last_idx  # if not, it means 2 True values in 1 row
    # Some rows are all False. Those rows are mapped to UNK_ID.
    while idx != last_idx + 1:
      bases.append(encoder.UNK)
      last_idx += 1
    bases.append(encoder.BASES[base_id])
    last_idx = idx
  assert len(inputs) == len(bases)

  input_ids = encoder.encode(bases)
  input_ids.append(text_encoder.EOS_ID)

  # Targets: mask and output
  targets_mask = [float(v) for v in mask]
  # The output is (n, m); store targets_shape so that it can be reshaped
  # properly on the other end.
  targets = [float(v) for v in outputs.flatten()]
  targets_shape = [int(dim) for dim in outputs.shape]
  assert mask.shape[0] == outputs.shape[0]

  example_keys = ["inputs", "targets_mask", "targets", "targets_shape"]
  ex_dict = dict(
      zip(example_keys, [input_ids, targets_mask, targets, targets_shape]))
  return ex_dict
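The np.argwhere walk can be illustrated on a tiny one-hot matrix; the all-False row falls through to encoder.UNK:

import numpy as np

inputs = np.array([[0, 1, 0, 0],
                   [0, 0, 0, 0],   # all-False row -> mapped to UNK
                   [1, 0, 0, 0]], dtype=bool)
print(np.argwhere(inputs))  # [[0 1], [2 0]] -- the (idx, base_id) pairs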
Example 15: _encode
# Required module: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import EOS_ID [as alias]
def _encode(inputs, encoder, add_eos=True):
  input_ids = encoder.encode(inputs)
  if add_eos:
    input_ids.append(text_encoder.EOS_ID)
  return input_ids
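Usage sketch with a concrete encoder:

from tensor2tensor.data_generators import text_encoder

encoder = text_encoder.ByteTextEncoder()
print(_encode("hi", encoder))                 # byte ids ending in EOS_ID
print(_encode("hi", encoder, add_eos=False))  # byte ids only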