This article collects typical usage examples of the tensor2tensor.data_generators.text_encoder.ByteTextEncoder class in Python. If you have been wondering how to use text_encoder.ByteTextEncoder, or what it is good for, the curated code examples below may help. You can also explore the containing module, tensor2tensor.data_generators.text_encoder, for related usage.
The following shows 7 code examples of text_encoder.ByteTextEncoder, sorted by popularity by default.
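Before the examples, a quick orientation: ByteTextEncoder encodes raw UTF-8 bytes, shifting each byte value past the reserved ids (PAD=0, EOS=1 by default). A minimal illustrative round trip:

from tensor2tensor.data_generators import text_encoder

enc = text_encoder.ByteTextEncoder()
ids = enc.encode("ab")   # byte values 97, 98 shifted by 2 reserved ids -> [99, 100]
print(enc.decode(ids))   # prints "ab"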
Example 1: vocab_type
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def vocab_type(self):
"""What kind of vocabulary to use.
`VocabType`s:
* `SUBWORD`: `SubwordTextEncoder`, an invertible wordpiece vocabulary.
Must provide `self.approx_vocab_size`. Generates the vocabulary based on
the training data. To limit the number of samples the vocab generation
looks at, override `self.max_samples_for_vocab`. Recommended and
default.
* `CHARACTER`: `ByteTextEncoder`, encode raw bytes.
* `TOKEN`: `TokenTextEncoder`, vocabulary based on a file. Must provide a
vocabulary file yourself (`TokenTextEncoder.store_to_file`) because one
will not be generated for you. The vocab file should be stored in
`data_dir/` with the name specified by `self.vocab_filename`.
Returns:
VocabType constant
"""
return VocabType.SUBWORD
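For context, a minimal sketch of overriding this property in a Text2TextProblem subclass to get byte-level encoding (the subclass name is hypothetical; VocabType.CHARACTER is the constant the docstring above describes):

from tensor2tensor.data_generators import text_problems

class MyByteLevelProblem(text_problems.Text2TextProblem):  # hypothetical problem

  @property
  def vocab_type(self):
    # CHARACTER selects ByteTextEncoder, so no vocab file is generated.
    return text_problems.VocabType.CHARACTER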
Example 2: tabbed_parsing_character_generator
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def tabbed_parsing_character_generator(tmp_dir, train):
"""Generate source and target data from a single file."""
character_vocab = text_encoder.ByteTextEncoder()
filename = "parsing_{0}.pairs".format("train" if train else "dev")
pair_filepath = os.path.join(tmp_dir, filename)
return text_problems.text2text_generate_encoded(
text_problems.text2text_txt_tab_iterator(pair_filepath), character_vocab)
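A hedged usage sketch: the pairs file holds one tab-separated source/target pair per line, and the generator yields already-encoded examples (the tmp_dir path below is illustrative):

# Assumes /tmp/parsing_data contains parsing_train.pairs.
gen = tabbed_parsing_character_generator("/tmp/parsing_data", train=True)
for example in gen:
  # Each example is a dict of byte-level ids: {"inputs": [...], "targets": [...]}.
  print(example["inputs"][:10])
  break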
Example 3: feature_encoders
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def feature_encoders(self, _):
return {
"inputs": text_encoder.TextEncoder(),
"targets": text_encoder.ByteTextEncoder(),
}
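The asymmetry above is deliberate: the base TextEncoder simply parses space-separated integer ids (shifted by the reserved PAD/EOS ids), while ByteTextEncoder encodes raw text byte by byte. A small illustrative comparison, assuming the default of two reserved tokens:

from tensor2tensor.data_generators import text_encoder

print(text_encoder.TextEncoder().encode("3 7"))     # ints + offset -> [5, 9]
print(text_encoder.ByteTextEncoder().encode("ab"))  # bytes + offset -> [99, 100]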
Example 4: feature_encoders
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def feature_encoders(self, data_dir):
if self.is_character_level:
encoder = text_encoder.ByteTextEncoder()
else:
vocab_filename = os.path.join(
data_dir, "vocab.ende.%d" % self.targeted_vocab_size)
encoder = text_encoder.SubwordTextEncoder(vocab_filename)
input_encoder = text_encoder.ImageEncoder(channels=self.num_channels)
return {"inputs": input_encoder, "targets": encoder}
Example 5: feature_encoders
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def feature_encoders(self, data_dir):
if self.is_character_level:
encoder = text_encoder.ByteTextEncoder()
else:
vocab_filename = os.path.join(
data_dir, self.vocab_problem.vocab_filename)
encoder = text_encoder.SubwordTextEncoder(vocab_filename)
input_encoder = text_encoder.ImageEncoder(channels=self.num_channels)
return {"inputs": input_encoder, "targets": encoder}
Example 6: generator
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def generator(self, data_dir, tmp_dir, train):
'''
Generate the vocab and then build train and validation t2t-datagen files.
Four .txt files have to be present in the data_dir directory:
trainSource.txt
trainTarget.txt
devSource.txt
devTarget.txt
Params:
:train: Whether we are in train mode or not.
'''
character_vocab = text_encoder.ByteTextEncoder()
mode = 'train' if train else 'dev'
print('t2t_csaky_log: ' + mode + ' data generation activated.')
sourcePath = os.path.join(data_dir, mode + 'Source.txt')
targetPath = os.path.join(data_dir, mode + 'Target.txt')
# Try to find the txt files.
if os.path.isfile(sourcePath) and os.path.isfile(targetPath):
print('t2t_csaky_log: Generating ' + mode + ' files in ' + data_dir)
return translate.character_generator(sourcePath,
targetPath,
character_vocab,
EOS)
else:
print('t2t_csaky_log: ' + mode +
' source or target file not found, please check ' +
'that the following files exist in your ' + data_dir +
' directory and rerun this program:')
print(' trainSource.txt')
print(' trainTarget.txt')
print(' devSource.txt')
print(' devTarget.txt')
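An illustrative way to drive this generator (the problem instance and paths are hypothetical; the four .txt files are the ones the docstring requires):

# Assumes data_dir contains trainSource.txt and trainTarget.txt.
for sample in problem.generator("/path/to/data_dir", "/path/to/tmp_dir", train=True):
  # Each sample is a dict of byte-level ids ending with the EOS id.
  print(sample)
  break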
Example 7: main
# Required import: from tensor2tensor.data_generators import text_encoder [as alias]
# Or: from tensor2tensor.data_generators.text_encoder import ByteTextEncoder [as alias]
def main(_):
"""Convert a file to examples."""
if FLAGS.subword_text_encoder_filename:
encoder = text_encoder.SubwordTextEncoder(
FLAGS.subword_text_encoder_filename)
elif FLAGS.token_text_encoder_filename:
encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
elif FLAGS.byte_text_encoder:
encoder = text_encoder.ByteTextEncoder()
else:
encoder = None
reader = tf.python_io.tf_record_iterator(FLAGS.input_filename)
total_sequences = 0
total_input_tokens = 0
total_target_tokens = 0
nonpadding_input_tokens = 0
nonpadding_target_tokens = 0
max_input_length = 0
max_target_length = 0
for record in reader:
x = tf.train.Example()
x.ParseFromString(record)
inputs = [int(i) for i in x.features.feature["inputs"].int64_list.value]
targets = [int(i) for i in x.features.feature["targets"].int64_list.value]
    if FLAGS.print_inputs:
      # Parenthesize the conditional so the label is printed in both branches.
      print("INPUTS:\n" + (encoder.decode(inputs) if encoder else str(inputs)))
    if FLAGS.print_targets:
      print("TARGETS:\n" + (encoder.decode(targets) if encoder else str(targets)))
nonpadding_input_tokens += len(inputs) - inputs.count(0)
nonpadding_target_tokens += len(targets) - targets.count(0)
total_input_tokens += len(inputs)
total_target_tokens += len(targets)
total_sequences += 1
max_input_length = max(max_input_length, len(inputs))
max_target_length = max(max_target_length, len(targets))
if FLAGS.print_all:
for k, v in six.iteritems(x.features.feature):
print("%s: %s" % (k, v.int64_list.value))
print("total_sequences: %d" % total_sequences)
print("total_input_tokens: %d" % total_input_tokens)
print("total_target_tokens: %d" % total_target_tokens)
print("nonpadding_input_tokens: %d" % nonpadding_input_tokens)
print("nonpadding_target_tokens: %d" % nonpadding_target_tokens)
print("max_input_length: %d" % max_input_length)
print("max_target_length: %d" % max_target_length)