This article collects typical usage examples of the Python method tensor2tensor.data_generators.generator_utils.get_or_generate_vocab. If you have been wondering what generator_utils.get_or_generate_vocab does, how to call it, or what real uses of it look like, the curated examples below should help. You can also browse further usage examples from the containing module tensor2tensor.data_generators.generator_utils.
The following presents 8 code examples of the generator_utils.get_or_generate_vocab method, sorted by popularity by default.
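Before the individual examples, here is a minimal sketch of the call pattern they all share. The directories, URL, filenames, and vocab name below are placeholders for illustration, not taken from any real problem definition.

from tensor2tensor.data_generators import generator_utils

# Hypothetical locations; adjust to your setup.
data_dir = "/tmp/t2t_data"  # the vocab file is written here and reloaded later
tmp_dir = "/tmp/t2t_tmp"    # scratch space for downloaded corpora
# Each source is [url, [filenames found after downloading/unpacking that url]].
sources = [["http://example.com/corpus.tgz", ["corpus.txt"]]]

# Builds data_dir/vocab.example.32768 on the first call, reuses it on later
# calls, and returns a SubwordTextEncoder.
vocab = generator_utils.get_or_generate_vocab(
    data_dir, tmp_dir, "vocab.example.32768", 2**15, sources)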
Example 1: generate_samples
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  # Download the corpus archive and unpack it into tmp_dir.
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)
  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)
  # Build (or load) the subword vocabulary before reading samples.
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  source_file = os.path.join(tmp_dir, tag + ".modern")
  target_file = os.path.join(tmp_dir, tag + ".original")
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example 2: generate_samples
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)
  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  # Download the corpus archive and unpack it into tmp_dir.
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)
  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)
  # Build (or load) the subword vocabulary before reading samples.
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
Example 3: generator
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def generator(self, data_dir, tmp_dir, train):
  datasets = self.get_datasets(train)
  # Build separate source and target vocabs from the training datasets.
  source_datasets = [[item[0], [item[1][0]]]
                     for item in self.get_datasets(train=True)]
  target_datasets = [[item[0], [item[1][1]]]
                     for item in self.get_datasets(train=True)]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size,
      source_datasets)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size,
      target_datasets)
  tag = "train" if train else "dev"
  data_path = compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
  return bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2",
                                   source_vocab, target_vocab, EOS)
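Note that this older-style generator derives both vocabularies from get_datasets(train=True) regardless of the train flag, so the dev split is tokenized with the same source and target encoders that the training split produced.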
Example 4: vocab_data_files
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.dataset_url(problem.DatasetSplit.TRAIN)
Example 5: vocab_data_files
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def vocab_data_files(self):
  """Files to be passed to get_or_generate_vocab."""
  return self.source_data_files(problem.DatasetSplit.TRAIN)
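Examples 4 and 5 return the training file specification unchanged. Judging from how Examples 3 and 7 index these entries (item[0] is a download URL, item[1] a list of filenames), the list handed to get_or_generate_vocab has the shape below; the URL and paths here are hypothetical placeholders.

# Hypothetical value of vocab_data_files(); placeholders only.
[["http://example.com/training-parallel.tgz",
  ["training/corpus.lang1", "training/corpus.lang2"]]]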
Example 6: generate_samples
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = compile_data(tmp_dir, datasets,
                           "%s-compiled-%s" % (self.name, tag))
  # Build (or load) the subword vocabulary before reading samples.
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2")
Example 7: generate_encoded_samples
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  # Separate subword vocabularies for the source and target languages.
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir,
      tmp_dir,
      self.source_vocab_name,
      self.approx_vocab_size,
      source_datasets,
      file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir,
      tmp_dir,
      self.target_vocab_name,
      self.approx_vocab_size,
      target_datasets,
      file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
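Unlike the earlier examples, this one passes file_byte_budget=1e8 explicitly, which caps how many bytes get_or_generate_vocab reads from each source file while building the subword vocabulary, so very large corpora are sampled rather than read in full.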
Example 8: generate_encoded_samples
# Required import: from tensor2tensor.data_generators import generator_utils [as alias]
# Or: from tensor2tensor.data_generators.generator_utils import get_or_generate_vocab [as alias]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  # Separate subword vocabularies for the source and target languages.
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir,
      tmp_dir,
      self.source_vocab_name,
      self.approx_vocab_size,
      source_datasets,
      file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir,
      tmp_dir,
      self.target_vocab_name,
      self.approx_vocab_size,
      target_datasets,
      file_byte_budget=1e8,
      max_subtoken_length=self.max_subtoken_length)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
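This example differs from Example 7 only in the extra max_subtoken_length argument. If the generator_utils implementation matches its usual behavior, this bounds the length of candidate subtokens considered during vocabulary construction, which keeps build time and memory manageable for languages with large character inventories such as Chinese.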