This article collects typical usage examples of the Python function tensor2tensor.data_generators.generator_utils.maybe_download. If you are wondering what maybe_download does, how to call it, or what it looks like in real code, the curated examples below should help.
The following presents 15 code examples of the maybe_download function, ordered by popularity by default.
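All of the examples share the same call pattern: maybe_download(directory, filename, uri) fetches uri into directory under filename only if that file is not already present, and returns the local filepath either way. A minimal sketch of that call, with a placeholder URL and scratch directory that are not taken from any example, looks like this:

import os

from tensor2tensor.data_generators import generator_utils

tmp_dir = "/tmp/t2t_datagen"                 # placeholder scratch directory
url = "https://example.com/corpus.tar.gz"    # placeholder download URL
filename = os.path.basename(url)

# Downloads `url` into tmp_dir under `filename` only if that file is not
# already there, then returns the local filepath either way.
local_path = generator_utils.maybe_download(tmp_dir, filename, url)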
Example 1: _get_mnist
def _get_mnist(directory):
  """Download all MNIST files to directory unless they are there."""
  for filename in [
      _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME,
      _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME
  ]:
    generator_utils.maybe_download(directory, filename, _MNIST_URL + filename)
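The snippet relies on module-level constants defined elsewhere in the MNIST problem file. The values below are assumptions (the URL in particular), shown only so the example can be read and tried end to end:

# Assumed constant values; the real problem module defines its own.
_MNIST_URL = "https://storage.googleapis.com/cvdf-datasets/mnist/"
_MNIST_TRAIN_DATA_FILENAME = "train-images-idx3-ubyte.gz"
_MNIST_TRAIN_LABELS_FILENAME = "train-labels-idx1-ubyte.gz"
_MNIST_TEST_DATA_FILENAME = "t10k-images-idx3-ubyte.gz"
_MNIST_TEST_LABELS_FILENAME = "t10k-labels-idx1-ubyte.gz"

_get_mnist("/tmp/t2t_datagen")  # fetches the four files only if missing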
Example 2: maybe_prepare_text
def maybe_prepare_text(self, tmp_dir):
  """Download corpus if necessary, decompress, split into multiple text files.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    list of filepaths for local text files.
  """
  compressed_filename = os.path.basename(self.corpus_url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  decompressed_filepath = compressed_filepath[:-4]
  split_file_prefix = decompressed_filepath + "-part-"
  split_filepattern = split_file_prefix + "?????"
  split_files = sorted(tf.gfile.Glob(split_filepattern))
  if not split_files:
    if not tf.gfile.Exists(decompressed_filepath):
      if not tf.gfile.Exists(compressed_filepath):
        generator_utils.maybe_download(
            tmp_dir, compressed_filepath, self.corpus_url)
      assert not subprocess.call(["bunzip2", compressed_filepath])
    assert tf.gfile.Exists(decompressed_filepath)
    assert not subprocess.call([
        "split", "--line-bytes=4M", "--suffix-length=5",
        "--numeric-suffixes", decompressed_filepath, split_file_prefix])
    split_files = sorted(tf.gfile.Glob(split_filepattern))
  assert split_files
  return split_files
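For orientation, here is how the names above relate to one another, assuming a hypothetical corpus_url ending in corpus.txt.bz2 and a tmp_dir of /tmp/data:

compressed_filepath = "/tmp/data/corpus.txt.bz2"      # downloaded archive
decompressed_filepath = compressed_filepath[:-4]      # "/tmp/data/corpus.txt" after bunzip2
split_file_prefix = decompressed_filepath + "-part-"  # prefix handed to `split`
split_filepattern = split_file_prefix + "?????"       # matches ...-part-00000, -part-00001, ...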
Example 3: _get_fashion_mnist
def _get_fashion_mnist(directory):
  """Download all FashionMNIST files to directory unless they are there."""
  # Fashion mnist files have the same names as MNIST.
  # We must choose a separate name (by adding 'fashion-' prefix) in the tmp_dir.
  for filename in [
      _MNIST_TRAIN_DATA_FILENAME, _MNIST_TRAIN_LABELS_FILENAME,
      _MNIST_TEST_DATA_FILENAME, _MNIST_TEST_LABELS_FILENAME
  ]:
    generator_utils.maybe_download(directory,
                                   _FASHION_MNIST_LOCAL_FILE_PREFIX + filename,
                                   _FASHION_MNIST_URL + filename)
Example 4: _compile_data
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_lines, lang2_lines = [], []
  for dataset in datasets:
    url = dataset[0]
    compressed_filename = os.path.basename(url)
    compressed_filepath = os.path.join(tmp_dir, compressed_filename)
    lang1_filename, lang2_filename = dataset[1]
    lang1_filepath = os.path.join(tmp_dir, lang1_filename)
    lang2_filepath = os.path.join(tmp_dir, lang2_filename)

    if not os.path.exists(compressed_filepath):
      generator_utils.maybe_download(tmp_dir, compressed_filename, url)
    if not os.path.exists(lang1_filepath) or not os.path.exists(lang2_filepath):
      mode = "r:gz" if "gz" in compressed_filepath else "r"
      with tarfile.open(compressed_filepath, mode) as corpus_tar:
        corpus_tar.extractall(tmp_dir)
    if ".gz" in lang1_filepath:
      new_filepath = lang1_filepath.strip(".gz")
      generator_utils.gunzip_file(lang1_filepath, new_filepath)
      lang1_filepath = new_filepath
    if ".gz" in lang2_filepath:
      new_filepath = lang2_filepath.strip(".gz")
      generator_utils.gunzip_file(lang2_filepath, new_filepath)
      lang2_filepath = new_filepath

    with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
      with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
        lang1_file_lines = lang1_file.readlines()
        lang2_file_lines = lang2_file.readlines()
        assert len(lang1_file_lines) == len(lang2_file_lines), lang1_filepath
        lang1_lines.extend(lang1_file_lines)
        lang2_lines.extend(lang2_file_lines)

  write_chunk_size = 10000
  assert len(lang1_lines) == len(lang2_lines)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_file:
    i = 0
    while i <= len(lang1_lines):
      for line in lang1_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
        lang1_file.write(line)
      i += 1
    for line in lang1_lines[i * write_chunk_size:]:
      lang1_file.write(line)
  with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_file:
    i = 0
    while i <= len(lang2_lines):
      for line in lang2_lines[i * write_chunk_size:(i + 1) * write_chunk_size]:
        lang2_file.write(line)
      i += 1
    for line in lang2_lines[i * write_chunk_size:]:
      lang2_file.write(line)
  return filename
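The loop above unpacks each entry of `datasets` as a URL plus a pair of member filenames, so a call might look roughly like the sketch below; the archive URL and member paths are invented for illustration only:

# Hypothetical dataset description: (archive_url, (lang1_member, lang2_member)).
_EXAMPLE_DATASETS = [
    ("https://example.com/parallel-corpus.tar.gz",
     ("corpus/train.en", "corpus/train.de")),
]

# Writes /tmp/t2t_datagen/train_compiled.lang1 and .lang2, returns the prefix.
prefix = _compile_data("/tmp/t2t_datagen", _EXAMPLE_DATASETS, "train_compiled")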
Example 5: _maybe_download_corpus
def _maybe_download_corpus(tmp_dir):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
  """
  corpus_url = ("http://www.statmt.org/lm-benchmark/"
                "1-billion-word-language-modeling-benchmark-r13output.tar.gz")
  corpus_filename = os.path.basename(corpus_url)
  corpus_filepath = os.path.join(tmp_dir, corpus_filename)
  if not os.path.exists(corpus_filepath):
    generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
    with tarfile.open(corpus_filepath, "r:gz") as corpus_tar:
      corpus_tar.extractall(tmp_dir)
Example 6: generate_samples
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  """A generator to return data samples. Returns the data generator to return.

  Args:
    data_dir: A string representing the data directory.
    tmp_dir: A string representing the temporary directory and is
        used to download files if not already available.
    dataset_split: Train, Test or Eval.

  Yields:
    Each element yielded is of a Python dict of the form
      {"inputs": "STRING", "targets": "STRING"}
  """
  # TODO(sanyamkapoor): Manually separate train/eval data set.
  csv_file_names = self.pair_files_list
  csv_files = [
      generator_utils.maybe_download(tmp_dir, file_list[0], uri)
      for uri, file_list in csv_file_names
  ]
  for pairs_file in csv_files:
    tf.logging.debug("Reading {}".format(pairs_file))
    with open(pairs_file, "r") as csv_file:
      for line in csv_file:
        reader = csv.reader(StringIO(line))
        for docstring_tokens, function_tokens in reader:
          yield {"inputs": docstring_tokens, "targets": function_tokens}
Example 7: _maybe_download_corpus
def _maybe_download_corpus(tmp_dir, vocab_type):
  """Download and unpack the corpus.

  Args:
    tmp_dir: directory containing dataset.
    vocab_type: which vocabulary are we using.

  Returns:
    The list of names of files.
  """
  filename = os.path.basename(PTB_URL)
  compressed_filepath = generator_utils.maybe_download(
      tmp_dir, filename, PTB_URL)
  ptb_files = []
  ptb_char_files = []

  with tarfile.open(compressed_filepath, "r:gz") as tgz:
    files = []
    # Selecting only relevant files.
    for m in tgz.getmembers():
      if "ptb" in m.name and ".txt" in m.name:
        if "char" in m.name:
          ptb_char_files += [m.name]
        else:
          ptb_files += [m.name]
        files += [m]
    tgz.extractall(tmp_dir, members=files)

  if vocab_type == text_problems.VocabType.CHARACTER:
    return ptb_char_files
  else:
    return ptb_files
Example 8: _maybe_download_corpus
def _maybe_download_corpus(tmp_dir):
  """Download corpus if necessary.

  Args:
    tmp_dir: directory containing dataset.

  Returns:
    filepath of the downloaded corpus file.
  """
  corpus_url = ("https://dumps.wikimedia.org/enwiki/20170620/"
                "enwiki-20170620-pages-articles-multistream.xml.bz2")
  corpus_filename = os.path.basename(corpus_url)
  corpus_filepath = os.path.join(tmp_dir, corpus_filename)
  if not tf.gfile.Exists(corpus_filepath):
    generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url)
  return corpus_filepath
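The function returns the path of the still-compressed dump, so decompression is left to the caller. A hedged sketch of streaming it with the standard library (illustrative only, not necessarily how the surrounding problem class parses the dump):

import bz2

corpus_filepath = _maybe_download_corpus("/tmp/t2t_datagen")
with bz2.open(corpus_filepath, "rt", encoding="utf-8") as dump:
    for line in dump:
        pass  # parse the wiki XML stream line by line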
Example 9: generator
def generator(self, data_dir, tmp_dir, datasets,
              eos_list=None, start_from=0, how_many=0):
  i = 0
  for url, subdir in datasets:
    filename = os.path.basename(url)
    compressed_file = generator_utils.maybe_download(tmp_dir, filename, url)

    read_type = "r:gz" if filename.endswith("tgz") else "r"
    with tarfile.open(compressed_file, read_type) as corpus_tar:
      # Create a subset of files that don't already exist.
      #   tarfile.extractall errors when encountering an existing file
      #   and tarfile.extract is extremely slow
      members = []
      for f in corpus_tar:
        if not os.path.isfile(os.path.join(tmp_dir, f.name)):
          members.append(f)
      corpus_tar.extractall(tmp_dir, members=members)

    data_dir = os.path.join(tmp_dir, "LibriSpeech", subdir)
    data_files = _collect_data(data_dir, "flac", "txt")
    data_pairs = data_files.values()

    encoders = self.feature_encoders(None)
    audio_encoder = encoders["waveforms"]
    text_encoder = encoders["targets"]

    for media_file, text_data in sorted(data_pairs)[start_from:]:
      if how_many > 0 and i == how_many:
        return
      i += 1
      yield {
          "waveforms": audio_encoder.encode(media_file),
          "targets": text_encoder.encode(text_data)
      }
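In this generator each `datasets` entry is a `(url, subdir)` pair: the archive to download and the LibriSpeech subdirectory it unpacks into under tmp_dir. The entries below follow the public OpenSLR naming, but treat the exact URLs as assumptions:

# Hypothetical (url, subdir) entries for the generator above.
_EXAMPLE_LIBRISPEECH_DATASETS = [
    ("http://www.openslr.org/resources/12/dev-clean.tar.gz", "dev-clean"),
    ("http://www.openslr.org/resources/12/test-clean.tar.gz", "test-clean"),
]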
Example 10: load_examples
def load_examples(tmp_dir, prop_train=0.09, prop_val=0.01):
  """Loads examples from the tsv file.

  Args:
    tmp_dir: temp directory.
    prop_train: proportion of the train data
    prop_val: proportion of the validation data

  Returns:
    All examples in the dataset plus train, test, and development splits.
  """
  infile = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tf.logging.info('Loading examples')

  all_examples = []
  for i, d in enumerate(csv.DictReader(gzip.open(infile), delimiter='\t')):
    if i % 100000 == 0:
      tf.logging.info('%d examples have been loaded....' % i)
    ex = {x: int(y) if y.isdigit() else y for x, y in d.items()}
    all_examples.append(ex)

  random.seed(1)
  random.shuffle(all_examples)
  n_train = int(len(all_examples) * prop_train)
  n_val = n_train + int(len(all_examples) * prop_val)
  train = all_examples[:n_train]
  val = all_examples[n_train:n_val]
  test = []
  for e in all_examples[n_val:]:
    if e['n_intervening'] == e['n_diff_intervening']:
      test.append(e)

  return all_examples, train, val, test
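A brief usage sketch of the returned splits; `_TAR` and `_URL` are constants of the problem module, so only the temp directory (a placeholder here) is supplied by the caller:

# With the default proportions, roughly 9% of the shuffled examples become
# train, 1% validation, and the filtered remainder becomes test.
all_examples, train, val, test = load_examples("/tmp/t2t_datagen")
tf.logging.info("train=%d val=%d test=%d", len(train), len(val), len(test))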
Example 11: _prepare_lambada_data
def _prepare_lambada_data(tmp_dir, data_dir, vocab_size, vocab_filename):
  """Downloading and preparing the dataset.

  Args:
    tmp_dir: temp directory
    data_dir: data directory
    vocab_size: size of vocabulary
    vocab_filename: name of vocab file
  """
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MakeDirs(data_dir)

  file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
  tar_all = tarfile.open(file_path)
  tar_all.extractall(tmp_dir)
  tar_all.close()
  tar_train = tarfile.open(os.path.join(tmp_dir, "train-novels.tar"))
  tar_train.extractall(tmp_dir)
  tar_train.close()

  vocab_path = os.path.join(data_dir, vocab_filename)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(os.path.join(tmp_dir, _VOCAB), "r") as infile:
      reader = csv.reader(infile, delimiter="\t")
      words = [row[0] for row in reader]
      words = [_UNK] + words[:vocab_size]
    with tf.gfile.GFile(vocab_path, "w") as outfile:
      outfile.write("\n".join(words))
Example 12: _get_mscoco
def _get_mscoco(directory):
  """Download and extract MSCOCO datasets to directory unless it is there."""
  for url in _MSCOCO_URLS:
    filename = os.path.basename(url)
    download_url = os.path.join(_MSCOCO_ROOT_URL, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    unzip_dir = os.path.join(directory, filename.strip(".zip"))
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)
Example 13: _get_vqa_v2_image_raw_dataset
def _get_vqa_v2_image_raw_dataset(directory, image_root_url, image_urls):
  """Extract the VQA V2 image data set to directory unless it's there."""
  for url in image_urls:
    filename = os.path.basename(url)
    download_url = os.path.join(image_root_url, url)
    path = generator_utils.maybe_download(directory, filename, download_url)
    unzip_dir = os.path.join(directory, filename.strip(".zip"))
    if not tf.gfile.Exists(unzip_dir):
      zipfile.ZipFile(path, "r").extractall(directory)
Example 14: _download_and_parse_dataset
def _download_and_parse_dataset(tmp_dir, train):
  """Downloads and prepares the dataset to be parsed by the data_generator."""
  file_path = generator_utils.maybe_download(tmp_dir, _SNLI_ZIP, _SNLI_URL)
  zip_ref = zipfile.ZipFile(file_path, 'r')
  zip_ref.extractall(tmp_dir)
  zip_ref.close()

  file_name = 'train' if train else 'dev'
  dataset_file_path = os.path.join(tmp_dir, _SNLI_DATA_PATH % file_name)
  _parse_dataset(dataset_file_path, tmp_dir, train)
Example 15: _maybe_download_corpora
def _maybe_download_corpora(self, tmp_dir):
  sst_binary_filename = "SST-2.zip"
  sst_binary_finalpath = os.path.join(tmp_dir, "SST-2")
  if not tf.gfile.Exists(sst_binary_finalpath):
    zip_filepath = generator_utils.maybe_download(
        tmp_dir, sst_binary_filename, self._SST2_URL)
    zip_ref = zipfile.ZipFile(zip_filepath, "r")
    zip_ref.extractall(tmp_dir)
    zip_ref.close()
  return sst_binary_finalpath