本文整理汇总了 Python 中 skip_thoughts.data.special_words.EOS 属性的典型用法代码示例。如果您正苦于以下问题：Python special_words.EOS 属性的具体用法？Python special_words.EOS 怎么用？Python special_words.EOS 使用的例子？那么恭喜您，这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在模块 skip_thoughts.data.special_words 的用法示例。
在下文中一共展示了 special_words.EOS 属性的 4 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: _preprocess
# 需要导入模块: from skip_thoughts.data import special_words [as 别名]
# 或者: from skip_thoughts.data.special_words import EOS [as 别名]
def _preprocess(self, data, use_eos):
    """Convert raw input strings into sequences of word embeddings.

    Args:
      data: A list of input strings.
      use_eos: Whether to append the end-of-sentence word to each sentence.

    Returns:
      A list with one entry per input string; each entry is the list of
      values returned by `self._word_to_embedding` for that string's tokens.
    """

    def embed_sentence(text):
        tokens = self._tokenize(text)
        if use_eos:
            # The EOS marker goes through the same embedding lookup as any
            # ordinary word.
            tokens.append(special_words.EOS)
        return [self._word_to_embedding(token) for token in tokens]

    return [embed_sentence(text) for text in data]
示例2: _build_vocabulary
# 需要导入模块: from skip_thoughts.data import special_words [as 别名]
# 或者: from skip_thoughts.data.special_words import EOS [as 别名]
def _build_vocabulary(input_files):
    """Loads or builds the model vocabulary.

    If FLAGS.vocab_file is set, the vocabulary is read from that file: one word
    per line, with the zero-based line index used as the word id. Otherwise
    word frequencies are counted over `input_files`, EOS and UNK are given
    their reserved ids, the FLAGS.num_words - 2 most frequent words follow,
    and "vocab.txt" plus "word_counts.txt" are written to FLAGS.output_dir.

    Args:
      input_files: List of pre-tokenized input .txt files.

    Returns:
      vocab: An ordered dictionary of word to id.

    Raises:
      AssertionError: If the existing vocab file contains the same word twice.
    """
    if FLAGS.vocab_file:
        tf.logging.info("Loading existing vocab file.")
        vocab = collections.OrderedDict()
        with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
            for i, line in enumerate(f):
                # Lines may arrive as bytes (Python 2 `str`) or as already
                # decoded text; text has no .decode() on Python 3.
                if isinstance(line, bytes):
                    line = line.decode("utf-8")
                word = line.strip()
                assert word not in vocab, "Attempting to add word twice: %s" % word
                vocab[word] = i
        tf.logging.info("Read vocab of size %d from %s",
                        len(vocab), FLAGS.vocab_file)
        return vocab

    tf.logging.info("Creating vocabulary.")
    num = 0
    wordcount = collections.Counter()
    for input_file in input_files:
        tf.logging.info("Processing file: %s", input_file)
        # Use a context manager so the handle is closed after each file.
        with tf.gfile.FastGFile(input_file) as f:
            for sentence in f:
                wordcount.update(sentence.split())

                num += 1
                if num % 1000000 == 0:
                    tf.logging.info("Processed %d sentences", num)

    tf.logging.info("Processed %d sentences total", num)

    # Materialize as lists: both are indexed by position below, which dict
    # views on Python 3 do not support.
    words = list(wordcount)
    freqs = list(wordcount.values())
    # Indices of `words`, most frequent first.
    sorted_indices = np.argsort(freqs)[::-1]

    vocab = collections.OrderedDict()
    vocab[special_words.EOS] = special_words.EOS_ID
    vocab[special_words.UNK] = special_words.UNK_ID
    for w_id, w_index in enumerate(sorted_indices[0:FLAGS.num_words - 2]):
        vocab[words[w_index]] = w_id + 2  # 0: EOS, 1: UNK.

    tf.logging.info("Created vocab with %d words", len(vocab))

    vocab_file = os.path.join(FLAGS.output_dir, "vocab.txt")
    with tf.gfile.FastGFile(vocab_file, "w") as f:
        f.write("\n".join(vocab))
    tf.logging.info("Wrote vocab file to %s", vocab_file)

    word_counts_file = os.path.join(FLAGS.output_dir, "word_counts.txt")
    with tf.gfile.FastGFile(word_counts_file, "w") as f:
        for i in sorted_indices:
            f.write("%s %d\n" % (words[i], freqs[i]))
    tf.logging.info("Wrote word counts file to %s", word_counts_file)

    return vocab
示例3: _process_input_file
# 需要导入模块: from skip_thoughts.data import special_words [as 别名]
# 或者: from skip_thoughts.data.special_words import EOS [as 别名]
def _process_input_file(filename, vocab, stats):
    """Processes the sentences in an input file.

    Slides a window of three consecutive sentences (predecessor, current,
    successor) over the file and serializes each window whose sentences are
    all non-empty and, when FLAGS.max_sentence_length is set, all strictly
    shorter than that limit. Stops early once FLAGS.max_sentences outputs have
    been recorded in `stats`.

    Args:
      filename: Path to a pre-tokenized input .txt file.
      vocab: A dictionary of word to id.
      stats: A Counter object for statistics. Updated in place with the keys
        "sentences_seen", "sentences_considered", "sentences_too_long" and
        "sentences_output".

    Returns:
      processed: A list of serialized Example protos
    """
    tf.logging.info("Processing input file: %s", filename)
    processed = []

    predecessor = None  # Predecessor sentence (list of words).
    current = None  # Current sentence (list of words).

    # The context manager closes the file even when the loop exits through
    # `break`.
    with tf.gfile.FastGFile(filename) as input_file:
        for successor_str in input_file:
            stats.update(["sentences_seen"])
            successor = successor_str.split()  # Successor sentence (list of words).

            # The first 2 sentences per file will be skipped.
            if predecessor and current and successor:
                stats.update(["sentences_considered"])

                # Note that we are going to insert <EOS> later, so we only
                # allow sentences with strictly less than max_sentence_length
                # to pass.
                too_long = FLAGS.max_sentence_length and any(
                    len(sentence) >= FLAGS.max_sentence_length
                    for sentence in (predecessor, current, successor))
                if too_long:
                    stats.update(["sentences_too_long"])
                else:
                    serialized = _create_serialized_example(
                        predecessor, current, successor, vocab)
                    processed.append(serialized)
                    stats.update(["sentences_output"])

            # Advance the window by one sentence.
            predecessor = current
            current = successor

            sentences_seen = stats["sentences_seen"]
            sentences_output = stats["sentences_output"]
            if sentences_seen and sentences_seen % 100000 == 0:
                tf.logging.info("Processed %d sentences (%d output)",
                                sentences_seen, sentences_output)
            if FLAGS.max_sentences and sentences_output >= FLAGS.max_sentences:
                break

    tf.logging.info("Completed processing file %s", filename)
    return processed
示例4: _build_vocabulary
# 需要导入模块: from skip_thoughts.data import special_words [as 别名]
# 或者: from skip_thoughts.data.special_words import EOS [as 别名]
def _build_vocabulary(input_files):
    """Loads or builds the model vocabulary.

    When FLAGS.vocab_file is set, each line of that file is one word and its
    zero-based line index is the word id. Otherwise the words in `input_files`
    are counted, EOS and UNK receive their reserved ids, and the
    FLAGS.num_words - 2 most frequent words are appended. The resulting
    vocabulary and the word counts are written to "vocab.txt" and
    "word_counts.txt" under FLAGS.output_dir.

    Args:
      input_files: List of pre-tokenized input .txt files.

    Returns:
      vocab: An ordered dictionary of word to id.

    Raises:
      AssertionError: If the existing vocab file contains the same word twice.
    """
    if FLAGS.vocab_file:
        tf.logging.info("Loading existing vocab file.")
        vocab = collections.OrderedDict()
        with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
            for i, line in enumerate(f):
                # Decode only when the reader hands back bytes; text lines
                # have no .decode() on Python 3.
                if isinstance(line, bytes):
                    line = line.decode("utf-8")
                word = line.strip()
                assert word not in vocab, "Attempting to add word twice: %s" % word
                vocab[word] = i
        tf.logging.info("Read vocab of size %d from %s",
                        len(vocab), FLAGS.vocab_file)
        return vocab

    tf.logging.info("Creating vocabulary.")
    num = 0
    wordcount = collections.Counter()
    for input_file in input_files:
        tf.logging.info("Processing file: %s", input_file)
        # Close each input file as soon as it has been counted.
        with tf.gfile.FastGFile(input_file) as sentences:
            for sentence in sentences:
                wordcount.update(sentence.split())

                num += 1
                if num % 1000000 == 0:
                    tf.logging.info("Processed %d sentences", num)

    tf.logging.info("Processed %d sentences total", num)

    # Parallel lists: words[k] occurred freqs[k] times.
    words = list(wordcount)
    freqs = list(wordcount.values())
    # Positions into the parallel lists, ordered by descending frequency.
    sorted_indices = np.argsort(freqs)[::-1]

    vocab = collections.OrderedDict()
    vocab[special_words.EOS] = special_words.EOS_ID
    vocab[special_words.UNK] = special_words.UNK_ID
    for w_id, w_index in enumerate(sorted_indices[0:FLAGS.num_words - 2]):
        vocab[words[w_index]] = w_id + 2  # 0: EOS, 1: UNK.

    tf.logging.info("Created vocab with %d words", len(vocab))

    vocab_file = os.path.join(FLAGS.output_dir, "vocab.txt")
    with tf.gfile.FastGFile(vocab_file, "w") as f:
        f.write("\n".join(vocab))
    tf.logging.info("Wrote vocab file to %s", vocab_file)

    word_counts_file = os.path.join(FLAGS.output_dir, "word_counts.txt")
    with tf.gfile.FastGFile(word_counts_file, "w") as f:
        for i in sorted_indices:
            f.write("%s %d\n" % (words[i], freqs[i]))
    tf.logging.info("Wrote word counts file to %s", word_counts_file)

    return vocab