This page collects typical usage examples of the Python data.SENTENCE_START attribute. If you are wondering what data.SENTENCE_START does, how to use it, or want to see it in real code, the curated examples below may help. You can also explore the data module that defines this attribute in more detail.
The following presents 7 code examples of data.SENTENCE_START, sorted by popularity by default.
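For reference: in the textsum-style data.py these examples build on, the sentence markers are plain string constants. A minimal sketch using the textsum default values (verify against your own data module, which may differ):

# Minimal sketch of the relevant constants in a textsum-style data.py.
# These values are the textsum defaults and are an assumption here.
PAD_TOKEN = '<PAD>'
UNKNOWN_TOKEN = '<UNK>'
SENTENCE_START = '<s>'
SENTENCE_END = '</s>'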
Example 1: _AddSentenceBoundary
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def _AddSentenceBoundary(self, text):
  """Pads text with start and end of sentence tokens if needed.

  Args:
    text: text to be padded.

  Returns:
    The text with start and end tokens.
  """
  if not text.startswith(data.SENTENCE_START):
    text = data.SENTENCE_START + ' ' + text
  if not text.endswith(data.SENTENCE_END):
    text = text + ' ' + data.SENTENCE_END
  return text
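The two guard conditions make the padding idempotent: calling the method on already-padded text leaves it unchanged. A standalone sketch of the same behavior, assuming the marker values SENTENCE_START = '<s>' and SENTENCE_END = '</s>':

# Standalone sketch of the padding behavior; marker values are assumed.
SENTENCE_START, SENTENCE_END = '<s>', '</s>'

def add_sentence_boundary(text):
    if not text.startswith(SENTENCE_START):
        text = SENTENCE_START + ' ' + text
    if not text.endswith(SENTENCE_END):
        text = text + ' ' + SENTENCE_END
    return text

padded = add_sentence_boundary('the cat sat .')
print(padded)                          # <s> the cat sat . </s>
print(add_sentence_boundary(padded))   # unchanged: padding is idempotent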
Example 2: _Decode
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def _Decode(self, saver, sess):
  """Restores a checkpoint and decodes from it.

  Args:
    saver: Tensorflow checkpoint saver.
    sess: Tensorflow session.

  Returns:
    True on success, False otherwise.
  """
  ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
  if not (ckpt_state and ckpt_state.model_checkpoint_path):
    tf.logging.info('No model to decode yet at %s', FLAGS.log_root)
    return False

  tf.logging.info('checkpoint path %s', ckpt_state.model_checkpoint_path)
  ckpt_path = os.path.join(
      FLAGS.log_root, os.path.basename(ckpt_state.model_checkpoint_path))
  tf.logging.info('renamed checkpoint path %s', ckpt_path)
  saver.restore(sess, ckpt_path)

  self._decode_io.ResetFiles()
  for _ in xrange(FLAGS.decode_batches_per_ckpt):
    (article_batch, _, _, article_lens, _, _, origin_articles,
     origin_abstracts) = self._batch_reader.NextBatch()
    for i in xrange(self._hps.batch_size):
      bs = beam_search.BeamSearch(
          self._model, self._hps.batch_size,
          self._vocab.WordToId(data.SENTENCE_START),
          self._vocab.WordToId(data.SENTENCE_END),
          self._hps.dec_timesteps)

      # Replicate the i-th example across the whole batch.
      article_batch_cp = article_batch.copy()
      article_batch_cp[:] = article_batch[i:i+1]
      article_lens_cp = article_lens.copy()
      article_lens_cp[:] = article_lens[i:i+1]
      best_beam = bs.BeamSearch(sess, article_batch_cp, article_lens_cp)[0]
      decode_output = [int(t) for t in best_beam.tokens[1:]]
      self._DecodeBatch(
          origin_articles[i], origin_abstracts[i], decode_output)

  return True
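Note how SENTENCE_START and SENTENCE_END ids seed and terminate the beam search, and how the slice-and-copy trick feeds a single article to a searcher that expects a full batch: every row of the batch is overwritten with the i-th example. A small numpy sketch of just that step (shapes made up for illustration):

import numpy as np

# Broadcast row i into every row of the batch so a batch-shaped beam
# search decodes one article at a time.
article_batch = np.arange(12).reshape(4, 3)  # (batch_size=4, enc_timesteps=3)
i = 2
article_batch_cp = article_batch.copy()
article_batch_cp[:] = article_batch[i:i+1]
print((article_batch_cp == article_batch[i]).all())  # True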
Example 3: main
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def main(unused_argv):
  vocab = data.Vocab(FLAGS.vocab_path, 1000000)
  # Check for presence of required special tokens.
  assert vocab.CheckVocab(data.PAD_TOKEN) > 0
  assert vocab.CheckVocab(data.UNKNOWN_TOKEN) >= 0
  assert vocab.CheckVocab(data.SENTENCE_START) > 0
  assert vocab.CheckVocab(data.SENTENCE_END) > 0

  batch_size = 4
  if FLAGS.mode == 'decode':
    batch_size = FLAGS.beam_size

  hps = seq2seq_attention_model.HParams(
      mode=FLAGS.mode,  # train, eval, decode
      min_lr=0.01,  # min learning rate.
      lr=0.15,  # learning rate
      batch_size=batch_size,
      enc_layers=4,
      enc_timesteps=120,
      dec_timesteps=30,
      min_input_len=2,  # discard articles/summaries shorter than this
      num_hidden=256,  # for rnn cell
      emb_dim=128,  # If 0, don't use embedding
      max_grad_norm=2,
      num_softmax_samples=4096)  # If 0, no sampled softmax.

  batcher = batch_reader.Batcher(
      FLAGS.data_path, vocab, hps, FLAGS.article_key,
      FLAGS.abstract_key, FLAGS.max_article_sentences,
      FLAGS.max_abstract_sentences, bucketing=FLAGS.use_bucketing,
      truncate_input=FLAGS.truncate_input)
  tf.set_random_seed(FLAGS.random_seed)

  if hps.mode == 'train':
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        hps, vocab, num_gpus=FLAGS.num_gpus)
    _Train(model, batcher)
  elif hps.mode == 'eval':
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        hps, vocab, num_gpus=FLAGS.num_gpus)
    _Eval(model, batcher, vocab=vocab)
  elif hps.mode == 'decode':
    # Only need to restore the 1st step and reuse it since
    # we keep and feed in state for each step's output.
    decode_mdl_hps = hps._replace(dec_timesteps=1)
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
    decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps, vocab)
    decoder.DecodeLoop()
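The decode branch relies on HParams being a collections.namedtuple, so hps._replace(dec_timesteps=1) returns a modified copy without touching the original. A minimal sketch of that mechanism, with the field list trimmed to the two that matter here:

import collections

# _replace on a namedtuple returns a copy with the given fields changed;
# the original tuple is immutable and stays as-is.
HParams = collections.namedtuple('HParams', ['mode', 'dec_timesteps'])
hps = HParams(mode='decode', dec_timesteps=30)
decode_mdl_hps = hps._replace(dec_timesteps=1)
print(decode_mdl_hps.dec_timesteps)  # 1
print(hps.dec_timesteps)             # 30 (unchanged)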
Example 4: _fill_example_queue
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def _fill_example_queue(self):
  """Reads data from file and processes it into Examples, which are placed into the example queue."""
  input_gen = self.text_generator(
      data.example_generator(self._data_path, self._single_pass))
  cnt = 0
  fail = 0
  while True:
    try:
      # Read the next example from file. article and abstract are
      # both strings.
      (article_id, article_text, abstract_sents, labels,
       section_names, sections) = six.next(input_gen)
    except StopIteration:  # if there are no more examples:
      tf.logging.info(
          "The example generator for this example queue filling thread has exhausted data.")
      if self._single_pass:
        tf.logging.info(
            "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
        self._finished_reading = True
        break
      else:
        raise Exception(
            "single_pass mode is off but the example generator is out of data; error.")

    # Use the <s> and </s> tags in the abstract to get a list of sentences.
    # abstract_sentences = [sent.strip() for sent in data.abstract2sents(''.join(abstract_sents))]
    abstract_sentences = [e.replace(data.SENTENCE_START, '').replace(data.SENTENCE_END, '').strip()
                          for e in abstract_sents]

    # Skip articles containing this long run of underscores (layout residue).
    if "_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ __ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _" in article_text:
      continue
    # Require at least 2 sections; some articles do not have sections.
    if not sections or len(sections) <= 1:
      continue
    # Do not process articles that are too long.
    if len(article_text) > self._hps.max_article_sents:
      continue
    # Do not process documents with unusually long or short abstracts.
    abst_len = len(' '.join(abstract_sentences).split())
    if abst_len > self._hps.max_abstract_len or\
        abst_len < self._hps.min_abstract_len:
      continue

    # Process into an Example.
    example = Example(article_text, abstract_sentences, article_id, sections,
                      section_names, labels, self._vocab, self._hps)
    # Place the Example in the example queue.
    if example.discard:
      fail += 1
    cnt += 1
    if not example.discard:
      self._example_queue.put(example)
    if cnt % 100 == 0:
      print('total in queue: {} of {}'.format(cnt - fail, cnt))
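The list comprehension above strips the <s>/</s> markers from each abstract sentence rather than splitting on them. A standalone sketch of just that step, again assuming the textsum marker values:

# Sketch of the tag stripping; marker values are assumed.
SENTENCE_START, SENTENCE_END = '<s>', '</s>'
abstract_sents = ['<s> first sentence . </s>', '<s> second sentence . </s>']
abstract_sentences = [s.replace(SENTENCE_START, '').replace(SENTENCE_END, '').strip()
                      for s in abstract_sents]
print(abstract_sentences)  # ['first sentence .', 'second sentence .']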
Example 5: main
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def main(unused_argv):
  config = importlib.import_module('config.%s' % FLAGS.config)
  for argument in FLAGS.override.split(','):
    if '=' in argument:
      name = argument.split('=')[0]
      value = type(getattr(config, name))(argument.split('=')[1])
      setattr(config, name, value)

  config.input_vocab = data.Vocab(config.input_vocab_file,
                                  config.max_vocab_size)  # Max IDs
  if config.input_vocab.WordToId(data.PAD_TOKEN) <= 0:
    raise ValueError('Invalid PAD_TOKEN id.')
  # The id of UNKNOWN_TOKEN should be 0 for the copynet model.
  if config.input_vocab.WordToId(data.UNKNOWN_TOKEN) != 0:
    raise ValueError('Invalid UNKNOWN_TOKEN id.')
  if config.input_vocab.WordToId(data.SENTENCE_START) <= 0:
    raise ValueError('Invalid SENTENCE_START id.')
  if config.input_vocab.WordToId(data.SENTENCE_END) <= 0:
    raise ValueError('Invalid SENTENCE_END id.')

  if config.output_vocab_file:
    config.output_vocab = data.Vocab(config.output_vocab_file,
                                     config.max_vocab_size)  # Max IDs
    if config.output_vocab.WordToId(data.PAD_TOKEN) <= 0:
      raise ValueError('Invalid PAD_TOKEN id.')
    # The id of UNKNOWN_TOKEN should be 0 for the copynet model.
    if config.output_vocab.WordToId(data.UNKNOWN_TOKEN) != 0:
      raise ValueError('Invalid UNKNOWN_TOKEN id.')
    if config.output_vocab.WordToId(data.SENTENCE_START) <= 0:
      raise ValueError('Invalid SENTENCE_START id.')
    if config.output_vocab.WordToId(data.SENTENCE_END) <= 0:
      raise ValueError('Invalid SENTENCE_END id.')
  else:
    config.output_vocab = config.input_vocab

  train_batcher = config.Batcher(config.train_set, config)
  valid_batcher = config.Batcher(config.valid_set, config)
  tf.set_random_seed(config.random_seed)

  if FLAGS.mode == 'train':
    model = config.Model(config, 'train', num_gpus=FLAGS.num_gpus)
    _Train(model, config, train_batcher)
  elif FLAGS.mode == 'eval':
    config.dropout_rnn = 1.0
    config.dropout_emb = 1.0
    model = config.Model(config, 'eval', num_gpus=FLAGS.num_gpus)
    _Eval(model, config, valid_batcher)
  elif FLAGS.mode == 'decode':
    config.dropout_rnn = 1.0
    config.dropout_emb = 1.0
    config.batch_size = config.beam_size
    model = config.Model(config, 'decode', num_gpus=FLAGS.num_gpus)
    decoder = decode.BeamSearch(model, valid_batcher, config)
    decoder.DecodeLoop()
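The FLAGS.override loop implements a small name=value override language: each value string is coerced to the type of the config attribute it replaces. A self-contained sketch of the same technique (the Config class and its fields here are hypothetical):

# Sketch of the override parsing; Config and its fields are made up.
class Config(object):
    lr = 0.15
    batch_size = 4

config = Config()
for argument in 'lr=0.2,batch_size=8'.split(','):
    if '=' in argument:
        name, raw = argument.split('=', 1)
        # Coerce the string to the type of the existing attribute.
        setattr(config, name, type(getattr(config, name))(raw))

print(config.lr, config.batch_size)  # 0.2 8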
Example 6: _DecodeBatch
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def _DecodeBatch(self, source, targets, dec_outputs):
  """Converts ids to words and writes the results.

  Args:
    source: The original source string.
    targets: The human (correct) target strings.
    dec_outputs: The target word ids output by the model.

  Returns:
    List of metric scores for this batch.
  """
  output = ['None'] * len(dec_outputs)
  source_words = source.split()
  for i in range(len(dec_outputs)):
    if dec_outputs[i] < 0:  # it's from the copier
      position = -1 - dec_outputs[i]
      if position < len(source_words):
        output[i] = source_words[position]
      else:
        output[i] = '<out_of_bound>'
    else:  # it's from the generator, or unk (if 0)
      output[i] = data.Ids2Words([dec_outputs[i]], self._output_vocab)[0]

  source = source.replace(data.SENTENCE_START + ' ', '').replace(
      ' ' + data.SENTENCE_END, '')
  targets = [
      x.replace(data.SENTENCE_START + ' ', '').replace(
          ' ' + data.SENTENCE_END, '') for x in targets
  ]
  decoded = ' '.join(output)
  end_p = decoded.find(data.SENTENCE_END, 0)
  if end_p != -1:
    decoded = decoded[:end_p].strip()

  bleu_score = metrics.get_bleu(decoded, targets)
  f1_score = metrics.get_f1(decoded, targets)
  exact_score = metrics.get_exact(decoded, targets)
  self._decode_io.Write(source, targets, decoded, bleu_score,
                        f1_score, exact_score)
  return bleu_score, f1_score, exact_score
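This example encodes the copy-vs-generate decision in the sign of each output id: negative ids point back into the source (position = -1 - id), non-negative ids index the output vocabulary, with 0 reserved for UNK in this copynet setup. A standalone sketch of that id convention (the vocab mapping here is hypothetical):

# Sketch of the copy-mechanism id decoding; vocab is made up.
vocab = {0: '<unk>', 1: 'hello', 2: 'world'}
source_words = 'the quick brown fox'.split()

def id_to_word(dec_id):
    if dec_id < 0:  # copied from the source sequence
        position = -1 - dec_id
        return source_words[position] if position < len(source_words) else '<out_of_bound>'
    return vocab.get(dec_id, '<unk>')  # generated from the vocabulary

print([id_to_word(t) for t in [1, -1, -3, 2]])  # ['hello', 'the', 'brown', 'world']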
Example 7: main
# Required import: import data [as alias]
# Or: from data import SENTENCE_START [as alias]
def main(unused_argv):
  vocab = data.Vocab(FLAGS.vocab_path, 1000000)
  # Check for presence of required special tokens.
  assert vocab.WordToId(data.PAD_TOKEN) > 0
  assert vocab.WordToId(data.UNKNOWN_TOKEN) >= 0
  assert vocab.WordToId(data.SENTENCE_START) > 0
  assert vocab.WordToId(data.SENTENCE_END) > 0

  batch_size = 4
  if FLAGS.mode == 'decode':
    batch_size = FLAGS.beam_size

  hps = seq2seq_attention_model.HParams(
      mode=FLAGS.mode,  # train, eval, decode
      min_lr=0.01,  # min learning rate.
      lr=0.15,  # learning rate
      batch_size=batch_size,
      enc_layers=4,
      enc_timesteps=120,
      dec_timesteps=30,
      min_input_len=2,  # discard articles/summaries shorter than this
      num_hidden=256,  # for rnn cell
      emb_dim=128,  # If 0, don't use embedding
      max_grad_norm=2,
      num_softmax_samples=4096)  # If 0, no sampled softmax.

  batcher = batch_reader.Batcher(
      FLAGS.data_path, vocab, hps, FLAGS.article_key,
      FLAGS.abstract_key, FLAGS.max_article_sentences,
      FLAGS.max_abstract_sentences, bucketing=FLAGS.use_bucketing,
      truncate_input=FLAGS.truncate_input)
  tf.set_random_seed(FLAGS.random_seed)

  if hps.mode == 'train':
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        hps, vocab, num_gpus=FLAGS.num_gpus)
    _Train(model, batcher)
  elif hps.mode == 'eval':
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        hps, vocab, num_gpus=FLAGS.num_gpus)
    _Eval(model, batcher, vocab=vocab)
  elif hps.mode == 'decode':
    # Only need to restore the 1st step and reuse it since
    # we keep and feed in state for each step's output.
    decode_mdl_hps = hps._replace(dec_timesteps=1)
    model = seq2seq_attention_model.Seq2SeqAttentionModel(
        decode_mdl_hps, vocab, num_gpus=FLAGS.num_gpus)
    decoder = seq2seq_attention_decode.BSDecoder(model, batcher, hps, vocab)
    decoder.DecodeLoop()