本文整理汇总了Python中utils.Vocab类的典型用法代码示例。如果您正苦于以下问题:Python utils.Vocab的具体用法?Python utils.Vocab怎么用?Python utils.Vocab使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块utils的用法示例。
在下文中一共展示了utils.Vocab类的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: decode_batch_output
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def decode_batch_output(decoded_tokens, vocab: Vocab, oov_dict: OOVDict) -> List[List[str]]:
    """Turn a batch of word indices into lists of word strings.

    :param decoded_tokens: either a list of per-document index lists, or an object
                           offering ``transpose(0, 1).tolist()`` that yields such a list
    :param vocab: indexable vocabulary; indices below ``len(vocab)`` are looked up in it
    :param oov_dict: its ``index2word`` maps ``(doc position in batch, index)`` to a word
                     for indices at or beyond ``len(vocab)``; misses become ``'<UNK>'``
    :return: one list of words per document, cut off right after the first ``vocab.EOS``
             (the EOS word itself is kept)
    """
    if not isinstance(decoded_tokens, list):
        decoded_tokens = decoded_tokens.transpose(0, 1).tolist()

    def lookup(doc_pos, token_idx):
        # In-vocabulary indices come from `vocab`; everything else from the OOV table.
        if token_idx < len(vocab):
            return vocab[token_idx]
        return oov_dict.index2word.get((doc_pos, token_idx), '<UNK>')

    documents = []
    for doc_pos, token_ids in enumerate(decoded_tokens):
        words = []
        for token_idx in token_ids:
            words.append(lookup(doc_pos, token_idx))
            if token_idx == vocab.EOS:
                # Stop after emitting the end-of-sequence word.
                break
        documents.append(words)
    return documents
示例2: decode_batch
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def decode_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
                 show_cover_loss=False) -> Tuple[List[List[str]], Seq2SeqOutput]:
    """Test the `model` on the `batch`, return the decoded textual tokens and the Seq2SeqOutput.

    :param batch: supplies `input_tensor`, `target_tensor` (may be None), `input_lengths`,
                  `ext_vocab_size` and `oov_dict`
    :param model: called with the input/target tensors, lengths and `criterion`
    :param vocab: used to map decoded indices back to words
    :param criterion: forwarded to the model; when None the target is not passed to the model
    :param pack_seq: when False, no input lengths are passed to the model
    :param show_cover_loss: forwarded to the model as `include_cover_loss`
    :return: (decoded tokens per document, model output)

    When the batch carries a target tensor, `out.loss_value` is divided by the target's
    first dimension. Batches without a target are left untouched instead of raising
    AttributeError on `None.size(0)`.
    """
    input_lengths = batch.input_lengths if pack_seq else None
    with torch.no_grad():
        input_tensor = batch.input_tensor.to(DEVICE)
        if batch.target_tensor is None or criterion is None:
            target_tensor = None
        else:
            target_tensor = batch.target_tensor.to(DEVICE)
        out = model(input_tensor, target_tensor, input_lengths, criterion,
                    ext_vocab_size=batch.ext_vocab_size, include_cover_loss=show_cover_loss)
        decoded_batch = decode_batch_output(out.decoded_tokens, vocab, batch.oov_dict)
    if batch.target_tensor is not None:
        # Normalise the loss by the size of the target's first dimension.
        target_length = batch.target_tensor.size(0)
        out.loss_value /= target_length
    return decoded_batch, out
示例3: eval_batch_output
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def eval_batch_output(tgt_tensor_or_tokens: Union[torch.Tensor, List[List[str]]], vocab: Vocab,
                      oov_dict: OOVDict, *pred_tensors: torch.Tensor) -> List[Dict[str, float]]:
    """
    Evaluate one or more systems' output.

    :param tgt_tensor_or_tokens: the gold standard, either as indices or textual tokens
    :param vocab: the fixed-size vocab
    :param oov_dict: out-of-vocab dict
    :param pred_tensors: one or more systems' prediction (output tensors)
    :return: two-level score lookup (system index => ROUGE metric => value)
    """
    # Decode every system's prediction first, in the order given.
    system_outputs = []
    for pred_tensor in pred_tensors:
        system_outputs.append(decode_batch_output(pred_tensor, vocab, oov_dict))

    # The gold standard only needs decoding when it arrives as a tensor of indices.
    if not isinstance(tgt_tensor_or_tokens, torch.Tensor):
        references = tgt_tensor_or_tokens
    else:
        references = decode_batch_output(tgt_tensor_or_tokens, vocab, oov_dict)

    return rouge(references, *system_outputs)
示例4: load_data
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def load_data(self, debug=False):
    """Build the vocabulary from the PTB 'train' split and encode train/valid/test.

    Each split returned by `get_ptb_dataset` is encoded word by word with
    `self.vocab.encode` into an int32 array stored on `self.encoded_train`,
    `self.encoded_valid` and `self.encoded_test`.

    :param debug: when true, every encoded split is truncated to its first 1024 entries
    """
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))

    def encode_split(split_name):
        # Encode one dataset split into a flat int32 array of word ids.
        word_ids = [self.vocab.encode(token) for token in get_ptb_dataset(split_name)]
        return np.array(word_ids, dtype=np.int32)

    self.encoded_train = encode_split('train')
    self.encoded_valid = encode_split('valid')
    self.encoded_test = encode_split('test')

    if not debug:
        return

    debug_limit = 1024
    self.encoded_train = self.encoded_train[:debug_limit]
    self.encoded_valid = self.encoded_valid[:debug_limit]
    self.encoded_test = self.encoded_test[:debug_limit]
示例5: eval_batch
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def eval_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
               show_cover_loss=False) -> Tuple[float, float]:
    """Test the `model` on the `batch`; return ``(loss value, ROUGE 'l_f' score)``.

    The batch is decoded with `decode_batch`; the gold summaries are the `tgt`
    attribute of every example in `batch[0]`. The score returned is the `'l_f'`
    entry of the first result produced by `rouge`.
    """
    decoded_batch, out = decode_batch(
        batch, model, vocab,
        criterion=criterion, pack_seq=pack_seq, show_cover_loss=show_cover_loss,
    )
    references = []
    for example in batch[0]:
        references.append(example.tgt)
    rouge_scores = rouge(references, decoded_batch)
    loss_value = out.loss_value
    rouge_l_f = rouge_scores[0]['l_f']
    return loss_value, rouge_l_f
示例6: eval_bs
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def eval_bs(test_set: Dataset, vocab: Vocab, model: Seq2Seq, params: Params):
    """Evaluate `model` on `test_set` one sample at a time via `eval_bs_batch`.

    :param test_set: provides `generator(...)` for single-example batches and `pairs`
    :param vocab: passed to the generator and to `eval_bs_batch`
    :param model: put into eval mode before evaluation
    :param params: reads `pointer`, `test_sample_ratio`, `test_save_results`,
                   `model_path_prefix`, `pack_seq`, `beam_size`, `min_out_len`,
                   `max_out_len` and `out_len_in_words`

    Running averages of the `'1_f'`, `'2_f'`, `'l_f'` and `'su4_f'` scores (times 100)
    are shown on the progress bar. When `params.test_save_results` and
    `params.model_path_prefix` are both set, each sample's details are stored as
    `%06d.txt` inside `<model_path_prefix>.results.tgz`. The archive is always closed
    on exit, including on error, so the gzip stream is finalised.
    """
    test_gen = test_set.generator(1, vocab, None, bool(params.pointer))
    n_samples = int(params.test_sample_ratio * len(test_set.pairs))

    if params.test_save_results and params.model_path_prefix:
        result_file = tarfile.open(params.model_path_prefix + ".results.tgz", 'w:gz')
    else:
        result_file = None

    try:
        model.eval()
        r1, r2, rl, rsu4 = 0, 0, 0, 0
        prog_bar = tqdm(range(1, n_samples + 1))
        for i in prog_bar:
            batch = next(test_gen)
            scores, file_content = eval_bs_batch(batch, model, vocab, pack_seq=params.pack_seq,
                                                 beam_size=params.beam_size,
                                                 min_out_len=params.min_out_len,
                                                 max_out_len=params.max_out_len,
                                                 len_in_words=params.out_len_in_words,
                                                 details=result_file is not None)
            if file_content:
                # Details are only requested when result_file is set (see `details=` above).
                file_content = file_content.encode('utf-8')
                file_info = tarfile.TarInfo(name='%06d.txt' % i)
                file_info.size = len(file_content)
                result_file.addfile(file_info, fileobj=BytesIO(file_content))
            if scores:
                r1 += scores[0]['1_f']
                r2 += scores[0]['2_f']
                rl += scores[0]['l_f']
                rsu4 += scores[0]['su4_f']
                # NOTE(review): nesting reconstructed from de-indented source; the progress
                # update is assumed to belong to the `if scores` branch - confirm.
                prog_bar.set_postfix(R1='%.4g' % (r1 / i * 100), R2='%.4g' % (r2 / i * 100),
                                     RL='%.4g' % (rl / i * 100), RSU4='%.4g' % (rsu4 / i * 100))
    finally:
        if result_file is not None:
            # Closing writes the end-of-archive blocks and flushes the gzip stream.
            result_file.close()
示例7: load_data
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def load_data(self):
    """Load the train/dev/test data and build the vocabulary from the training set.

    The three splits come from `tr.simplified_data(700, 100, 200)` (the arguments are
    presumably the split sizes - confirm against `tr`). The vocabulary is constructed
    from the flat list of words of every training item.
    """
    splits = tr.simplified_data(700, 100, 200)
    self.train_data, self.dev_data, self.test_data = splits

    # Only the training split contributes words to the vocabulary.
    self.vocab = Vocab()
    training_words = []
    for sentence_words in [item.get_words() for item in self.train_data]:
        training_words.extend(sentence_words)
    self.vocab.construct(training_words)
示例8: load_data
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def load_data(self):
    """Fetch train/dev/test data and construct the vocabulary from the training words.

    Data is obtained from `tr.simplified_data(700, 100, 200)` (argument meaning is
    not visible here; presumably per-split sizes - confirm). Words of all training
    items are concatenated in order and handed to `Vocab.construct`.
    """
    train_split, dev_split, test_split = tr.simplified_data(700, 100, 200)
    self.train_data = train_split
    self.dev_data = dev_split
    self.test_data = test_split

    # Vocabulary is built from training data only.
    self.vocab = Vocab()
    per_item_words = [item.get_words() for item in self.train_data]
    flat_words = [word for words in per_item_words for word in words]
    self.vocab.construct(flat_words)
示例9: __init__
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def __init__(self, vocab: Vocab, params: Params, max_dec_steps=None):
    """
    Create the seq2seq model; its encoder and decoder will be created automatically.

    :param vocab: mainly for info about special tokens and vocab size
    :param params: model hyper-parameters
    :param max_dec_steps: max num of decoding steps (only effective at test time, as during
                          training the num of steps is determined by the `target_tensor`); it is
                          safe to change `self.max_dec_steps` as the network architecture is
                          independent of src/tgt seq lengths
    """
    super(Seq2Seq, self).__init__()
    self.vocab = vocab
    self.vocab_size = len(vocab)

    # Pre-trained embeddings, when present, dictate the embedding size.
    if vocab.embeddings is None:
        self.embed_size = params.embed_size
        embedding_weights = None
    else:
        self.embed_size = vocab.embeddings.shape[1]
        size_conflict = (params.embed_size is not None
                         and self.embed_size != params.embed_size)
        if size_conflict:
            print("Warning: Model embedding size %d is overriden by pre-trained embedding size %d."
                  % (params.embed_size, self.embed_size))
        embedding_weights = torch.from_numpy(vocab.embeddings)

    if max_dec_steps is None:
        self.max_dec_steps = params.max_tgt_len + 1
    else:
        self.max_dec_steps = max_dec_steps

    # Mirror the relevant hyper-parameters on the model.
    self.enc_attn = params.enc_attn
    self.enc_attn_cover = params.enc_attn_cover
    self.dec_attn = params.dec_attn
    self.pointer = params.pointer
    self.cover_loss = params.cover_loss
    self.cover_func = params.cover_func

    # A bidirectional encoder exposes twice the hidden size.
    if params.enc_bidi:
        enc_total_size = params.hidden_size * 2
    else:
        enc_total_size = params.hidden_size

    # Without an explicit decoder size, the decoder matches the encoder and no adapter is needed.
    if not params.dec_hidden_size:
        dec_hidden_size = enc_total_size
        self.enc_dec_adapter = None
    else:
        dec_hidden_size = params.dec_hidden_size
        self.enc_dec_adapter = nn.Linear(enc_total_size, dec_hidden_size)

    # Sub-modules are created in the same order as before: embedding, encoder, decoder.
    self.embedding = nn.Embedding(self.vocab_size, self.embed_size, padding_idx=vocab.PAD,
                                  _weight=embedding_weights)
    self.encoder = EncoderRNN(self.embed_size, params.hidden_size, params.enc_bidi,
                              rnn_drop=params.enc_rnn_dropout)
    tied_embedding = self.embedding if params.tie_embed else None
    self.decoder = DecoderRNN(self.vocab_size, self.embed_size, dec_hidden_size,
                              enc_attn=params.enc_attn, dec_attn=params.dec_attn,
                              pointer=params.pointer, out_embed_size=params.out_embed_size,
                              tied_embedding=tied_embedding,
                              in_drop=params.dec_in_dropout, rnn_drop=params.dec_rnn_dropout,
                              out_drop=params.dec_out_dropout, enc_hidden_size=enc_total_size)
示例10: test
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def test():
    """Run the checkpointed model over the test set and write reference/hypothesis files.

    Reads everything from the module-level `args`: the embedding file, the word2id JSON,
    the test file (one JSON example per line), the checkpoint (`args.load_dir`), batch
    size, `topk`, and the output directories `args.ref` and `args.hyp`.

    For each document, the `topk` highest-scoring sentences (at most `doc_len`) are
    selected and written in document order to `<args.hyp>/<id>.txt`; the reference summary
    goes to `<args.ref>/<id>.txt`. File ids start at 1 and run across all batches.
    Finally the throughput (documents per second of model time) is printed; when no model
    time was measured (e.g. empty test set) a placeholder is printed instead of raising
    ZeroDivisionError.
    """
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)

    with open(args.test_dir) as f:
        examples = [json.loads(line) for line in f]
    test_dataset = utils.Dataset(examples)
    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)

    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    if use_gpu:
        checkpoint = torch.load(args.load_dir)
    else:
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
        # checkpoint['args'].device holds the device used at train time;
        # when testing on a CPU it must be overridden to None
        checkpoint['args'].device = None
    net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()

    doc_num = len(test_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(test_iter):
        features, _, summaries, doc_lens = vocab.make_features(batch)
        # Only the forward pass is timed.
        t1 = time()
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        t2 = time()
        time_cost += t2 - t1

        # `probs` is sliced per document using running offsets over `doc_lens`.
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]
            topk = min(args.topk, doc_len)
            topk_indices = prob.topk(topk)[1].cpu().data.numpy()
            # Sort the selected indices so sentences keep their document order.
            topk_indices.sort()
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            ref = summaries[doc_id]
            with open(os.path.join(args.ref, str(file_id) + '.txt'), 'w') as f:
                f.write(ref)
            with open(os.path.join(args.hyp, str(file_id) + '.txt'), 'w') as f:
                f.write('\n'.join(hyp))
            start = stop
            file_id += 1

    if time_cost > 0:
        print('Speed: %.2f docs / s' % (doc_num / time_cost))
    else:
        # Empty dataset or a timer too coarse to register the forward passes.
        print('Speed: n/a (no inference time was measured)')
示例11: predict
# 需要导入模块: import utils [as 别名]
# 或者: from utils import Vocab [as 别名]
def predict(examples):
    """Run the checkpointed model over `examples` and write one hypothesis file per document.

    :param examples: items wrapped in `utils.Dataset`; each batch element is later split
                     on ``'. '``, so elements are expected to be strings

    Configuration comes from the module-level `args` (embedding file, word2id JSON,
    checkpoint path `args.load_dir`, batch size, `topk`, output directory `args.hyp`).
    For each document, the `topk` highest-scoring sentences (at most `doc_len`) are joined
    with ``'. '`` in document order and written to `<args.hyp>/<id>.txt`, ids starting at 1.
    Finally the throughput (documents per second of model time) is printed; when no model
    time was measured (e.g. no examples) a placeholder is printed instead of raising
    ZeroDivisionError.
    """
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)

    pred_dataset = utils.Dataset(examples)
    pred_iter = DataLoader(dataset=pred_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)

    # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    if use_gpu:
        checkpoint = torch.load(args.load_dir)
    else:
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)
        # checkpoint['args'].device holds the device used at train time;
        # when predicting on a CPU it must be overridden to None
        checkpoint['args'].device = None
    net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()

    doc_num = len(pred_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(pred_iter):
        features, doc_lens = vocab.make_predict_features(batch)
        # Only the forward pass is timed.
        t1 = time()
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        t2 = time()
        time_cost += t2 - t1

        # `probs` is sliced per document using running offsets over `doc_lens`.
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]
            topk = min(args.topk, doc_len)
            topk_indices = prob.topk(topk)[1].cpu().data.numpy()
            # Sort the selected indices so sentences keep their document order.
            topk_indices.sort()
            doc = batch[doc_id].split('. ')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            with open(os.path.join(args.hyp, str(file_id) + '.txt'), 'w') as f:
                f.write('. '.join(hyp))
            start = stop
            file_id += 1

    if time_cost > 0:
        print('Speed: %.2f docs / s' % (doc_num / time_cost))
    else:
        # No examples, or a timer too coarse to register the forward passes.
        print('Speed: n/a (no inference time was measured)')