

Python utils.Vocab Method Code Examples

This article collects typical usage examples of utils.Vocab in Python. If you are unsure what utils.Vocab does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore the other members of the utils module.


The following shows 11 code examples of utils.Vocab, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
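
Before diving into the examples, here is a minimal sketch of the kind of Vocab class these snippets assume. Real implementations vary by project (for instance, the SummaRuNNer Vocab in Examples 10 and 11 is constructed from an embedding tensor and a word2id dict), so the special-token layout and the attribute names word2index, index2word, and embeddings below are assumptions inferred from how the examples use them:

class Vocab:
  """Minimal vocabulary sketch: maps words to integer indices and back."""
  PAD, SOS, EOS, UNK = 0, 1, 2, 3  # assumed layout of the special tokens

  def __init__(self):
    self.word2index = {'<PAD>': self.PAD, '<SOS>': self.SOS,
                       '<EOS>': self.EOS, '<UNK>': self.UNK}
    self.index2word = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']
    self.embeddings = None  # optionally a numpy array of pre-trained vectors

  def construct(self, words):
    """Add every token of an iterable corpus to the vocabulary."""
    for word in words:
      if word not in self.word2index:
        self.word2index[word] = len(self.index2word)
        self.index2word.append(word)

  def encode(self, word):
    """Map a word to its index, falling back to UNK."""
    return self.word2index.get(word, self.UNK)

  def __getitem__(self, index):
    return self.index2word[index]

  def __len__(self):
    return len(self.index2word)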

Example 1: decode_batch_output

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def decode_batch_output(decoded_tokens, vocab: Vocab, oov_dict: OOVDict) -> List[List[str]]:
  """Convert word indices to strings."""
  decoded_batch = []
  if not isinstance(decoded_tokens, list):
    decoded_tokens = decoded_tokens.transpose(0, 1).tolist()
  for i, doc in enumerate(decoded_tokens):
    decoded_doc = []
    for word_idx in doc:
      if word_idx >= len(vocab):
        word = oov_dict.index2word.get((i, word_idx), '<UNK>')
      else:
        word = vocab[word_idx]
      decoded_doc.append(word)
      if word_idx == vocab.EOS:
        break
    decoded_batch.append(decoded_doc)
  return decoded_batch 
Author: ymfa | Project: seq2seq-summarizer | Lines: 19 | Source: test.py
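
As a quick illustration of Example 1, here is a hypothetical round trip through decode_batch_output, combining the Vocab sketch from the introduction with a stub standing in for the project's OOVDict (only the index2word mapping that the function reads is stubbed; the real class does more):

class OOVDictStub:
  """Stub exposing only the (batch_idx, word_idx) -> word mapping."""
  def __init__(self):
    self.index2word = {}

vocab = Vocab()
vocab.construct(['the', 'cat', 'sat'])
oov = OOVDictStub()
oov.index2word[(0, len(vocab))] = 'rooftop'  # an OOV word copied by the pointer

# one decoded document: "the cat <oov> <EOS>"; decoding stops at EOS
tokens = [[vocab.encode('the'), vocab.encode('cat'), len(vocab), vocab.EOS]]
print(decode_batch_output(tokens, vocab, oov))
# -> [['the', 'cat', 'rooftop', '<EOS>']]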

Example 2: decode_batch

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def decode_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
                 show_cover_loss=False) -> Tuple[List[List[str]], Seq2SeqOutput]:
  """Test the `model` on the `batch`, return the decoded textual tokens and the Seq2SeqOutput."""
  if not pack_seq:
    input_lengths = None
  else:
    input_lengths = batch.input_lengths
  with torch.no_grad():
    input_tensor = batch.input_tensor.to(DEVICE)
    if batch.target_tensor is None or criterion is None:
      target_tensor = None
    else:
      target_tensor = batch.target_tensor.to(DEVICE)
    out = model(input_tensor, target_tensor, input_lengths, criterion,
                ext_vocab_size=batch.ext_vocab_size, include_cover_loss=show_cover_loss)
    decoded_batch = decode_batch_output(out.decoded_tokens, vocab, batch.oov_dict)
  if criterion is not None and batch.target_tensor is not None:
    # normalize the loss by the target length; guard because both may be None,
    # in which case the unnormalized loss is returned as-is
    target_length = batch.target_tensor.size(0)
    out.loss_value /= target_length
  return decoded_batch, out
Author: ymfa | Project: seq2seq-summarizer | Lines: 21 | Source: test.py

Example 3: eval_batch_output

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def eval_batch_output(tgt_tensor_or_tokens: Union[torch.Tensor, List[List[str]]], vocab: Vocab,
                      oov_dict: OOVDict, *pred_tensors: torch.Tensor) -> List[Dict[str, float]]:
  """
  :param tgt_tensor_or_tokens: the gold standard, either as indices or textual tokens
  :param vocab: the fixed-size vocab
  :param oov_dict: out-of-vocab dict
  :param pred_tensors: one or more systems' prediction (output tensors)
  :return: two-level score lookup (system index => ROUGE metric => value)

  Evaluate one or more systems' output.
  """
  decoded_batch = [decode_batch_output(pred_tensor, vocab, oov_dict)
                   for pred_tensor in pred_tensors]
  if isinstance(tgt_tensor_or_tokens, torch.Tensor):
    gold_summaries = decode_batch_output(tgt_tensor_or_tokens, vocab, oov_dict)
  else:
    gold_summaries = tgt_tensor_or_tokens
  scores = rouge(gold_summaries, *decoded_batch)
  return scores 
Author: ymfa | Project: seq2seq-summarizer | Lines: 21 | Source: test.py

Example 4: load_data

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def load_data(self, debug=False):
    """Loads starter word-vectors and train/dev/test data."""
    self.vocab = Vocab()
    self.vocab.construct(get_ptb_dataset('train'))
    self.encoded_train = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('train')],
        dtype=np.int32)
    self.encoded_valid = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('valid')],
        dtype=np.int32)
    self.encoded_test = np.array(
        [self.vocab.encode(word) for word in get_ptb_dataset('test')],
        dtype=np.int32)
    if debug:
      num_debug = 1024
      self.encoded_train = self.encoded_train[:num_debug]
      self.encoded_valid = self.encoded_valid[:num_debug]
      self.encoded_test = self.encoded_test[:num_debug] 
Author: bogatyy | Project: cs224d | Lines: 20 | Source: q3_RNNLM.py
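
Example 4 builds the vocabulary from the training split only and then encodes every split with it. This pattern can be tried standalone; get_ptb_dataset is specific to the cs224d starter code, so a toy corpus and the Vocab sketch from the introduction stand in here:

import numpy as np

toy_train = ['the', 'cat', 'sat', 'the', 'cat']
vocab = Vocab()
vocab.construct(toy_train)
encoded = np.array([vocab.encode(w) for w in toy_train], dtype=np.int32)
print(encoded)                      # e.g. [4 5 6 4 5]
print([vocab[i] for i in encoded])  # ['the', 'cat', 'sat', 'the', 'cat']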

Example 5: eval_batch

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def eval_batch(batch: Batch, model: Seq2Seq, vocab: Vocab, criterion=None, *, pack_seq=True,
               show_cover_loss=False) -> Tuple[float, float]:
  """Test the `model` on the `batch`, return the ROUGE score and the loss."""
  decoded_batch, out = decode_batch(batch, model, vocab, criterion=criterion, pack_seq=pack_seq,
                                    show_cover_loss=show_cover_loss)
  examples = batch[0]  # first element of the batch: the original Example objects (with .tgt)
  gold_summaries = [ex.tgt for ex in examples]
  scores = rouge(gold_summaries, decoded_batch)
  return out.loss_value, scores[0]['l_f'] 
Author: ymfa | Project: seq2seq-summarizer | Lines: 11 | Source: test.py

Example 6: eval_bs

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def eval_bs(test_set: Dataset, vocab: Vocab, model: Seq2Seq, params: Params):
  test_gen = test_set.generator(1, vocab, None, bool(params.pointer))
  n_samples = int(params.test_sample_ratio * len(test_set.pairs))

  if params.test_save_results and params.model_path_prefix:
    result_file = tarfile.open(params.model_path_prefix + ".results.tgz", 'w:gz')
  else:
    result_file = None

  model.eval()
  r1, r2, rl, rsu4 = 0, 0, 0, 0
  prog_bar = tqdm(range(1, n_samples + 1))
  for i in prog_bar:
    batch = next(test_gen)
    scores, file_content = eval_bs_batch(batch, model, vocab, pack_seq=params.pack_seq,
                                         beam_size=params.beam_size,
                                         min_out_len=params.min_out_len,
                                         max_out_len=params.max_out_len,
                                         len_in_words=params.out_len_in_words,
                                         details=result_file is not None)
    if file_content:
      file_content = file_content.encode('utf-8')
      file_info = tarfile.TarInfo(name='%06d.txt' % i)
      file_info.size = len(file_content)
      result_file.addfile(file_info, fileobj=BytesIO(file_content))
    if scores:
      r1 += scores[0]['1_f']
      r2 += scores[0]['2_f']
      rl += scores[0]['l_f']
      rsu4 += scores[0]['su4_f']
      prog_bar.set_postfix(R1='%.4g' % (r1 / i * 100), R2='%.4g' % (r2 / i * 100),
                           RL='%.4g' % (rl / i * 100), RSU4='%.4g' % (rsu4 / i * 100)) 
Author: ymfa | Project: seq2seq-summarizer | Lines: 34 | Source: test.py

Example 7: load_data

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def load_data(self):
        """Loads train/dev/test data and builds vocabulary."""
        self.train_data, self.dev_data, self.test_data = tr.simplified_data(700, 100, 200)

        # build vocab from training data
        self.vocab = Vocab()
        train_sents = [t.get_words() for t in self.train_data]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents))) 
Author: vijayvee | Project: Recursive-neural-networks-TensorFlow | Lines: 10 | Source: rnn.py

Example 8: load_data

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def load_data(self):
    """Loads train/dev/test data and builds vocabulary."""
    self.train_data, self.dev_data, self.test_data = tr.simplified_data(
        700, 100, 200)

    # build vocab from training data
    self.vocab = Vocab()
    train_sents = [t.get_words() for t in self.train_data]
    self.vocab.construct(list(itertools.chain.from_iterable(train_sents))) 
Author: bogatyy | Project: cs224d | Lines: 11 | Source: rnn_dynamic_graph.py
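
The vocabulary-building idiom of Examples 7 and 8 (flatten a list of tokenized sentences, then construct the vocab from the flat token stream) works with any nested token lists. A tiny version using the Vocab sketch from the introduction:

import itertools

train_sents = [['the', 'cat'], ['sat', 'down']]
vocab = Vocab()
vocab.construct(list(itertools.chain.from_iterable(train_sents)))
print(len(vocab))  # 4 special tokens + 4 words = 8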

Example 9: __init__

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def __init__(self, vocab: Vocab, params: Params, max_dec_steps=None):
    """
    :param vocab: mainly for info about special tokens and vocab size
    :param params: model hyper-parameters
    :param max_dec_steps: max num of decoding steps (only effective at test time, as during
                          training the num of steps is determined by the `target_tensor`); it is
                          safe to change `self.max_dec_steps` as the network architecture is
                          independent of src/tgt seq lengths

    Create the seq2seq model; its encoder and decoder will be created automatically.
    """
    super(Seq2Seq, self).__init__()
    self.vocab = vocab
    self.vocab_size = len(vocab)
    if vocab.embeddings is not None:
      self.embed_size = vocab.embeddings.shape[1]
      if params.embed_size is not None and self.embed_size != params.embed_size:
        print("Warning: Model embedding size %d is overriden by pre-trained embedding size %d."
              % (params.embed_size, self.embed_size))
      embedding_weights = torch.from_numpy(vocab.embeddings)
    else:
      self.embed_size = params.embed_size
      embedding_weights = None
    self.max_dec_steps = params.max_tgt_len + 1 if max_dec_steps is None else max_dec_steps
    self.enc_attn = params.enc_attn
    self.enc_attn_cover = params.enc_attn_cover
    self.dec_attn = params.dec_attn
    self.pointer = params.pointer
    self.cover_loss = params.cover_loss
    self.cover_func = params.cover_func
    enc_total_size = params.hidden_size * 2 if params.enc_bidi else params.hidden_size
    if params.dec_hidden_size:
      dec_hidden_size = params.dec_hidden_size
      self.enc_dec_adapter = nn.Linear(enc_total_size, dec_hidden_size)
    else:
      dec_hidden_size = enc_total_size
      self.enc_dec_adapter = None

    self.embedding = nn.Embedding(self.vocab_size, self.embed_size, padding_idx=vocab.PAD,
                                  _weight=embedding_weights)
    self.encoder = EncoderRNN(self.embed_size, params.hidden_size, params.enc_bidi,
                              rnn_drop=params.enc_rnn_dropout)
    self.decoder = DecoderRNN(self.vocab_size, self.embed_size, dec_hidden_size,
                              enc_attn=params.enc_attn, dec_attn=params.dec_attn,
                              pointer=params.pointer, out_embed_size=params.out_embed_size,
                              tied_embedding=self.embedding if params.tie_embed else None,
                              in_drop=params.dec_in_dropout, rnn_drop=params.dec_rnn_dropout,
                              out_drop=params.dec_out_dropout, enc_hidden_size=enc_total_size) 
Author: ymfa | Project: seq2seq-summarizer | Lines: 50 | Source: model.py
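
The embedding handling in Example 9 (pre-trained vectors, when present, override the configured embedding size) can be isolated into a few lines of plain PyTorch. The array shape and the configured size below are made up for illustration:

import numpy as np
import torch
import torch.nn as nn

pretrained = np.random.rand(100, 50).astype(np.float32)  # toy: 100 words, 50 dims
configured_embed_size = 64                               # what params.embed_size might say

embed_size = pretrained.shape[1]  # the pre-trained size wins, as in Example 9
if embed_size != configured_embed_size:
  print('Warning: embed size %d overridden by pre-trained size %d'
        % (configured_embed_size, embed_size))
embedding = nn.Embedding(pretrained.shape[0], embed_size, padding_idx=0,
                         _weight=torch.from_numpy(pretrained))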

Example 10: test

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def test():
     
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)

    with open(args.test_dir) as f:
        examples = [json.loads(line) for line in f]
    test_dataset = utils.Dataset(examples)

    test_iter = DataLoader(dataset=test_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    if use_gpu:
        checkpoint = torch.load(args.load_dir)
    else:
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

    # checkpoint['args'].device stores the device used at training time;
    # if we are testing on a CPU, we must override device to None
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()
    
    doc_num = len(test_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(test_iter):
        features,_,summaries,doc_lens = vocab.make_features(batch)
        t1 = time()
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        t2 = time()
        time_cost += t2 - t1
        start = 0
        for doc_id,doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]
            topk = min(args.topk,doc_len)
            topk_indices = prob.topk(topk)[1].cpu().data.numpy()
            topk_indices.sort()
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            ref = summaries[doc_id]
            with open(os.path.join(args.ref,str(file_id)+'.txt'), 'w') as f:
                f.write(ref)
            with open(os.path.join(args.hyp,str(file_id)+'.txt'), 'w') as f:
                f.write('\n'.join(hyp))
            start = stop
            file_id = file_id + 1
    print('Speed: %.2f docs / s' % (doc_num / time_cost)) 
Author: hpzhao | Project: SummaRuNNer | Lines: 60 | Source: main.py
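
The selection loop shared by Examples 10 and 11 reduces to a small, self-contained pattern: take the top-k sentence scores per document, restore document order, and emit the chosen sentences. A toy version:

import torch

doc = ['First sentence.', 'Second sentence.', 'Third sentence.', 'Fourth sentence.']
probs = torch.tensor([0.1, 0.9, 0.4, 0.8])  # one relevance score per sentence

topk = min(2, len(doc))
topk_indices = probs.topk(topk)[1].numpy()  # indices of the k highest scores
topk_indices.sort()                         # restore original sentence order
print(' '.join(doc[i] for i in topk_indices))
# -> Second sentence. Fourth sentence.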

Example 11: predict

# Required import: import utils [as alias]
# Or: from utils import Vocab [as alias]
def predict(examples):
    embed = torch.Tensor(np.load(args.embedding)['embedding'])
    with open(args.word2id) as f:
        word2id = json.load(f)
    vocab = utils.Vocab(embed, word2id)
    pred_dataset = utils.Dataset(examples)

    pred_iter = DataLoader(dataset=pred_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    if use_gpu:
        checkpoint = torch.load(args.load_dir)
    else:
        checkpoint = torch.load(args.load_dir, map_location=lambda storage, loc: storage)

    # checkpoint['args'].device stores the device used at training time;
    # if we are testing on a CPU, we must override device to None
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models,checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()
    
    doc_num = len(pred_dataset)
    time_cost = 0
    file_id = 1
    for batch in tqdm(pred_iter):
        features, doc_lens = vocab.make_predict_features(batch)
        t1 = time()
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        t2 = time()
        time_cost += t2 - t1
        start = 0
        for doc_id,doc_len in enumerate(doc_lens):
            stop = start + doc_len
            prob = probs[start:stop]
            topk = min(args.topk,doc_len)
            topk_indices = prob.topk(topk)[1].cpu().data.numpy()
            topk_indices.sort()
            doc = batch[doc_id].split('. ')[:doc_len]
            hyp = [doc[index] for index in topk_indices]
            with open(os.path.join(args.hyp,str(file_id)+'.txt'), 'w') as f:
                f.write('. '.join(hyp))
            start = stop
            file_id = file_id + 1
    print('Speed: %.2f docs / s' % (doc_num / time_cost)) 
Author: hpzhao | Project: SummaRuNNer | Lines: 53 | Source: main.py


Note: The utils.Vocab examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Please consult the corresponding project's License before redistributing or using the code, and do not reproduce this article without permission.