

Python learn_bpe.learn_bpe Method Code Examples

This article collects typical usage examples of the Python method subword_nmt.learn_bpe.learn_bpe. If you are wondering what learn_bpe.learn_bpe does or how to call it, the curated examples below may help. You can also explore further usage of the subword_nmt.learn_bpe module from which the method comes.


Seven code examples of the learn_bpe.learn_bpe method are shown below, drawn from open-source projects and ordered by popularity.
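For orientation before the examples: learn_bpe.learn_bpe(infile, outfile, num_symbols, ...) reads lines from infile, learns num_symbols BPE merge operations, and writes them to outfile. The following is a minimal sketch of that round trip using an in-memory codes buffer; the toy corpus and num_symbols value are illustrative assumptions, not taken from any project in this article.

import io

from subword_nmt import apply_bpe, learn_bpe

# Toy corpus (hypothetical): learn_bpe accepts any iterable of text lines.
corpus = [
    'the quick brown fox jumps over the lazy dog',
    'the quick brown fox sleeps',
] * 50  # repeat so pair frequencies clear the default min_frequency of 2

# Learn a handful of merge operations; learn_bpe writes a '#version'
# header followed by one merge pair per line.
codes = io.StringIO()
learn_bpe.learn_bpe(corpus, codes, num_symbols=10)

# Rewind the learned codes and segment new text with them.
codes.seek(0)
bpe = apply_bpe.BPE(codes)
print(bpe.segment('the quick brown dog'))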

Example 1: learn_bpe

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def learn_bpe(self, item_list, from_filenames=True):
    logging.info('generating bpe codes file. saving to %s' %
                 self.codes_file)

    # get vocabulary at word level (before bpe)
    def segment_words(line):
        return _segment_words(line, self.pre_tokenize)

    vocab_words = _get_vocabulary(item_list,
                                  from_filenames=from_filenames,
                                  segment=segment_words)
    vocab_list = ['{0} {1}'.format(key, freq)
                  for (key, freq) in vocab_words.items()]
    # learn BPE on combined vocabulary
    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, num_symbols=self.num_symbols,
                            min_frequency=self.min_frequency, verbose=False,
                            is_dict=True, total_symbols=self.total_symbols)
    self.set_bpe(self.codes_file)
Author: eladhoffer | Project: seq2seq.pytorch | Lines: 19 | Source: tokenizer.py
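Note the is_dict=True argument above: in that mode learn_bpe does not read raw text but expects one "word frequency" pair per input line, which is exactly what vocab_list contains. A minimal sketch of producing that format with collections.Counter follows; the corpus and the output path are hypothetical.

from collections import Counter

from subword_nmt import learn_bpe

corpus = ['the quick brown fox', 'the lazy dog'] * 50  # hypothetical pre-tokenized text
word_freq = Counter(word for line in corpus for word in line.split())

# One 'word count' entry per line, the format expected with is_dict=True.
vocab_list = ['{0} {1}'.format(word, freq) for word, freq in word_freq.items()]

with open('bpe.codes', 'w', encoding='utf-8') as output:  # hypothetical path
    learn_bpe.learn_bpe(vocab_list, output, num_symbols=10, is_dict=True)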

Example 2: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, default
        30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition. If <=
        0 will use subword-nmt default of 2.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False

    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2

    codec_dir, _ = os.path.split(self.codecs)
    os.makedirs(codec_dir, exist_ok=True)
    with open(self.codecs, 'w', encoding='utf-8') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
Author: natashamjaques | Project: neural_chat | Lines: 41 | Source: dict.py
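The _load_from_codecs helper called at the end is not part of the excerpt. A plausible sketch of what it does, assuming self.codecs holds the path just written and that the codes are loaded with subword-nmt's companion module apply_bpe; this reconstruction is an assumption, not the project's verbatim code.

import codecs

from subword_nmt import apply_bpe

def _load_from_codecs(self):
    # Re-open the codes file written by learn_bpe and build a segmenter
    # that later calls can use via self.bpe.segment(...).
    with codecs.open(self.codecs, 'r', encoding='utf-8') as codes_file:
        self.bpe = apply_bpe.BPE(codes_file)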

Example 3: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """Build the codecs.

    :param frequencies: dictionary of (token: frequency) pairs
    :param num_symbols: Number of BPE symbols. Recommend 30000-40000.
        If <= 0, default 30000 will be used.
    :param minfreq: Minimum frequency of a token before forced BPE
        decomposition. If <= 0 will use subword-nmt default of 2.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False

    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2
    with open(self.codecs, 'w') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
Author: natashamjaques | Project: neural_chat | Lines: 34 | Source: dict_v1.py

Example 4: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, default
        30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition. If <=
        0 will use subword-nmt default of 2.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False

    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2
    with open(self.codecs, 'w') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
Author: THUDM | Project: KBRD | Lines: 38 | Source: dict.py

Example 5: build_vocab

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def build_vocab(imgs, params):
  # count up the number of words
  captions = []
  for img in imgs:
    for sent in img['sentences']:
      captions.append(' '.join(sent['tokens']))
  captions='\n'.join(captions)
  all_captions = tempfile.NamedTemporaryFile(delete=False)
  all_captions.close()
  with open(all_captions.name, 'w') as txt_file:
    txt_file.write(captions)

  # learn BPE codes from the aggregated captions
  codecs_output = tempfile.NamedTemporaryFile(delete=False)
  codecs_output.close()
  with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
    learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'), output, params['symbol_count'])

  with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes)

  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp.close()

  tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

  for _, img in enumerate(imgs):
    img['final_captions'] = []
    for sent in img['sentences']:
      txt = ' '.join(sent['tokens'])
      txt = bpe.segment(txt).strip()
      img['final_captions'].append(txt.split(' '))
      tmpout.write(txt)
      tmpout.write('\n')
      if _ < 20:
        print(txt)

  tmpout.close()
  tmpin = codecs.open(tmp.name, encoding='UTF-8')

  vocab = learn_bpe.get_vocabulary(tmpin)
  vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)

  # Always insert UNK
  print('inserting the special UNK token')
  vocab.append('UNK')

  print('Vocab size:', len(vocab))

  os.remove(all_captions.name)
  with open(codecs_output.name, 'r') as codes:
    bpe = codes.read()
  os.remove(codecs_output.name)
  os.remove(tmp.name)

  return vocab, bpe 
Author: husthuaan | Project: AAT | Lines: 58 | Source: build_bpe_subword_nmt.py
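Besides learn_bpe itself, Example 5 also uses learn_bpe.get_vocabulary, which returns a counter of whitespace-separated token frequencies from a file object. A small standalone sketch follows; 'corpus.txt' is a hypothetical path.

import codecs

from subword_nmt import learn_bpe

# Count token frequencies in a plain-text corpus (hypothetical path).
with codecs.open('corpus.txt', encoding='UTF-8') as corpus_file:
    vocab = learn_bpe.get_vocabulary(corpus_file)

# Print the ten most frequent tokens.
for token, freq in sorted(vocab.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(token, freq)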

Example 6: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(
    self, frequencies: Dict[str, int], num_symbols: int = 30000, minfreq: int = 2
) -> bool:
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, default
        30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition. If <=
        0 will use subword-nmt default of 2.

    :return did_finalize:
        return whether codecs are finalized this call.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False

    logging.debug(f'Saving bpe codecs to {self.codecs}')

    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2

    codec_dir, _ = os.path.split(self.codecs)
    os.makedirs(codec_dir, exist_ok=True)
    with open(self.codecs, 'w', encoding='utf-8') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )

    self._load_from_codecs()
    return True
Author: facebookresearch | Project: ParlAI | Lines: 46 | Source: bpe.py

Example 7: build_vocab

# Required import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def build_vocab(imgs, params):
	# count up the number of words
	captions = []
	for img in imgs:
		for sent in img['sentences']:
			captions.append(' '.join(sent['tokens']))
	captions='\n'.join(captions)
	all_captions = tempfile.NamedTemporaryFile(delete=False)
	all_captions.close()
	with open(all_captions.name, 'w') as txt_file:
		txt_file.write(captions)

	# learn BPE codes from the aggregated captions
	codecs_output = tempfile.NamedTemporaryFile(delete=False)
	codecs_output.close()
	with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
		learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'), output, params['symbol_count'])

	with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
		bpe = apply_bpe.BPE(codes)

	tmp = tempfile.NamedTemporaryFile(delete=False)
	tmp.close()

	tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

	for _, img in enumerate(imgs):
		img['final_captions'] = []
		for sent in img['sentences']:
			txt = ' '.join(sent['tokens'])
			txt = bpe.segment(txt).strip()
			img['final_captions'].append(txt.split(' '))
			tmpout.write(txt)
			tmpout.write('\n')
			if _ < 20:
				print(txt)

	tmpout.close()
	tmpin = codecs.open(tmp.name, encoding='UTF-8')

	vocab = learn_bpe.get_vocabulary(tmpin)
	vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)

	# Always insert UNK
	print('inserting the special UNK token')
	vocab.append('UNK')

	print('Vocab size:', len(vocab))

	os.remove(all_captions.name)
	with open(codecs_output.name, 'r') as codes:
		bpe = codes.read()
	os.remove(codecs_output.name)
	os.remove(tmp.name)

	return vocab, bpe 
Author: ruotianluo | Project: self-critical.pytorch | Lines: 58 | Source: build_bpe_subword_nmt.py


Note: The subword_nmt.learn_bpe.learn_bpe examples in this article were compiled by 纯净天空 from open-source projects hosted on GitHub, MSDocs, and similar platforms. Copyright in each snippet remains with its original authors; consult the corresponding project's license before redistributing or reusing the code.