

Python learn_bpe.learn_bpe Method Code Examples

This article collects typical usage examples of the learn_bpe.learn_bpe method from the Python module subword_nmt.learn_bpe. If you have been wondering what exactly learn_bpe.learn_bpe does and how to use it, the curated code examples below should help. You can also explore further usage of the containing module, subword_nmt.learn_bpe.


The following presents 7 code examples of the learn_bpe.learn_bpe method, sorted by popularity by default.
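
Before turning to the project examples, here is a minimal sketch of calling learn_bpe directly on a plain-text corpus. The file names corpus.txt and codes.bpe are illustrative placeholders, not taken from any project below.

import codecs

from subword_nmt import learn_bpe

# Learn 10,000 BPE merge operations from a raw text corpus and write the
# resulting merge table (the "codes" file) to disk. In subword-nmt the
# signature is learn_bpe(infile, outfile, num_symbols, min_frequency=2,
# verbose=False, is_dict=False, total_symbols=False).
with codecs.open('corpus.txt', encoding='UTF-8') as infile, \
        codecs.open('codes.bpe', 'w', encoding='UTF-8') as outfile:
    learn_bpe.learn_bpe(infile, outfile, 10000)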

Example 1: learn_bpe

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# Also needed here: import codecs, logging. This snippet is a method
# excerpted from a tokenizer class; _segment_words and _get_vocabulary are
# helpers defined in the same source file (tokenizer.py).
def learn_bpe(self, item_list, from_filenames=True):
        logging.info('generating bpe codes file. saving to %s' %
                     self.codes_file)

        # get vocabulary at word level (before bpe)
        def segment_words(line): return _segment_words(line, self.pre_tokenize)
        vocab_words = _get_vocabulary(item_list,
                                      from_filenames=from_filenames,
                                      segment=segment_words)
        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in vocab_words.items()]
        # learn BPE on combined vocabulary
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.learn_bpe(vocab_list, output, num_symbols=self.num_symbols,
                                min_frequency=self.min_frequency, verbose=False,
                                is_dict=True, total_symbols=self.total_symbols)
        self.set_bpe(self.codes_file) 
Author: eladhoffer; Project: seq2seq.pytorch; Lines of code: 19; Source: tokenizer.py
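
Example 1 passes is_dict=True, as do most of the finalize examples below: instead of raw text, learn_bpe receives a pre-counted vocabulary with one "word frequency" entry per line or list item. A minimal illustration of that input format follows; the words and counts are made up.

import codecs

from subword_nmt import learn_bpe

# With is_dict=True, every input entry must have the form "<word> <count>";
# learn_bpe then treats the input as a pre-counted vocabulary rather than a
# raw corpus.
vocab_list = ['low 5', 'lower 2', 'newest 6', 'widest 3']
with codecs.open('codes.bpe', 'w', encoding='UTF-8') as output:
    learn_bpe.learn_bpe(vocab_list, output, num_symbols=50,
                        min_frequency=2, is_dict=True)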

Example 2: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# Also needed here: import os. This finalize is a method excerpted from a
# dictionary class; self.codecs and self._load_from_codecs come from that
# class.
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
        """
        Build the codecs.

        :param frequencies:
            dictionary of (token: frequency) pairs
        :param num_symbols:
            Number of BPE symbols. Recommend 30000-40000.  If <= 0, default
            30000 will be used.
        :param minfreq:
            Minimum frequency of a token before forced BPE decomposition. If <=
            0 will use subword-nmt default of 2.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2

        codec_dir, _ = os.path.split(self.codecs)
        os.makedirs(codec_dir, exist_ok=True)
        with open(self.codecs, 'w', encoding='utf-8') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True 
Author: natashamjaques; Project: neural_chat; Lines of code: 41; Source: dict.py

Example 3: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# As in Example 2, this is a method excerpted from a dictionary class.
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
        """Build the codecs.

:param frequencies: dictionary of (token: frequency) pairs
        :param num_symbols: Number of BPE symbols. Recommend 30000-40000.
            If <= 0, default 30000 will be used.
        :param minfreq: Minimum frequency of a token before forced BPE
            decomposition. If <= 0 will use subword-nmt default of 2.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2
        with open(self.codecs, 'w') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True 
Author: natashamjaques; Project: neural_chat; Lines of code: 34; Source: dict_v1.py

Example 4: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# As in Example 2, this is a method excerpted from a dictionary class.
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
        """
        Build the codecs.

        :param frequencies:
            dictionary of (token: frequency) pairs
        :param num_symbols:
            Number of BPE symbols. Recommend 30000-40000.  If <= 0, default
            30000 will be used.
        :param minfreq:
            Minimum frequency of a token before forced BPE decomposition. If <=
            0 will use subword-nmt default of 2.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2
        with open(self.codecs, 'w') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True 
Author: THUDM; Project: KBRD; Lines of code: 38; Source: dict.py

Example 5: build_vocab

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# Also needed here: import codecs, os, tempfile and
# from subword_nmt import apply_bpe
def build_vocab(imgs, params):
  # count up the number of words
  captions = []
  for img in imgs:
    for sent in img['sentences']:
      captions.append(' '.join(sent['tokens']))
  captions='\n'.join(captions)
  all_captions = tempfile.NamedTemporaryFile(delete=False)
  all_captions.close()
  with open(all_captions.name, 'w') as txt_file:
    txt_file.write(captions)

  # learn BPE codes from the temporary captions file
  codecs_output = tempfile.NamedTemporaryFile(delete=False)
  codecs_output.close()
  with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
    learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'), output, params['symbol_count'])

  with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes)

  tmp = tempfile.NamedTemporaryFile(delete=False)
  tmp.close()

  tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

  for i, img in enumerate(imgs):
    img['final_captions'] = []
    for sent in img['sentences']:
      txt = ' '.join(sent['tokens'])
      txt = bpe.segment(txt).strip()
      img['final_captions'].append(txt.split(' '))
      tmpout.write(txt)
      tmpout.write('\n')
      if i < 20:
        print(txt)

  tmpout.close()
  tmpin = codecs.open(tmp.name, encoding='UTF-8')

  vocab = learn_bpe.get_vocabulary(tmpin)
  vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)

  # Always insert UNK
  print('inserting the special UNK token')
  vocab.append('UNK')

  print('Vocab size:', len(vocab))

  os.remove(all_captions.name)
  with open(codecs_output.name, 'r') as codes:
    bpe = codes.read()
  os.remove(codecs_output.name)
  os.remove(tmp.name)

  return vocab, bpe 
Author: husthuaan; Project: AAT; Lines of code: 58; Source: build_bpe_subword_nmt.py
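
Examples 5 and 7 also demonstrate the companion step: loading the learned codes with apply_bpe.BPE and segmenting text with them. Below is a stripped-down sketch of that round trip; the file name and the sentence are illustrative, not taken from either project.

import codecs

from subword_nmt import apply_bpe

# Load the merge table written by learn_bpe and segment a sentence with it.
# apply_bpe marks subword boundaries inside a word with '@@ ' by default.
with codecs.open('codes.bpe', encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes)

print(bpe.segment('a man riding a snowboard'))
# possible output: 'a man riding a snow@@ board' (depends on the learned codes)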

Example 6: finalize

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# Also needed here: import logging, os and from typing import Dict. As in
# Example 2, this is a method excerpted from a dictionary class.
def finalize(
        self, frequencies: Dict[str, int], num_symbols: int = 30000, minfreq: int = 2
    ) -> bool:
        """
        Build the codecs.

        :param frequencies:
            dictionary of (token: frequency) pairs
        :param num_symbols:
            Number of BPE symbols. Recommend 30000-40000.  If <= 0, default
            30000 will be used.
        :param minfreq:
            Minimum frequency of a token before forced BPE decomposition. If <=
            0 will use subword-nmt default of 2.

        :return did_finalize:
            return whether codecs are finalized this call.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        logging.debug(f'Saving bpe codecs to {self.codecs}')

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2

        codec_dir, _ = os.path.split(self.codecs)
        os.makedirs(codec_dir, exist_ok=True)
        with open(self.codecs, 'w', encoding='utf-8') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True 
Author: facebookresearch; Project: ParlAI; Lines of code: 46; Source: bpe.py

Example 7: build_vocab

# Required import: from subword_nmt import learn_bpe [as alias]
# Alternatively: from subword_nmt.learn_bpe import learn_bpe [as alias]
# As in Example 5, also needed: import codecs, os, tempfile and
# from subword_nmt import apply_bpe
def build_vocab(imgs, params):
	# count up the number of words
	captions = []
	for img in imgs:
		for sent in img['sentences']:
			captions.append(' '.join(sent['tokens']))
	captions='\n'.join(captions)
	all_captions = tempfile.NamedTemporaryFile(delete=False)
	all_captions.close()
	with open(all_captions.name, 'w') as txt_file:
		txt_file.write(captions)

	# learn BPE codes from the temporary captions file
	codecs_output = tempfile.NamedTemporaryFile(delete=False)
	codecs_output.close()
	with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
		learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'), output, params['symbol_count'])

	with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
		bpe = apply_bpe.BPE(codes)

	tmp = tempfile.NamedTemporaryFile(delete=False)
	tmp.close()

	tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

	for i, img in enumerate(imgs):
		img['final_captions'] = []
		for sent in img['sentences']:
			txt = ' '.join(sent['tokens'])
			txt = bpe.segment(txt).strip()
			img['final_captions'].append(txt.split(' '))
			tmpout.write(txt)
			tmpout.write('\n')
			if i < 20:
				print(txt)

	tmpout.close()
	tmpin = codecs.open(tmp.name, encoding='UTF-8')

	vocab = learn_bpe.get_vocabulary(tmpin)
	vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)

	# Always insert UNK
	print('inserting the special UNK token')
	vocab.append('UNK')

	print('Vocab size:', len(vocab))

	os.remove(all_captions.name)
	with open(codecs_output.name, 'r') as codes:
		bpe = codes.read()
	os.remove(codecs_output.name)
	os.remove(tmp.name)

	return vocab, bpe 
Author: ruotianluo; Project: self-critical.pytorch; Lines of code: 58; Source: build_bpe_subword_nmt.py


Note: the subword_nmt.learn_bpe.learn_bpe examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and redistribution or use should follow each project's license. Do not reproduce without permission.