This article collects typical usage examples of the Python method subword_nmt.learn_bpe.learn_bpe. If you have been wondering how exactly to call learn_bpe.learn_bpe, or what real uses of it look like, the hand-picked examples below should help. You can also explore further usage examples from its containing module, subword_nmt.learn_bpe.
Below are 6 code examples of the learn_bpe.learn_bpe method, sorted by popularity by default.
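Before the collected examples, here is a minimal sketch of the basic call: learning merge operations from a plain-text corpus and applying them. The file names and merge count are illustrative, not from any of the examples below.

import codecs
from subword_nmt import learn_bpe, apply_bpe

# learn 10000 BPE merge operations from a raw text corpus
with codecs.open('corpus.txt', encoding='UTF-8') as infile, \
        codecs.open('bpe.codes', 'w', encoding='UTF-8') as outfile:
    learn_bpe.learn_bpe(infile, outfile, num_symbols=10000)

# load the learned codes and segment new text
with codecs.open('bpe.codes', encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes)
print(bpe.segment('subword segmentation'))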
Example 1: learn_bpe

# Required module import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def learn_bpe(self, item_list, from_filenames=True):
    logging.info('generating bpe codes file. saving to %s' %
                 self.codes_file)

    # get vocabulary at word level (before bpe)
    def segment_words(line):
        return _segment_words(line, self.pre_tokenize)

    vocab_words = _get_vocabulary(item_list,
                                  from_filenames=from_filenames,
                                  segment=segment_words)
    vocab_list = ['{0} {1}'.format(key, freq)
                  for (key, freq) in vocab_words.items()]
    # learn BPE on combined vocabulary
    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, num_symbols=self.num_symbols,
                            min_frequency=self.min_frequency, verbose=False,
                            is_dict=True, total_symbols=self.total_symbols)
    self.set_bpe(self.codes_file)
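Note the is_dict=True flag above: it tells learn_bpe to treat each input line as a "word frequency" pair rather than running text, which is exactly the format of the vocab_list built from the word-level counts. A tiny illustration with made-up words and counts:

import codecs
from subword_nmt import learn_bpe

# one "word count" entry per line, the format is_dict=True expects
vocab_list = ['low 5', 'lower 2', 'newest 6', 'widest 3']
with codecs.open('toy.codes', 'w', encoding='UTF-8') as output:
    learn_bpe.learn_bpe(vocab_list, output, num_symbols=50,
                        min_frequency=2, is_dict=True)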
Example 2: finalize

# Required module import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, the
        default of 30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition.
        If <= 0, the subword-nmt default of 2 will be used.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False
    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))
    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())
    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2
    codec_dir, _ = os.path.split(self.codecs)
    os.makedirs(codec_dir, exist_ok=True)
    with open(self.codecs, 'w', encoding='utf-8') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )
    self._load_from_codecs()
    return True
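A hypothetical call site for this method; the dict_agent object and its token frequencies are placeholders, not part of the original snippet:

# token -> corpus frequency, e.g. collected while building a word-level dictionary
token_freqs = {'the': 1024, 'tokenization': 7, 'unbelievable': 3}

# writes the learned merge operations to dict_agent.codecs on the first
# call; returns False (and does nothing) on subsequent calls
if dict_agent.finalize(token_freqs, num_symbols=30000, minfreq=2):
    print('BPE codecs written to', dict_agent.codecs)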
Example 3: finalize

# Required module import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """Build the codecs.

    :param frequencies: dictionary of (token: frequency) pairs
    :param num_symbols: Number of BPE symbols. Recommend 30000-40000.
        If <= 0, the default of 30000 will be used.
    :param minfreq: Minimum frequency of a token before forced BPE
        decomposition. If <= 0, the subword-nmt default of 2 will be used.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False
    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))
    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())
    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2
    with open(self.codecs, 'w') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )
    self._load_from_codecs()
    return True
Example 4: finalize

# Required module import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(self, frequencies, num_symbols=30000, minfreq=2):
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, the
        default of 30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition.
        If <= 0, the subword-nmt default of 2 will be used.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False
    print('Dictionary: saving bpe codecs to {}'.format(self.codecs))
    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())
    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2
    with open(self.codecs, 'w') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )
    self._load_from_codecs()
    return True
Example 5: build_vocab

# Required module import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def build_vocab(imgs, params):
    # count up the number of words
    captions = []
    for img in imgs:
        for sent in img['sentences']:
            captions.append(' '.join(sent['tokens']))
    captions = '\n'.join(captions)
    all_captions = tempfile.NamedTemporaryFile(delete=False)
    all_captions.close()
    with open(all_captions.name, 'w') as txt_file:
        txt_file.write(captions)

    # learn the BPE codes from the caption corpus
    codecs_output = tempfile.NamedTemporaryFile(delete=False)
    codecs_output.close()
    with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'),
                            output, params['symbol_count'])

    with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes)

    # segment every caption and collect the segmented text for vocab counting
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')
    for i, img in enumerate(imgs):
        img['final_captions'] = []
        for sent in img['sentences']:
            txt = ' '.join(sent['tokens'])
            txt = bpe.segment(txt).strip()
            img['final_captions'].append(txt.split(' '))
            tmpout.write(txt)
            tmpout.write('\n')
            if i < 20:
                print(txt)
    tmpout.close()

    tmpin = codecs.open(tmp.name, encoding='UTF-8')
    vocab = learn_bpe.get_vocabulary(tmpin)
    vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)
    # Always insert UNK
    print('inserting the special UNK token')
    vocab.append('UNK')
    print('Vocab size:', len(vocab))

    os.remove(all_captions.name)
    with open(codecs_output.name, 'r') as codes:
        bpe = codes.read()
    os.remove(codecs_output.name)
    os.remove(tmp.name)
    return vocab, bpe
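Because build_vocab returns the learned codes as a plain string, a segmenter can be reconstructed later without keeping the temporary file around. A sketch (not part of the original snippet) using io.StringIO, which works because apply_bpe.BPE only needs a readable file-like object:

import io
from subword_nmt import apply_bpe

vocab, bpe_codes = build_vocab(imgs, params)
segmenter = apply_bpe.BPE(io.StringIO(bpe_codes))
print(segmenter.segment('a man riding a wave on a surfboard'))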
Example 6: finalize

# Required module import: from subword_nmt import learn_bpe [as alias]
# Or: from subword_nmt.learn_bpe import learn_bpe [as alias]
def finalize(
    self, frequencies: Dict[str, int], num_symbols: int = 30000, minfreq: int = 2
) -> bool:
    """
    Build the codecs.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, the
        default of 30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition.
        If <= 0, the subword-nmt default of 2 will be used.
    :return did_finalize:
        whether the codecs were finalized on this call.
    """
    if hasattr(self, 'bpe'):
        # we already finalized the codecs
        return False
    logging.debug(f'Saving bpe codecs to {self.codecs}')
    dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())
    if num_symbols <= 0:
        num_symbols = 30000
    if minfreq <= 0:
        minfreq = 2
    codec_dir, _ = os.path.split(self.codecs)
    os.makedirs(codec_dir, exist_ok=True)
    with open(self.codecs, 'w', encoding='utf-8') as outstream:
        learn_bpe.learn_bpe(
            dictionary,
            outstream,
            num_symbols=num_symbols,
            min_frequency=minfreq,
            is_dict=True,
        )
    self._load_from_codecs()
    return True