本文整理汇总了Python中subword_nmt.apply_bpe.BPE类的典型用法代码示例。如果您正苦于以下问题：Python apply_bpe.BPE类的具体用法？Python apply_bpe.BPE怎么用？Python apply_bpe.BPE使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块subword_nmt.apply_bpe的用法示例。
在下文中一共展示了apply_bpe.BPE类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, args):
    """Build a subword-nmt BPE encoder from parsed command-line options.

    :param args:
        namespace providing ``bpe_codes`` (location of the BPE codes file)
        and ``bpe_separator`` (the sub-word continuation marker).
    :raises ValueError:
        if ``args.bpe_codes`` is ``None``.
    :raises ImportError:
        if the ``subword_nmt`` package cannot be imported.
    """
    if args.bpe_codes is None:
        raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
    codes = file_utils.cached_path(args.bpe_codes)

    # Only the import itself is guarded, so an ImportError raised further
    # down is not mislabelled as "subword_nmt is not installed".
    try:
        from subword_nmt import apply_bpe
    except ImportError as err:
        raise ImportError(
            'Please install subword_nmt with: pip install subword-nmt'
        ) from err

    # Reuse subword-nmt's own argument parser so that every option we do
    # not set explicitly (merges, glossaries, ...) gets the library default.
    bpe_parser = apply_bpe.create_parser()
    bpe_args = bpe_parser.parse_args([
        '--codes', codes,
        '--separator', args.bpe_separator,
    ])
    self.bpe = apply_bpe.BPE(
        bpe_args.codes,
        bpe_args.merges,
        bpe_args.separator,
        None,
        bpe_args.glossaries,
    )
    # Separator followed by a space, e.g. '@@ '.
    self.bpe_symbol = bpe_args.separator + ' '
示例2: finalize
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def finalize(
    self, frequencies: Dict[str, int], num_symbols: int, minfreq: int
) -> bool:
    """
    Build the codecs.

    Default helpers are pre-trained and thus do not build their own codecs;
    this implementation ignores its arguments and always returns ``False``.

    :param frequencies:
        dictionary of (token: frequency) pairs
    :param num_symbols:
        Number of BPE symbols. Recommend 30000-40000. If <= 0, default
        30000 will be used.
    :param minfreq:
        Minimum frequency of a token before forced BPE decomposition. If <=
        0 will use subword-nmt default of 2.

    :return did_finalize:
        return whether codecs are finalized this call.
    """
    return False
示例3: __init__
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, opt: Opt, shared: TShared = None):
    """
    Initialize the BPE module.

    :param opt:
        options; ``opt['dict_file']`` must be set and is used as the prefix
        of the codecs file path.
    :param shared:
        shared dictionary

    :raises RuntimeError:
        if ``SUBWORD_BPE_INSTALLED`` is false, or if ``dict_file`` is missing
        or empty in ``opt``.
    """
    super().__init__(opt, shared)
    if not SUBWORD_BPE_INSTALLED:
        raise RuntimeError(
            "Please run \"pip install 'git+https://github.com/rsennrich"
            "/subword-nmt.git#egg=subword-nmt'\""
        )
    if not opt.get('dict_file'):
        raise RuntimeError('--dict-file is mandatory.')

    # Matches runs of word characters, or any single character that is
    # neither a word character nor whitespace.
    self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

    self.codecs = f"{opt['dict_file']}.codecs"
    # Only load codecs when the file is already on disk.
    if os.path.exists(self.codecs):
        self._load_from_codecs()
示例4: helper_decode
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def helper_decode(
    self, tokens: List[str], token_ids: List[int], delimiter: str
) -> str:
    """
    Decode list of tokens into text string.

    Joins the tokens with ``delimiter``, removes every ``'@@ '`` continuation
    marker, drops a trailing ``'@@'`` and turns the ``'__newln__'``
    placeholder back into a newline character.

    :param tokens:
        list of tokens
    :param token_ids:
        list of token ids (not used by this implementation)
    :param delimiter:
        string delimiter for tokens

    :return text:
        decoded text
    """
    text = delimiter.join(tokens)
    text = text.replace('@@ ', '')
    # It's also possible that we get a BPE encoding on the end of the word
    if text.endswith('@@'):
        text = text[:-2]
    text = text.replace('__newln__', '\n')
    return text
示例5: sync_with_dict
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def sync_with_dict(self, dict_agent):
    """
    Sync with dictionary agent.

    Just add all of the tokens to the dict: every value of ``self.encoder``
    is passed to ``dict_agent.add_token`` and its frequency is set to 1.

    NOTE: How does this handle special tokens?

    :param dict_agent:
        A DictionaryAgent instantiation
    """
    for each_token in self.encoder.values():
        dict_agent.add_token(each_token)
        dict_agent.freq[each_token] = 1
###################
# HuggingFace BPE #
###################
示例6: __init__
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, bpe_codes: Union[str, TextIO],
             bpe_vocab: Union[str, TextIO]):
    """Create the underlying subword-nmt BPE object.

    :param bpe_codes:
        either a path to the BPE codes file or an already opened text stream.
    :param bpe_vocab:
        either a path to the BPE vocabulary file or an already opened text
        stream.

    Files opened here (i.e. arguments given as paths) are closed before
    returning, also when construction fails. Streams passed in by the
    caller are left open.
    """
    from contextlib import ExitStack

    with ExitStack() as stack:
        codes_stream = bpe_codes
        if isinstance(bpe_codes, str):
            codes_stream = stack.enter_context(
                open(bpe_codes, 'r', encoding='utf-8'))

        vocab_stream = bpe_vocab
        if isinstance(bpe_vocab, str):
            vocab_stream = stack.enter_context(
                open(bpe_vocab, 'r', encoding='utf-8'))

        self.bpe = subword_nmt_bpe(codes=BPECodesAdapter(codes_stream),
                                   vocab=read_vocabulary(vocab_stream,
                                                         threshold=None))
        # NOTE(review): the codes-format version is forced to (0, 2) here;
        # presumably the adapter hides the header line - confirm.
        self.bpe.version = (0, 2)
示例7: vec2txt
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def vec2txt(self, vector, delimiter=' '):
    """
    Convert a vector of IDs to a string.

    Converts a vector (iterable of ints) into a string, with each token
    separated by the delimiter (default ``' '``).

    When the tokenizer is ``'bpe'`` and the ``bpe_debug`` option is not set,
    the ``'@@'`` continuation markers are removed and the ``'__newln__'``
    placeholder is turned back into a newline.
    """
    text = delimiter.join(self[int(idx)] for idx in vector)
    # if we used a BPE tokenizer we need to rejoin the encodings
    if self.tokenizer == 'bpe' and not self.opt.get('bpe_debug', False):
        text = text.replace('@@ ', '')
        # It's also possible that we get a BPE encoding on the end of the word
        if text.endswith('@@'):
            text = text[:-2]
        # NOTE(review): the newline placeholder is presumably only emitted
        # by the BPE tokenizer, hence restored only in this branch - confirm.
        text = text.replace('__newln__', '\n')
    return text
示例8: __init__
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, codecs_filename):
    """
    Initialize the BPE module.

    If `codecs_filename` already exists, loads the pretrained codecs.
    If it does not, codecs will be saved there after a call to `finalize()`.

    :param codecs_filename:
        place to save/load codecs.

    :raises RuntimeError:
        if ``BPE_INSTALLED`` is false.
    """
    if not BPE_INSTALLED:
        raise RuntimeError(
            "Please run \"pip install 'git+https://github.com/rsennrich"
            "/subword-nmt.git#egg=subword-nmt'\""
        )

    # Matches runs of word characters, or any single character that is
    # neither a word character nor whitespace.
    self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

    self.codecs = codecs_filename
    if os.path.exists(self.codecs):
        self._load_from_codecs()
示例9: tokenize
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def tokenize(self, text):
    """
    Tokenize the text with bpe if codecs are already finalized.

    Otherwise, returns the regularly split tokens that will train the bpe.

    :param text:
        str. Raw text to tokenize.

    :return:
        a list of tokens. Will use BPE once finalized.
    """
    # Replace newlines with a placeholder token before splitting.
    text = text.replace('\n', ' __newln__ ')
    tokens = self.splitter.findall(text)
    # The ``bpe`` attribute only exists once codecs are available.
    if hasattr(self, 'bpe'):
        return self.bpe.segment_tokens(tokens)
    return tokens
示例10: __init__
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, codecs_filename):
    """
    Initialize the BPE module.

    If `codecs_filename` already exists, loads the pretrained codecs.
    If it does not, codecs will be saved there after a call to `finalize()`.

    :param codecs_filename:
        place to save/load codecs.

    :raises RuntimeError:
        if ``BPE_INSTALLED`` is false.
    """
    if not BPE_INSTALLED:
        raise RuntimeError(
            "Please run \"pip install 'git+https://github.com/rsennrich"
            "/subword-nmt.git#egg=subword-nmt'\""
        )

    # Matches runs of word characters, or any single character that is
    # neither a word character nor whitespace.
    self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

    self.codecs = codecs_filename
    if os.path.exists(self.codecs):
        self._load_from_codecs()
示例11: main
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def main():
    """Read titles from stdin and print sampled predictions for each one.

    For every input line, ``args.n`` hypotheses are requested from the
    generator and written to stdout as ``<title>\\t<prediction>`` lines.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--host', default='localhost', help='model server host')
    parser.add_argument('--port', type=int, default=9000, help='model server port')
    parser.add_argument('--model_name', required=True, help='model name')
    parser.add_argument('--preprocessor', help='tokenization script')
    parser.add_argument('--postprocessor', help='postprocessing script')
    parser.add_argument('--bpe_codes', required=True, help='BPE codes')
    parser.add_argument('-n', type=int, default=5, help='Number of comments to sample per title')
    args = parser.parse_args()

    generator = Generator(host=args.host,
                          port=args.port,
                          model_name=args.model_name,
                          preprocessor=args.preprocessor,
                          postprocessor=args.postprocessor,
                          bpe_codes=args.bpe_codes)

    for title in sys.stdin:
        # The raw line goes to the generator; only the echoed copy is stripped.
        clean_title = title.strip()
        hyps = generator(title, args.n)
        # Each hypothesis carries a score that is deliberately not printed.
        for prediction, _score in hyps:
            sys.stdout.write('{}\t{}\n'.format(clean_title, prediction))
示例12: learn_bpe
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def learn_bpe(self, item_list, from_filenames=True):
    """Learn BPE codes from ``item_list`` and write them to ``self.codes_file``.

    :param item_list:
        input handed to ``_get_vocabulary``; presumably file names when
        ``from_filenames`` is true - confirm against that helper.
    :param from_filenames:
        forwarded unchanged to ``_get_vocabulary``.
    """
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info('generating bpe codes file. saving to %s', self.codes_file)

    # get vocabulary at word level (before bpe)
    def segment_words(line):
        return _segment_words(line, self.pre_tokenize)

    vocab_words = _get_vocabulary(item_list,
                                  from_filenames=from_filenames,
                                  segment=segment_words)
    # One "<word> <frequency>" entry per vocabulary item (is_dict=True below).
    vocab_list = ['{0} {1}'.format(key, freq)
                  for (key, freq) in vocab_words.items()]

    # learn BPE on combined vocabulary
    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, num_symbols=self.num_symbols,
                            min_frequency=self.min_frequency, verbose=False,
                            is_dict=True, total_symbols=self.total_symbols)

    # Called after the ``with`` block so the codes file is closed first.
    self.set_bpe(self.codes_file)
示例13: __init__
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, model_path):
    """Load a subword-nmt BPE model from a codes file.

    :param model_path:
        path of the BPE codes file to load.
    """
    super().__init__()
    from subword_nmt.apply_bpe import BPE

    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which may not be UTF-8.
    with open(model_path, encoding='utf-8') as f:
        self.model = BPE(f)
示例14: add_args
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def add_args(parser):
    """Register the subword-nmt BPE command-line options on ``parser``.

    :param parser:
        object exposing an argparse-style ``add_argument`` method.
    """
    # fmt: off
    parser.add_argument('--bpe-codes', type=str,
                        help='path to subword NMT BPE')
    parser.add_argument('--bpe-separator', default='@@',
                        help='BPE separator')
    # fmt: on
示例15: should_sort
# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def should_sort(self) -> bool:
    """
    Return whether tokens should be sorted for this particular helper.

    DictionaryAgent sorts tokens upon saving; we don't generally want to sort with
    our pre-trained dictionaries, so default is False.
    """
    return False
###############
# Subword BPE #
###############