当前位置: 首页>>代码示例>>Python>>正文


Python apply_bpe.BPE属性代码示例

本文整理汇总了Python中subword_nmt.apply_bpe.BPE属性的典型用法代码示例。如果您正苦于以下问题:Python apply_bpe.BPE属性的具体用法?Python apply_bpe.BPE怎么用?Python apply_bpe.BPE使用的例子?那么恭喜您, 这里精选的属性代码示例或许可以为您提供帮助。您也可以进一步了解该属性所在subword_nmt.apply_bpe的用法示例。


在下文中一共展示了apply_bpe.BPE属性的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, args):
        """
        Build a subword-nmt BPE encoder from parsed command-line options.

        :param args:
            namespace exposing ``bpe_codes`` (location of the BPE codes file,
            resolved through ``file_utils.cached_path``) and ``bpe_separator``
            (the subword continuation marker).

        :raises ValueError:
            if ``args.bpe_codes`` is ``None``.
        :raises ImportError:
            if the ``subword_nmt`` package cannot be imported; the original
            import failure is attached as the cause.
        """
        if args.bpe_codes is None:
            raise ValueError('--bpe-codes is required for --bpe=subword_nmt')
        codes = file_utils.cached_path(args.bpe_codes)

        # Guard only the import: an ImportError raised later (while building
        # the parser or the BPE object) is a different problem and must not be
        # reported as "subword_nmt is not installed".
        try:
            from subword_nmt import apply_bpe
        except ImportError as err:
            raise ImportError(
                'Please install subword_nmt with: pip install subword-nmt'
            ) from err

        # Reuse subword-nmt's own argument parser so that every option we do
        # not set explicitly (merges, glossaries, ...) gets the library default.
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args([
            '--codes', codes,
            '--separator', args.bpe_separator,
        ])
        # NOTE(review): the fourth positional argument is presumably ``vocab``
        # -- confirm against the installed subword_nmt version.
        self.bpe = apply_bpe.BPE(
            bpe_args.codes,
            bpe_args.merges,
            bpe_args.separator,
            None,
            bpe_args.glossaries,
        )
        # Separator followed by a space, stored for use by the rest of the class.
        self.bpe_symbol = bpe_args.separator + ' '
开发者ID:pytorch,项目名称:fairseq,代码行数:23,代码来源:subword_nmt_bpe.py

示例2: finalize

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def finalize(
        self,
        frequencies: Dict[str, int],
        num_symbols: int,
        minfreq: int,
    ) -> bool:
        """
        Report whether the codecs were built during this call.

        This default implementation builds nothing: default helpers are
        pre-trained and therefore do not build their own codecs. All arguments
        are accepted for interface compatibility and are not read here.

        :param frequencies:
            mapping from token to its frequency
        :param num_symbols:
            number of BPE symbols (30000-40000 recommended; values <= 0 mean
            the default of 30000)
        :param minfreq:
            minimum token frequency before forced BPE decomposition (values
            <= 0 mean the subword-nmt default of 2)

        :return did_finalize:
            always ``False`` in this implementation.
        """
        did_finalize = False
        return did_finalize
开发者ID:facebookresearch,项目名称:ParlAI,代码行数:23,代码来源:bpe.py

示例3: __init__

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, opt: Opt, shared: TShared = None):
        """
        Set up the subword-nmt based BPE helper.

        Runs the parent initializer, verifies that subword-nmt is available and
        that a dictionary file was configured, then loads previously saved
        codecs when ``<dict_file>.codecs`` already exists on disk.

        :param opt:
            options; must provide a non-empty ``dict_file`` entry
        :param shared:
            shared dictionary, forwarded to the parent initializer

        :raises RuntimeError:
            if ``SUBWORD_BPE_INSTALLED`` is falsy or ``dict_file`` is not set.
        """
        super().__init__(opt, shared)

        if not SUBWORD_BPE_INSTALLED:
            install_hint = (
                "Please run \"pip install 'git+https://github.com/rsennrich"
                "/subword-nmt.git#egg=subword-nmt'\""
            )
            raise RuntimeError(install_hint)

        dict_file_configured = bool(opt.get('dict_file'))
        if not dict_file_configured:
            raise RuntimeError('--dict-file is mandatory.')

        # A token is either a run of word characters or one non-space symbol.
        self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

        # The codecs file lives next to the dictionary file.
        self.codecs = f"{opt['dict_file']}.codecs"
        codecs_on_disk = os.path.exists(self.codecs)
        if codecs_on_disk:
            self._load_from_codecs()
开发者ID:facebookresearch,项目名称:ParlAI,代码行数:25,代码来源:bpe.py

示例4: helper_decode

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def helper_decode(
        self,
        tokens: List[str],
        token_ids: List[int],
        delimiter: str,
    ) -> str:
        """
        Turn a list of BPE tokens back into plain text.

        The tokens are joined with ``delimiter``; every ``'@@ '`` continuation
        marker is then removed so split words are glued together again, and
        ``'__newln__'`` placeholders are turned back into newlines.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids (not used by this implementation)
        :param delimiter:
            string placed between consecutive tokens

        :return text:
            decoded text
        """
        joined = delimiter.join(tokens).replace('@@ ', '')
        # The very last token may itself carry a dangling continuation marker.
        if joined.endswith('@@'):
            joined = joined[: len(joined) - 2]
        return joined.replace('__newln__', '\n')
开发者ID:facebookresearch,项目名称:ParlAI,代码行数:25,代码来源:bpe.py

示例5: sync_with_dict

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def sync_with_dict(self, dict_agent):
        """
        Copy every token known to this helper into a dictionary agent.

        Each value of ``self.encoder`` is registered through
        ``dict_agent.add_token`` and then gets its frequency entry set to 1.

        NOTE: it is an open question how special tokens are handled here.

        :param dict_agent:
            A DictionaryAgent instantiation
        """
        for token in self.encoder.values():
            dict_agent.add_token(token)
            # Force the count to 1, whatever add_token may have recorded.
            dict_agent.freq[token] = 1


###################
# HuggingFace BPE #
################### 
开发者ID:facebookresearch,项目名称:ParlAI,代码行数:21,代码来源:bpe.py

示例6: __init__

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, bpe_codes: Union[str, TextIO],
                 bpe_vocab: Union[str, TextIO]):
        """
        Build the underlying subword-nmt BPE object from codes and vocabulary.

        Each argument may be a path (``str``), which is opened here as UTF-8
        text and closed again before returning, or an already opened text
        stream, which is used as-is and left open for the caller.

        :param bpe_codes: path to, or open stream of, the BPE codes
        :param bpe_vocab: path to, or open stream of, the BPE vocabulary
        """
        # Only streams opened by this method are tracked (and later closed).
        opened_here = []

        try:
            codes_stream = bpe_codes
            if isinstance(bpe_codes, str):
                codes_stream = open(bpe_codes, 'r', encoding='utf-8')
                opened_here.append(codes_stream)

            vocab_stream = bpe_vocab
            if isinstance(bpe_vocab, str):
                vocab_stream = open(bpe_vocab, 'r', encoding='utf-8')
                opened_here.append(vocab_stream)

            adapted_codes = BPECodesAdapter(codes_stream)
            vocabulary = read_vocabulary(vocab_stream, threshold=None)
            self.bpe = subword_nmt_bpe(codes=adapted_codes, vocab=vocabulary)
            # Pin the codes format version on the BPE object.
            self.bpe.version = (0, 2)

        finally:
            for handle in opened_here:
                handle.close()
开发者ID:yannvgn,项目名称:laserembeddings,代码行数:26,代码来源:preprocessing.py

示例7: vec2txt

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def vec2txt(self, vector, delimiter=' '):
        """
        Render a vector of token IDs as a string.

        Every entry of ``vector`` (an iterable of ints) is looked up in this
        dictionary and the resulting tokens are joined with ``delimiter``
        (default ``' '``). With the BPE tokenizer, and unless the ``bpe_debug``
        option is set, the BPE pieces are merged back into whole words.
        """
        pieces = [self[int(idx)] for idx in vector]
        text = delimiter.join(pieces)

        # Nothing to undo unless BPE was used; in debug mode keep the raw pieces.
        if self.tokenizer != 'bpe' or self.opt.get('bpe_debug', False):
            return text

        text = text.replace('@@ ', '')
        # The final token may still end with a dangling continuation marker.
        if text.endswith('@@'):
            text = text[: len(text) - 2]
        return text.replace('__newln__', '\n')
开发者ID:natashamjaques,项目名称:neural_chat,代码行数:18,代码来源:dict.py

示例8: __init__

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, codecs_filename):
        """
        Create the BPE helper around a codecs file.

        When ``codecs_filename`` is already present on disk the pretrained
        codecs are loaded immediately; otherwise the path is only remembered
        so that a later call to `finalize()` can save the codecs there.

        :param codecs_filename:
            place to save/load codecs.

        :raises RuntimeError:
            if ``BPE_INSTALLED`` is falsy.
        """
        if not BPE_INSTALLED:
            install_hint = (
                "Please run \"pip install 'git+https://github.com/rsennrich"
                "/subword-nmt.git#egg=subword-nmt'\""
            )
            raise RuntimeError(install_hint)

        self.codecs = codecs_filename
        # A token is either a run of word characters or one non-space symbol.
        self.splitter = re.compile(r'\w+|[^\w\s]', re.UNICODE)

        codecs_on_disk = os.path.exists(self.codecs)
        if codecs_on_disk:
            self._load_from_codecs()
开发者ID:natashamjaques,项目名称:neural_chat,代码行数:23,代码来源:dict.py

示例9: tokenize

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def tokenize(self, text):
        """
        Split raw text into tokens, applying BPE once it is available.

        Newlines are first replaced by a ``__newln__`` placeholder token and
        the text is split with ``self.splitter``. If this object has a ``bpe``
        attribute the split tokens are segmented further with it; otherwise
        the plain split tokens (the ones used to train the BPE) are returned.

        :param text: str. Raw text to tokenize.
        :return: a list of tokens. Will use BPE once finalized.
        """
        words = self.splitter.findall(text.replace('\n', ' __newln__ '))

        if not hasattr(self, 'bpe'):
            return words
        return self.bpe.segment_tokens(words)
开发者ID:natashamjaques,项目名称:neural_chat,代码行数:18,代码来源:dict.py

示例10: __init__

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, codecs_filename):
        """Prepare the BPE module, loading pretrained codecs when available.

        If a file already exists at `codecs_filename` its codecs are loaded.
        If not, the path is kept so the codecs can be saved there after a call
        to `finalize()`.

        :param codecs_filename: place to save/load codecs.
        :raises RuntimeError: if ``BPE_INSTALLED`` is falsy.
        """
        if not BPE_INSTALLED:
            message = (
                "Please run \"pip install 'git+https://github.com/rsennrich"
                "/subword-nmt.git#egg=subword-nmt'\""
            )
            raise RuntimeError(message)

        # Matches runs of word characters, or any single non-space symbol.
        token_pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
        self.splitter = token_pattern
        self.codecs = codecs_filename

        if not os.path.exists(self.codecs):
            return
        self._load_from_codecs()
开发者ID:natashamjaques,项目名称:neural_chat,代码行数:21,代码来源:dict_v1.py

示例11: main

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def main():
  """Read titles from stdin and write tab-separated sampled predictions."""
  arg_parser = argparse.ArgumentParser(description=__doc__)
  arg_parser.add_argument('--host', default='localhost',
                          help='model server host')
  arg_parser.add_argument('--port', type=int, default=9000,
                          help='model server port')
  arg_parser.add_argument('--model_name', required=True,
                          help='model name')
  arg_parser.add_argument('--preprocessor',
                          help='tokenization script')
  arg_parser.add_argument('--postprocessor',
                          help='postprocessing script')
  arg_parser.add_argument('--bpe_codes', required=True,
                          help='BPE codes')
  arg_parser.add_argument('-n', type=int, default=5,
                          help='Number of comments to sample per title')

  options = arg_parser.parse_args()

  generate = Generator(
      host=options.host,
      port=options.port,
      model_name=options.model_name,
      preprocessor=options.preprocessor,
      postprocessor=options.postprocessor,
      bpe_codes=options.bpe_codes,
  )

  for title in sys.stdin:
    # The raw line (including its newline) is what the generator receives.
    hypotheses = generate(title, options.n)
    clean_title = title.strip()

    # Each hypothesis is a (prediction, score) pair; the score is not printed.
    for prediction, _score in hypotheses:
      sys.stdout.write('{}\t{}\n'.format(clean_title, prediction))
开发者ID:leod,项目名称:hncynic,代码行数:26,代码来源:client.py

示例12: learn_bpe

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def learn_bpe(self, item_list, from_filenames=True):
        """
        Learn BPE codes from the given items and activate them.

        A word-level vocabulary is collected from ``item_list``, the BPE codes
        learned from it are written to ``self.codes_file``, and that file is
        then handed to ``self.set_bpe``.

        :param item_list: items forwarded to ``_get_vocabulary``
        :param from_filenames: forwarded to ``_get_vocabulary`` unchanged
        """
        logging.info('generating bpe codes file. saving to %s' %
                     self.codes_file)

        def segment_words(line):
            # Word-level segmentation used while counting (before any BPE).
            return _segment_words(line, self.pre_tokenize)

        word_counts = _get_vocabulary(item_list,
                                      from_filenames=from_filenames,
                                      segment=segment_words)

        # One "<word> <count>" entry per vocabulary item.
        vocab_entries = []
        for word, count in word_counts.items():
            vocab_entries.append('{0} {1}'.format(word, count))

        # Learn the merges on the combined vocabulary and persist them.
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as codes_out:
            learn_bpe.learn_bpe(
                vocab_entries,
                codes_out,
                num_symbols=self.num_symbols,
                min_frequency=self.min_frequency,
                verbose=False,
                is_dict=True,
                total_symbols=self.total_symbols,
            )
        self.set_bpe(self.codes_file)
开发者ID:eladhoffer,项目名称:seq2seq.pytorch,代码行数:19,代码来源:tokenizer.py

示例13: __init__

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def __init__(self, model_path):
        """
        Load a subword-nmt BPE model from a codes file.

        :param model_path: path of the BPE codes file handed to ``BPE``.
        """
        super().__init__()
        from subword_nmt.apply_bpe import BPE
        # Decode explicitly as UTF-8: relying on the locale's default encoding
        # breaks (or silently mis-decodes) codes files with non-ASCII merges on
        # systems whose default is not UTF-8.
        with open(model_path, encoding='utf-8') as f:
            self.model = BPE(f)
开发者ID:pytorch,项目名称:fairseq,代码行数:7,代码来源:word_splitter.py

示例14: add_args

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def add_args(parser):
        """
        Register the subword-nmt BPE command-line options on ``parser``.

        Adds ``--bpe-codes`` (string, no default) and ``--bpe-separator``
        (defaults to ``'@@'``).
        """
        # fmt: off
        parser.add_argument(
            '--bpe-codes',
            type=str,
            help='path to subword NMT BPE',
        )
        parser.add_argument(
            '--bpe-separator',
            default='@@',
            help='BPE separator',
        )
        # fmt: on
开发者ID:pytorch,项目名称:fairseq,代码行数:9,代码来源:subword_nmt_bpe.py

示例15: should_sort

# 需要导入模块: from subword_nmt import apply_bpe [as 别名]
# 或者: from subword_nmt.apply_bpe import BPE [as 别名]
def should_sort(self) -> bool:
        """
        Tell the caller whether this helper's tokens should be sorted.

        DictionaryAgent sorts tokens upon saving; that is generally unwanted
        for pre-trained dictionaries, so this default answers ``False``.
        """
        sort_tokens = False
        return sort_tokens


###############
# Subword BPE #
############### 
开发者ID:facebookresearch,项目名称:ParlAI,代码行数:15,代码来源:bpe.py


注:本文中的subword_nmt.apply_bpe.BPE属性示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。