当前位置: 首页>>代码示例>>Python>>正文


Python sentencepiece.SentencePieceProcessor方法代码示例

本文整理汇总了Python中sentencepiece.SentencePieceProcessor方法的典型用法代码示例。如果您正苦于以下问题:Python sentencepiece.SentencePieceProcessor方法的具体用法?Python sentencepiece.SentencePieceProcessor怎么用?Python sentencepiece.SentencePieceProcessor使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在sentencepiece的用法示例。


在下文中一共展示了sentencepiece.SentencePieceProcessor方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
        """Build a tokenizer from either a SentencePiece model or a vocab file.

        If ``spm_model_file`` is given, a SentencePiece model is loaded and a
        piece->id vocabulary is derived from it; otherwise a WordPiece setup
        (vocab file + basic/wordpiece tokenizers) is used. Either way,
        ``self.vocab`` and its inverse ``self.inv_vocab`` are populated.
        """
        self.vocab = None
        self.sp_model = None
        if spm_model_file:
            import sentencepiece as spm

            self.sp_model = spm.SentencePieceProcessor()
            tf.compat.v1.logging.info("loading sentence piece model")
            self.sp_model.Load(spm_model_file)
            # Derive a piece -> id vocabulary so both backends expose the
            # same mapping interface to callers.
            self.vocab = {}
            for piece_id in range(self.sp_model.GetPieceSize()):
                self.vocab[self.sp_model.IdToPiece(piece_id)] = piece_id
        else:
            self.vocab = load_vocab(vocab_file)
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.inv_vocab = dict((idx, piece) for piece, idx in self.vocab.items())
开发者ID:kpe,项目名称:bert-for-tf2,代码行数:20,代码来源:albert_tokenization.py

示例2: train_sentence_piece_tokenizer

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def train_sentence_piece_tokenizer(documents, vocab_size):
    '''
    Train a SentencePiece tokenizer on an in-memory corpus.

    :param documents: list-like, a list of str documents
    :param vocab_size: int, the size of the vocabulary to output

    :return: sentencepiece.SentencePieceProcessor loaded with the trained model
    '''
    sp = None
    with tempfile.NamedTemporaryFile(delete=True) as tempf:
        with tempfile.NamedTemporaryFile(delete=True) as tempm:
            tempf.write(('\n'.join(documents)).encode())
            # Flush before training: the trainer re-opens the file by name, and
            # without a flush the corpus may still sit in the Python-level
            # buffer, leaving the trainer to read an empty file.
            tempf.flush()
            spm.SentencePieceTrainer.Train('--input=%s --model_prefix=%s --vocab_size=%s'
                                           % (tempf.name, tempm.name, vocab_size))
            sp = spm.SentencePieceProcessor()
            # The trainer writes '<model_prefix>.model' next to the temp file.
            sp.load(tempm.name + '.model')
    return sp
开发者ID:JasonKessler,项目名称:scattertext,代码行数:18,代码来源:demo_sentence_piece.py

示例3: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
                 vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
        """Train a SentencePiece model (if not already on disk) and load it.

        :param text_corpus_address: path to the plain-text training corpus.
        :param model_name: prefix for the ``<model_name>.model`` output file.
        :param vocab_size: target vocabulary size for training.
        :param spm_model_type: one of 'unigram', 'bpe', 'char', 'word'.
        :raises ValueError: if ``spm_model_type`` is not a supported type.
        """
        super().__init__(vocab_size)
        model_file = '{}.model'.format(model_name)
        model_type = spm_model_type.lower()
        if not os.path.exists(model_file):
            if model_type not in ('unigram', 'bpe', 'char', 'word'):
                raise ValueError(
                    '{} is not a valid model_type for sentence piece, '
                    'valid options are: unigram, bpe, char, word'.format(spm_model_type))
            spm.SentencePieceTrainer.Train(
                '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
                '--character_coverage={coverage} --model_type={model_type} '
                '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '
                '--training_sentence_size=100000000'.format(
                    input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                    model_type=model_type))
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_file)
开发者ID:yyht,项目名称:BERT,代码行数:19,代码来源:vocab.py

示例4: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, params):
        """Set up the KG-A2C training harness from a flat parameter dict.

        ``params`` must provide (at least) the keys read below: file paths
        (``rom_file_path``, ``spm_file``, ``tsv_file``, ``openie_path``,
        ``output_dir``), run sizing (``batch_size``, ``seed``, ``reset_steps``,
        ``stuck_steps``), and model options (``gat``, ``preload_weights``, ``lr``).
        """
        configure_logger(params['output_dir'])
        log('Parameters {}'.format(params))
        self.params = params
        # Game binding metadata for the chosen ROM (supplies max_word_length).
        self.binding = load_bindings(params['rom_file_path'])
        self.max_word_length = self.binding['max_word_length']
        # SentencePiece tokenizer used for game observations/actions.
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(params['spm_file'])
        kg_env = KGA2CEnv(params['rom_file_path'], params['seed'], self.sp,
                          params['tsv_file'], step_limit=params['reset_steps'],
                          stuck_steps=params['stuck_steps'], gat=params['gat'])
        # Vectorized batch of environments stepped in parallel.
        self.vec_env = VecEnv(params['batch_size'], kg_env, params['openie_path'])
        self.template_generator = TemplateActionGenerator(self.binding)
        env = FrotzEnv(params['rom_file_path'])
        self.vocab_act, self.vocab_act_rev = load_vocab(env)
        # NOTE(review): .cuda() hard-requires a GPU; this will fail on CPU-only hosts.
        self.model = KGA2C(params, self.template_generator.templates, self.max_word_length,
                           self.vocab_act, self.vocab_act_rev, len(self.sp), gat=self.params['gat']).cuda()
        self.batch_size = params['batch_size']
        if params['preload_weights']:
            # Resuming from a checkpoint replaces the freshly constructed model.
            self.model = torch.load(self.params['preload_weights'])['model']
        self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])

        self.loss_fn1 = nn.BCELoss()
        self.loss_fn2 = nn.BCEWithLogitsLoss()
        self.loss_fn3 = nn.MSELoss()
开发者ID:rajammanabrolu,项目名称:KG-A2C,代码行数:27,代码来源:gdqn.py

示例5: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_path: str, model_path: str):
        """Load a pretrained SentencePiece model plus its TSV vocabulary.

        :param vocab_path: tab-separated vocab file with ``token\\tlogprob`` rows;
            the row index defines the token id.
        :param model_path: path to the trained SentencePiece ``.model`` file.
        """
        self.vocab_path = vocab_path
        self.model_path = model_path

        # Load pretrained tokenizer model.
        self.model = sp.SentencePieceProcessor()
        self.model.Load(model_path)

        # Token <-> id mappings, filled from the vocab file below.
        self._token_to_id: Dict[str, int] = {}
        self._id_to_token: Dict[int, str] = {}

        with open(vocab_path, "r") as vocab_file:
            rows = csv.DictReader(
                vocab_file, delimiter="\t", fieldnames=["token", "logprob"]
            )
            for index, row in enumerate(rows):
                token = row["token"]
                self._token_to_id[token] = index
                self._id_to_token[index] = token
开发者ID:kdexd,项目名称:virtex,代码行数:21,代码来源:tokenizers.py

示例6: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, max_len=None,
                 do_lower_case=False, remove_space=True, keep_accents=False,
                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
                 additional_special_tokens=None, **kwargs):
        """Construct an XLNet tokenizer backed by a SentencePiece model file.

        :param vocab_file: path to the SentencePiece ``.model`` file.
        :param max_len: accepted for API compatibility; not used directly here.
        :param do_lower_case/remove_space/keep_accents: preprocessing flags
            stored for later tokenization calls.
        :param additional_special_tokens: extra special tokens; defaults to
            ``["<eop>", "<eod>"]``.
        """
        # None sentinel instead of a mutable list default, so the default is
        # not a single list object shared (and mutable) across all instances.
        if additional_special_tokens is None:
            additional_special_tokens = ["<eop>", "<eod>"]
        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                             unk_token=unk_token, sep_token=sep_token,
                                             pad_token=pad_token, cls_token=cls_token,
                                             mask_token=mask_token, additional_special_tokens=
                                             additional_special_tokens, **kwargs)
        try:
            import sentencepiece as spm
        except ImportError:
            # Fixed: the original implicit concatenation fused the URL and the
            # pip command into "...sentencepiecepip install...".
            logger.warning("You need to install SentencePiece to use XLNetTokenizer: "
                           "https://github.com/google/sentencepiece "
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)
开发者ID:linhaow,项目名称:TextClassify,代码行数:25,代码来源:tokenization_xlnet.py

示例7: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Initialize from a serialized SentencePiece model or a WordPiece vocab file."""
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
      self.sp_model = spm.SentencePieceProcessor()
      tf.logging.info("loading sentence piece model")
      # Read through tf.gfile first: gfile can open files (e.g. on remote
      # filesystems) that SentencePiece's own loader cannot.
      serialized_model = tf.gfile.GFile(spm_model_file, "rb").read()
      self.sp_model.LoadFromSerializedProto(serialized_model)
      # Derive a piece -> id vocabulary so both branches share one API.
      self.vocab = {}
      for piece_id in range(self.sp_model.GetPieceSize()):
        self.vocab[self.sp_model.IdToPiece(piece_id)] = piece_id
    else:
      self.vocab = load_vocab(vocab_file)
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = dict((idx, piece) for piece, idx in self.vocab.items())
开发者ID:google-research,项目名称:albert,代码行数:20,代码来源:tokenization.py

示例8: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, sp_model_path, *args, **kwargs):
        """Wrap a trained SentencePiece model as a tokenizer.

        :param sp_model_path: path to the SentencePiece ``.model`` file.

        Resolves the pad/unk pieces from the model, records the vocab size,
        then best-effort resolves ids for the known special tokens.
        """
        super(SpTokenizer, self).__init__(*args, **kwargs)
        import sentencepiece as spm
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sp_model_path)
        self._token_pad = self.sp_model.id_to_piece(self.sp_model.pad_id())
        self._token_unk = self.sp_model.id_to_piece(self.sp_model.unk_id())
        self._vocab_size = self.sp_model.get_piece_size()

        for token in ['pad', 'unk', 'mask', 'start', 'end']:
            try:
                _token = getattr(self, '_token_%s' % token)
                _token_id = self.sp_model.piece_to_id(_token)
                setattr(self, '_token_%s_id' % token, _token_id)
            except Exception:
                # Deliberate best-effort: some '_token_*' attributes may not be
                # defined. Narrowed from a bare ``except`` which also swallowed
                # KeyboardInterrupt/SystemExit.
                pass
开发者ID:bojone,项目名称:bert4keras,代码行数:18,代码来源:tokenizers.py

示例9: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
                 vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
        """Train a SentencePiece model on first use, then load it.

        :param text_corpus_address: plain-text corpus used only when no
            ``<model_name>.model`` file exists yet.
        :param model_name: prefix of the on-disk model file.
        :param vocab_size: target vocabulary size.
        :param spm_model_type: 'unigram', 'bpe', 'char' or 'word'.
        :raises ValueError: for any other ``spm_model_type``.
        """
        super().__init__(vocab_size)
        model_file = '{}.model'.format(model_name)
        model_type = spm_model_type.lower()
        if not os.path.exists(model_file):
            if model_type not in ('unigram', 'bpe', 'char', 'word'):
                raise ValueError(
                    '{} is not a valid model_type for sentence piece, '
                    'valid options are: unigram, bpe, char, word'.format(spm_model_type))
            spm.SentencePieceTrainer.Train(
                '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
                '--character_coverage={coverage} --model_type={model_type} '
                '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                    input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                    model_type=model_type))
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_file)
开发者ID:Separius,项目名称:BERT-keras,代码行数:18,代码来源:vocab.py

示例10: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
        self.vocab = None
        self.sp_model = None
        if spm_model_file:
            self.sp_model = spm.SentencePieceProcessor()
            tf.logging.info("loading sentence piece model")
            self.sp_model.Load(spm_model_file)
            # Note(mingdachen): For the purpose of consisent API, we are
            # generating a vocabulary for the sentence piece tokenizer.
            self.vocab = {self.sp_model.IdToPiece(i): i for i
                          in range(self.sp_model.GetPieceSize())}
        else:
            self.vocab = load_vocab(vocab_file)
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.inv_vocab = {v: k for k, v in self.vocab.items()} 
开发者ID:amansrivastava17,项目名称:embedding-as-service,代码行数:18,代码来源:tokenization.py

示例11: spm_srcs

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def spm_srcs(tmp_path: Path):
    """Train a tiny throwaway SentencePiece model over ``ascii_letters``.

    Returns ``(model_path, vocabs)`` where ``vocabs`` is the expected set of
    surface tokens for the test corpus.
    """
    input_text = tmp_path / "text"
    # +4 accounts for the reserved pieces (<unk>, <s>, </s>) and the
    # whitespace marker piece — TODO confirm against the trainer defaults.
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    # One-line corpus: all ASCII letters.
    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.load(model)

    with input_text.open("r") as f:
        # Seed with pieces every model contains: <unk> and the space marker.
        vocabs = {"<unk>", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
        # NOTE(review): this union sits outside the for loop, so only the last
        # line's tokens are collected. That is equivalent for the single-line
        # corpus written above, but confirm intent before multi-line reuse.
        vocabs |= set(tokens)
    return model, vocabs
开发者ID:espnet,项目名称:espnet,代码行数:27,代码来源:test_sentencepiece_tokenizer.py

示例12: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, filename, *inputs, **kwargs):
        """Load a SentencePiece encoder from ``filename`` (file or directory).

        :param filename: path to the model file, or a directory containing
            ``self.def_name``.

        Also records a short hash of the model bytes and the id SentencePiece
        inserts for a blank line before a lone special token.
        """
        super().__init__(*inputs, **kwargs)
        self.max_len_single_sentence = 1024 # no default special tokens - you can update this value if you add special tokens
        self.max_len_sentences_pair = 1024 # no default special tokens - you can update this value if you add special tokens

        if os.path.isdir(filename): filename = os.path.join(filename, self.def_name)

        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(filename)
        # Context manager so the model file handle is closed deterministically
        # instead of leaking until garbage collection.
        with open(filename, 'rb') as model_file:
            self.hash = hashlib.sha512(model_file.read()).hexdigest()[:10]
        self.filename = filename
        # for some reason SentencePiece inserts a blank line id before special token if that is the only
        # token in the line. I'd like to remove that blank line id from encoding.
        nl_ids = self.sp.EncodeAsIds(NEW_LINE)
        assert(len(nl_ids) == 2)
        self.blank_line_id = nl_ids[0]
开发者ID:mgrankin,项目名称:ru_transformers,代码行数:18,代码来源:sp_encoder.py

示例13: main

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def main():
    """Parse CLI options and convert a raw corpus into tokenized TFRecord files."""
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument('--spm_model_path', default='spm_model/ch.model', type=str, required=False, help='sentencepiece模型地址')
    arg_parser.add_argument('--raw_data_path', default='data/train_test.txt', type=str, required=False, help='原始语料地址')
    arg_parser.add_argument('--save_tfrecord_path', default='data/tokenized/', type=str, required=False, help='处理后的语料存放地址')
    arg_parser.add_argument('--min_length', default=10, type=int, required=False, help='最短收录句子长度')
    arg_parser.add_argument('--n_ctx', default=512, type=int, required=False, help='每个训练样本的长度')
    arg_parser.add_argument('--batch_size', default=8, type=int, required=False, help='只用于XL模型,XL模型的batch size,GPT2设置为1')
    arg_parser.add_argument('--pad', default=0, type=int, required=False, help='PAD值')
    arg_parser.add_argument('--epochs', default=1, type=int, required=False, help='只用于XL模型,GPT2设置为1')

    opts = arg_parser.parse_args()
    print('args:\n' + repr(opts))

    # Tokenizer used to turn raw text into ids before serialization.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(opts.spm_model_path)

    build_tfrecord(opts.raw_data_path, opts.save_tfrecord_path, tokenizer,
                   opts.min_length, opts.n_ctx, opts.batch_size,
                   pad=opts.pad, epochs=opts.epochs)
开发者ID:Morizeyao,项目名称:Decoders-Chinese-TF2.0,代码行数:22,代码来源:prepare_data.py

示例14: translate_text

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def translate_text(text, source, target):
    """Translate ``text`` from language ``source`` to language ``target``.

    Splits the SentencePiece pieces of ``text`` into sentence-sized chunks at
    terminating punctuation, translates each chunk, and joins the decoded
    results with newlines.

    :param text: input text to translate.
    :param source: source language code (key into ``translations``).
    :param target: target language code.
    :return: translated text, one translated chunk per line.
    """
    if source == target:
        # The easy case ;-)
        return text

    t = translations[source][target]
    s = spm.SentencePieceProcessor()
    s.Load(os.path.join(ROOT_DIR, 'models', t["sentencepiece_model"]))
    pieces = s.encode_as_pieces(text)

    # Ensure any trailing words without terminating punctuation is also translated.
    if pieces[-1] != '.':
        pieces.append('.')
    # For other languages we will need a better system for chunking sentences or parts of text.
    indices = [i for i, _x in enumerate(pieces) if _x in [".", "!", "?"]]

    complete_result = []
    start = 0
    for i in indices:
        x = " ".join([e for e in pieces[start:i+1]])
        result = _translate(x, translate_model=t['translate_model'])
        y = s.decode_pieces(result[1][0].split(" "))
        complete_result.append(y)
        # Advance past the punctuation piece. The previous ``start = i`` made
        # every subsequent chunk begin with the prior sentence's punctuation
        # token, duplicating it in the next translation request.
        start = i + 1
    return "\n".join(complete_result)
开发者ID:singnet,项目名称:nlp-services,代码行数:27,代码来源:translate_server.py

示例15: __init__

# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Tokenizer using SentencePiece when a model file is supplied, else WordPiece."""
    self.vocab = None
    self.sp_model = None
    print(spm_model_file)
    if spm_model_file:
      self.sp_model = spm.SentencePieceProcessor()
      logger.info("loading sentence piece model")
      self.sp_model.Load(str(spm_model_file))
      # Mirror the model's pieces into a piece -> id vocab so both branches
      # expose the same mapping interface.
      self.vocab = {}
      for piece_id in range(self.sp_model.GetPieceSize()):
        self.vocab[self.sp_model.IdToPiece(piece_id)] = piece_id
    else:
      print("load vocab")
      self.vocab = load_vocab(vocab_file)
      print("load token")
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token="[UNK]", max_input_chars_per_word=100)
    self.inv_vocab = dict((idx, piece) for piece, idx in self.vocab.items())
开发者ID:lonePatient,项目名称:Bert-Multi-Label-Text-Classification,代码行数:21,代码来源:tokenization_albert.py


注:本文中的sentencepiece.SentencePieceProcessor方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。