本文整理汇总了Python中sentencepiece.SentencePieceProcessor类的典型用法代码示例。如果您正苦于以下问题:Python sentencepiece.SentencePieceProcessor的具体用法?Python sentencepiece.SentencePieceProcessor怎么用?Python sentencepiece.SentencePieceProcessor的使用例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块sentencepiece的用法示例。
在下文中一共展示了sentencepiece.SentencePieceProcessor类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Set up a SentencePiece-backed or a WordPiece-backed tokenizer.

    Args:
        vocab_file: Handed to ``load_vocab`` when no SentencePiece model
            is supplied.
        do_lower_case: Forwarded to ``BasicTokenizer`` on the WordPiece path.
        spm_model_file: Path of a SentencePiece model; when truthy it is
            used instead of ``vocab_file``.
    """
    self.vocab = None
    self.sp_model = None
    if not spm_model_file:
        # NOTE(review): the snippet's indentation was lost; the WordPiece
        # helpers are assumed to belong to this branch -- confirm.
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    else:
        # sentencepiece is imported only on this branch.
        import sentencepiece as spm

        self.sp_model = spm.SentencePieceProcessor()
        tf.compat.v1.logging.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # Note(mingdachen): For the purpose of consistent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        piece_to_id = {}
        for piece_id in range(self.sp_model.GetPieceSize()):
            piece_to_id[self.sp_model.IdToPiece(piece_id)] = piece_id
        self.vocab = piece_to_id
    # Reverse lookup: id -> token.
    self.inv_vocab = {index: token for token, index in self.vocab.items()}
示例2: train_sentence_piece_tokenizer
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def train_sentence_piece_tokenizer(documents, vocab_size):
    '''
    Train a SentencePiece model on in-memory documents and return it loaded.

    :param documents: list-like, a list of str documents; they are joined
        with newlines and written to a temporary training file
    :param vocab_size: int, the size of the vocabulary to output
    :return: sentencepiece.SentencePieceProcessor with the trained model loaded
    '''
    import os

    # Work inside a throw-away directory so that the ``.model`` / ``.vocab``
    # files the trainer writes next to the prefix are cleaned up as well.
    with tempfile.TemporaryDirectory() as workdir:
        corpus_path = os.path.join(workdir, 'corpus.txt')
        model_prefix = os.path.join(workdir, 'spm')
        # Close the corpus file before training so every byte is on disk
        # when the trainer opens it by name.
        with open(corpus_path, 'wb') as corpus:
            corpus.write('\n'.join(documents).encode())
        spm.SentencePieceTrainer.Train(
            '--input=%s --model_prefix=%s --vocab_size=%s'
            % (corpus_path, model_prefix, vocab_size))
        sp = spm.SentencePieceProcessor()
        # Load while the model file still exists.
        sp.load(model_prefix + '.model')
    return sp
示例3: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
             vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
    """Load ``<model_name>.model``, training it first if the file is missing.

    Args:
        text_corpus_address: Training corpus passed to the trainer as
            ``--input``. Only needed when the model file does not exist.
        model_name: Model prefix; the model file is ``<model_name>.model``.
        vocab_size: Passed to the parent initializer and to the trainer.
        spm_model_type: One of unigram, bpe, char, word (case-insensitive).

    Raises:
        ValueError: If training is required and ``spm_model_type`` is not
            supported, or no training corpus was given.
    """
    super().__init__(vocab_size)
    model_path = '{}.model'.format(model_name)
    if not os.path.exists(model_path):
        model_type = spm_model_type.lower()
        if model_type not in ('unigram', 'bpe', 'char', 'word'):
            raise ValueError(
                '{} is not a valid model_type for sentence piece, '
                'valid options are: unigram, bpe, char, word'.format(spm_model_type))
        if text_corpus_address is None:
            # Without this guard the trainer would be asked to read a file
            # literally named "None".
            raise ValueError(
                'text_corpus_address is required because {} does not exist '
                'and has to be trained'.format(model_path))
        spm.SentencePieceTrainer.Train(
            '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
            '--character_coverage={coverage} --model_type={model_type} '
            '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '
            '--training_sentence_size=100000000'.format(
                input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                model_type=model_type))
    self.sp = spm.SentencePieceProcessor()
    self.sp.load(model_path)
示例4: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, params):
    """Build the environments, model, optimizer and losses from ``params``.

    Args:
        params: Mapping read by key below (``output_dir``, ``rom_file_path``,
            ``spm_file``, ``seed``, ``tsv_file``, ``reset_steps``,
            ``stuck_steps``, ``gat``, ``batch_size``, ``openie_path``,
            ``preload_weights``, ``lr``).
    """
    configure_logger(params['output_dir'])
    log('Parameters {}'.format(params))
    self.params = params

    rom_path = params['rom_file_path']
    self.binding = load_bindings(rom_path)
    self.max_word_length = self.binding['max_word_length']

    # SentencePiece model shared with the environment; its length is also
    # handed to the network constructor below.
    tokenizer = spm.SentencePieceProcessor()
    self.sp = tokenizer
    tokenizer.Load(params['spm_file'])

    kg_env = KGA2CEnv(
        rom_path,
        params['seed'],
        tokenizer,
        params['tsv_file'],
        step_limit=params['reset_steps'],
        stuck_steps=params['stuck_steps'],
        gat=params['gat'],
    )
    self.vec_env = VecEnv(params['batch_size'], kg_env, params['openie_path'])
    self.template_generator = TemplateActionGenerator(self.binding)

    # A separate FrotzEnv instance is created just to load the vocabulary.
    env = FrotzEnv(rom_path)
    self.vocab_act, self.vocab_act_rev = load_vocab(env)

    network = KGA2C(
        params,
        self.template_generator.templates,
        self.max_word_length,
        self.vocab_act,
        self.vocab_act_rev,
        len(tokenizer),
        gat=params['gat'],
    )
    self.model = network.cuda()
    self.batch_size = params['batch_size']

    # A checkpoint, when given, replaces the freshly built model.
    weights_path = params['preload_weights']
    if weights_path:
        self.model = torch.load(weights_path)['model']

    self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])
    self.loss_fn1 = nn.BCELoss()
    self.loss_fn2 = nn.BCEWithLogitsLoss()
    self.loss_fn3 = nn.MSELoss()
示例5: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_path: str, model_path: str):
    """Load a SentencePiece model and its tab-separated vocabulary file.

    Args:
        vocab_path: Tab-separated file read as ``token<TAB>logprob`` rows;
            the row index becomes the token id.
        model_path: SentencePiece model file loaded into ``self.model``.
    """
    self.vocab_path = vocab_path
    self.model_path = model_path

    # Load pretrained tokenizer model.
    self.model = sp.SentencePieceProcessor()
    self.model.Load(model_path)

    # Load vocabulary mapping (and inverse mapping) between token and id.
    self._token_to_id: Dict[str, int] = {}
    self._id_to_token: Dict[int, str] = {}

    # Explicit UTF-8 keeps non-ASCII pieces readable regardless of the
    # platform default encoding; ``newline=""`` is what the csv module
    # asks for when it is given a file object.
    with open(vocab_path, "r", encoding="utf-8", newline="") as vocab_file:
        reader = csv.DictReader(
            vocab_file,
            delimiter="\t",
            fieldnames=["token", "logprob"],
            # Rows are not quoted; without this a token containing a
            # double quote would be parsed as the start of a quoted field.
            quoting=csv.QUOTE_NONE,
        )
        for index, row in enumerate(reader):
            self._token_to_id[row["token"]] = index
            self._id_to_token[index] = row["token"]
示例6: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, max_len=None,
             do_lower_case=False, remove_space=True, keep_accents=False,
             bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
             pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
             additional_special_tokens=["<eop>", "<eod>"], **kwargs):
    """Initialize the tokenizer and load the SentencePiece model.

    Args:
        vocab_file: SentencePiece model file loaded into ``self.sp_model``.
        max_len: Accepted for interface compatibility; not used here.
        do_lower_case, remove_space, keep_accents: Stored on the instance.
        bos_token, eos_token, unk_token, sep_token, pad_token, cls_token,
            mask_token, additional_special_tokens, **kwargs: Forwarded to
            the parent initializer.

    Raises:
        ImportError: If the ``sentencepiece`` package is not installed.
    """
    if isinstance(additional_special_tokens, list):
        # Hand the parent its own copy so the shared default list (or the
        # caller's list) can never be mutated through this instance.
        additional_special_tokens = list(additional_special_tokens)
    super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                         unk_token=unk_token, sep_token=sep_token,
                                         pad_token=pad_token, cls_token=cls_token,
                                         mask_token=mask_token,
                                         additional_special_tokens=additional_special_tokens,
                                         **kwargs)
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
                       " pip install sentencepiece")
        # Re-raise: continuing would only fail later on the unbound ``spm``
        # name with a less helpful error.
        raise

    self.do_lower_case = do_lower_case
    self.remove_space = remove_space
    self.keep_accents = keep_accents
    self.vocab_file = vocab_file

    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(vocab_file)
示例7: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Set up a SentencePiece-backed or a WordPiece-backed tokenizer.

    Args:
        vocab_file: Handed to ``load_vocab`` when no SentencePiece model
            is supplied.
        do_lower_case: Forwarded to ``BasicTokenizer`` on the WordPiece path.
        spm_model_file: Path of a SentencePiece model, read through
            ``tf.gfile``; when truthy it is used instead of ``vocab_file``.
    """
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
        self.sp_model = spm.SentencePieceProcessor()
        tf.logging.info("loading sentence piece model")
        # Handle cases where SP can't load the file, but gfile can.
        # The context manager closes the handle once the bytes are read.
        with tf.gfile.GFile(spm_model_file, "rb") as model_file:
            serialized_model = model_file.read()
        self.sp_model.LoadFromSerializedProto(serialized_model)
        # Note(mingdachen): For the purpose of consistent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i for i
                      in range(self.sp_model.GetPieceSize())}
    else:
        # NOTE(review): the snippet's indentation was lost; the WordPiece
        # helpers are assumed to belong to this branch -- confirm.
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    # Reverse lookup: id -> token.
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
示例8: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, sp_model_path, *args, **kwargs):
    """Initialize the tokenizer from a SentencePiece model file.

    Args:
        sp_model_path: SentencePiece model file loaded into ``self.sp_model``.
        *args, **kwargs: Forwarded to the parent initializer.
    """
    super(SpTokenizer, self).__init__(*args, **kwargs)
    import sentencepiece as spm
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(sp_model_path)
    # Pad/unk pieces and the vocabulary size are taken from the model.
    self._token_pad = self.sp_model.id_to_piece(self.sp_model.pad_id())
    self._token_unk = self.sp_model.id_to_piece(self.sp_model.unk_id())
    self._vocab_size = self.sp_model.get_piece_size()

    # Best effort: record ``_token_<name>_id`` for every special token that
    # can be resolved, and skip the ones that cannot.
    for token in ('pad', 'unk', 'mask', 'start', 'end'):
        try:
            _token = getattr(self, '_token_%s' % token)
            _token_id = self.sp_model.piece_to_id(_token)
            setattr(self, '_token_%s_id' % token, _token_id)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            continue
示例9: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
             vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
    """Load ``<model_name>.model``, training it first if the file is missing.

    Args:
        text_corpus_address: Training corpus passed to the trainer as
            ``--input``. Only needed when the model file does not exist.
        model_name: Model prefix; the model file is ``<model_name>.model``.
        vocab_size: Passed to the parent initializer and to the trainer.
        spm_model_type: One of unigram, bpe, char, word (case-insensitive).

    Raises:
        ValueError: If training is required and ``spm_model_type`` is not
            supported, or no training corpus was given.
    """
    super().__init__(vocab_size)
    model_file = '{}.model'.format(model_name)
    needs_training = not os.path.exists(model_file)
    if needs_training:
        normalized_type = spm_model_type.lower()
        if normalized_type not in ('unigram', 'bpe', 'char', 'word'):
            raise ValueError(
                '{} is not a valid model_type for sentence piece, '
                'valid options are: unigram, bpe, char, word'.format(spm_model_type))
        if text_corpus_address is None:
            # Fail early instead of asking the trainer to read "None".
            raise ValueError(
                'text_corpus_address is required because {} does not exist '
                'and has to be trained'.format(model_file))
        spm.SentencePieceTrainer.Train(
            '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
            '--character_coverage={coverage} --model_type={model_type} '
            '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                model_type=normalized_type))
    self.sp = spm.SentencePieceProcessor()
    self.sp.load(model_file)
示例10: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Create the tokenizer from a SentencePiece model or a vocab file.

    Args:
        vocab_file: Vocabulary file given to ``load_vocab``; only used when
            ``spm_model_file`` is falsy.
        do_lower_case: Passed through to ``BasicTokenizer``.
        spm_model_file: Optional SentencePiece model path.
    """
    self.vocab = None
    self.sp_model = None

    use_sentencepiece = bool(spm_model_file)
    if use_sentencepiece:
        self.sp_model = spm.SentencePieceProcessor()
        tf.logging.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # Note(mingdachen): For the purpose of consistent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        piece_ids = range(self.sp_model.GetPieceSize())
        pieces = [self.sp_model.IdToPiece(piece_id) for piece_id in piece_ids]
        self.vocab = dict(zip(pieces, piece_ids))
    else:
        # NOTE(review): the snippet's indentation was lost; the WordPiece
        # helpers are assumed to belong to this branch -- confirm.
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    # id -> token mapping, the inverse of ``self.vocab``.
    self.inv_vocab = dict(zip(self.vocab.values(), self.vocab.keys()))
示例11: spm_srcs
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def spm_srcs(tmp_path: Path):
    """Train a tiny SentencePiece model under ``tmp_path``.

    The training text is a single line holding ``string.ascii_letters``.

    Returns:
        ``(model, vocabs)``: the path string of the trained ``.model`` file
        and a set of symbols seeded with ``"<unk>"`` and ``"▁"`` and
        extended with the characters decoded from the training text.
    """
    corpus_path = tmp_path / "text"
    prefix = tmp_path / "model"
    model = f"{prefix}.model"
    vocabsize = len(string.ascii_letters) + 4
    input_sentence_size = 100000

    corpus_path.write_text(string.ascii_letters + "\n")

    trainer_flags = [
        f"--input={corpus_path}",
        f"--vocab_size={vocabsize}",
        f"--model_prefix={prefix}",
        f"--input_sentence_size={input_sentence_size}",
    ]
    spm.SentencePieceTrainer.Train(" ".join(trainer_flags))

    sp = spm.SentencePieceProcessor()
    sp.load(model)

    vocabs = {"<unk>", "▁"}
    with corpus_path.open("r") as handle:
        for raw_line in handle:
            # Each character of the line is passed as one piece.
            decoded = sp.DecodePieces(list(raw_line.strip()))
            vocabs.update(set(decoded))
    return model, vocabs
示例12: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, filename, *inputs, **kwargs):
    """Load a SentencePiece model and record a short hash of its file.

    Args:
        filename: Model file, or a directory; for a directory the file
            named ``self.def_name`` inside it is used.
        *inputs, **kwargs: Forwarded to the parent initializer.
    """
    super().__init__(*inputs, **kwargs)
    self.max_len_single_sentence = 1024  # no default special tokens - you can update this value if you add special tokens
    self.max_len_sentences_pair = 1024  # no default special tokens - you can update this value if you add special tokens

    if os.path.isdir(filename):
        filename = os.path.join(filename, self.def_name)

    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(filename)
    # First 10 hex digits of the SHA-512 of the model file; the ``with``
    # block makes sure the handle is closed after reading.
    with open(filename, 'rb') as model_file:
        self.hash = hashlib.sha512(model_file.read()).hexdigest()[:10]
    self.filename = filename
    # for some reason SentencePiece inserts a blank line id before special token if that is the only
    # token in the line. I'd like to remove that blank line id from encoding.
    nl_ids = self.sp.EncodeAsIds(NEW_LINE)
    assert len(nl_ids) == 2
    self.blank_line_id = nl_ids[0]
示例13: main
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def main():
    """Parse command-line options, load the SentencePiece model and build TFRecords."""
    # (flag, default, type, help) for every supported option; all optional.
    option_specs = (
        ('--spm_model_path', 'spm_model/ch.model', str, 'sentencepiece模型地址'),
        ('--raw_data_path', 'data/train_test.txt', str, '原始语料地址'),
        ('--save_tfrecord_path', 'data/tokenized/', str, '处理后的语料存放地址'),
        ('--min_length', 10, int, '最短收录句子长度'),
        ('--n_ctx', 512, int, '每个训练样本的长度'),
        ('--batch_size', 8, int, '只用于XL模型,XL模型的batch size,GPT2设置为1'),
        ('--pad', 0, int, 'PAD值'),
        ('--epochs', 1, int, '只用于XL模型,GPT2设置为1'),
    )
    parser = argparse.ArgumentParser()
    for flag, default_value, value_type, description in option_specs:
        parser.add_argument(flag, default=default_value, type=value_type,
                            required=False, help=description)
    args = parser.parse_args()
    print('args:\n' + repr(args))

    ch_sp = spm.SentencePieceProcessor()
    ch_sp.Load(args.spm_model_path)

    build_tfrecord(
        args.raw_data_path,
        args.save_tfrecord_path,
        ch_sp,
        args.min_length,
        args.n_ctx,
        args.batch_size,
        pad=args.pad,
        epochs=args.epochs,
    )
示例14: translate_text
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def translate_text(text, source, target):
    """Translate ``text`` from ``source`` to ``target`` sentence by sentence.

    The text is split into SentencePiece pieces, cut after every ``.``,
    ``!`` or ``?`` piece, and each chunk is translated on its own.

    Args:
        text: Text to translate.
        source: Source language key into ``translations``.
        target: Target language key into ``translations[source]``.

    Returns:
        ``text`` unchanged when ``source == target``; otherwise the
        translated chunks joined with newlines (an empty string when the
        text yields no pieces).
    """
    if source == target:
        # The easy case ;-)
        return text

    t = translations[source][target]
    s = spm.SentencePieceProcessor()
    s.Load(os.path.join(ROOT_DIR, 'models', t["sentencepiece_model"]))
    pieces = s.encode_as_pieces(text)
    if not pieces:
        # Nothing to translate; also avoids indexing an empty list below.
        return ""

    terminators = (".", "!", "?")
    # Ensure any trailing words without terminating punctuation are also
    # translated. Text already ending in "!" or "?" needs no extra ".".
    if pieces[-1] not in terminators:
        pieces.append('.')

    # For other languages we will need a better system for chunking sentences or parts of text.
    complete_result = []
    start = 0
    for i, piece in enumerate(pieces):
        if piece not in terminators:
            continue
        x = " ".join(pieces[start:i + 1])
        result = _translate(x, translate_model=t['translate_model'])
        y = s.decode_pieces(result[1][0].split(" "))
        complete_result.append(y)
        # The next chunk starts after this terminator, so the punctuation
        # is not sent again with the following sentence.
        start = i + 1
    return "\n".join(complete_result)
示例15: __init__
# 需要导入模块: import sentencepiece [as 别名]
# 或者: from sentencepiece import SentencePieceProcessor [as 别名]
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Build the tokenizer from a SentencePiece model or a vocab file.

    Args:
        vocab_file: Vocabulary file for ``load_vocab``; used when
            ``spm_model_file`` is falsy.
        do_lower_case: Passed through to ``BasicTokenizer``.
        spm_model_file: Optional SentencePiece model path; converted with
            ``str()`` before loading.
    """
    self.vocab = None
    self.sp_model = None
    print(spm_model_file)
    if not spm_model_file:
        # NOTE(review): the snippet's indentation was lost; everything up
        # to the WordPiece tokenizer is assumed to belong to this branch
        # -- confirm.
        print("load vocab")
        self.vocab = load_vocab(vocab_file)
        print("load token")
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(
            vocab=self.vocab,
            unk_token="[UNK]",
            max_input_chars_per_word=100,
        )
    else:
        processor = spm.SentencePieceProcessor()
        self.sp_model = processor
        logger.info("loading sentence piece model")
        processor.Load(str(spm_model_file))
        # Note(mingdachen): For the purpose of consistent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        total_pieces = processor.GetPieceSize()
        self.vocab = {
            processor.IdToPiece(piece_id): piece_id
            for piece_id in range(total_pieces)
        }
    # id -> token mapping, the inverse of ``self.vocab``.
    self.inv_vocab = {token_id: token for token, token_id in self.vocab.items()}