This article collects typical usage examples of the tokenizers.ByteLevelBPETokenizer method in Python. If you have been wondering what exactly tokenizers.ByteLevelBPETokenizer does and how to use it, the hand-picked examples below may help. You can also explore further usage examples from the enclosing tokenizers module.

The following presents 3 code examples of tokenizers.ByteLevelBPETokenizer, ordered by popularity.
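For orientation before the examples: ByteLevelBPETokenizer is the byte-level BPE implementation (GPT-2 style) bundled with HuggingFace's tokenizers package. Below is a minimal sketch of training one on in-memory text and encoding a string; it assumes a recent tokenizers release where train_from_iterator is available, and the sample sentences are placeholders.

from tokenizers import ByteLevelBPETokenizer

# Train a tiny tokenizer on in-memory text (no corpus files needed).
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    ["hello world", "hello tokenizers"],  # placeholder training text
    vocab_size=300,   # kept above the 256-byte base alphabet so some merges are learned
    min_frequency=1,
)

encoding = tokenizer.encode("hello world")
print(encoding.tokens)  # subword strings
print(encoding.ids)     # integer ids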
Example 1: __init__

# Required imports: import tokenizers [as alias]
# Or: from tokenizers import ByteLevelBPETokenizer [as alias]

def __init__(self, args):
    try:
        # Import lazily so tokenizers is only required when this BPE is used.
        from tokenizers import ByteLevelBPETokenizer
    except ImportError:
        raise ImportError(
            'Please install huggingface/tokenizers with: '
            'pip install tokenizers'
        )

    self.bpe = ByteLevelBPETokenizer(
        args.bpe_vocab,
        args.bpe_merges,
        add_prefix_space=getattr(args, 'bpe_add_prefix_space', False),
    )
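For completeness, a hedged sketch of what the bpe object built above can then do, calling the tokenizers API directly. The gpt2-* paths stand in for args.bpe_vocab and args.bpe_merges and are assumptions, not files that ship with this example.

from tokenizers import ByteLevelBPETokenizer

bpe = ByteLevelBPETokenizer(
    'gpt2-vocab.json',   # stands in for args.bpe_vocab (assumed path)
    'gpt2-merges.txt',   # stands in for args.bpe_merges (assumed path)
    add_prefix_space=False,
)
enc = bpe.encode('Hello world')
print(enc.tokens, enc.ids)
print(bpe.decode(enc.ids))  # round-trips back to the original text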
Example 2: train_tokenizer

# Required imports: import tokenizers [as alias]
# Or: from tokenizers import ByteLevelBPETokenizer [as alias]

import logging
from typing import List, Union

from tokenizers import ByteLevelBPETokenizer

logger = logging.getLogger(__name__)


def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 5000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
) -> None:
    """
    Train a tokenizer on the given text file(s), wrapping the tokenizers package.

    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train the tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """
    assert isinstance(files, str) or isinstance(
        files, list
    ), "files must be a string or a list."

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
        show_progress=True,
    )

    # Currently doesn't do anything
    # See: https://github.com/huggingface/tokenizers/issues/233
    # tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    logger.info(
        f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
        + "You will need both files to build the GPT2Tokenizer."
    )

    # save(directory, prefix) writes {prefix}-vocab.json and {prefix}-merges.txt
    # in tokenizers < 0.9; newer releases expose this as save_model(directory, prefix).
    tokenizer.save(save_path, PREFIX)
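Since the log message points to GPT2Tokenizer, a plausible follow-up is to rebuild a GPT-2 style tokenizer from the two saved files. This sketch assumes the transformers package is installed and uses input.txt as a placeholder corpus:

from transformers import GPT2Tokenizer

train_tokenizer("input.txt", vocab_size=5000, save_path=".")  # placeholder corpus file

# Rebuild a GPT-2 style tokenizer from the two files written above.
gpt2_tokenizer = GPT2Tokenizer(
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
)
print(gpt2_tokenizer.tokenize("Hello world"))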
Example 3: __init__

# Required imports: import tokenizers [as alias]
# Or: from tokenizers import ByteLevelBPETokenizer [as alias]

def __init__(self, opt: Opt, shared: TShared = None):
    super().__init__(opt, shared)
    # Default true for HF
    self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
    if self.add_prefix_space is None:
        self.add_prefix_space = True

    if opt.get('dict_loaded'):
        dfname = opt['dict_file']
        if os.path.isfile(f'{dfname}-merges.txt'):
            opt['bpe_merge'] = f'{dfname}-merges.txt'
        if os.path.isfile(f'{dfname}-vocab.json'):
            opt['bpe_vocab'] = f'{dfname}-vocab.json'

    try:
        from tokenizers import ByteLevelBPETokenizer
    except ImportError:
        raise ImportError(
            'Please install HuggingFace tokenizer with: pip install tokenizers'
        )

    if self.lower:
        warn_once('Are you sure you want to lower case your BPE dictionary?')

    if self.maxtokens > 0 or self.minfreq > 0:
        raise ValueError(
            'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
            ' (no --dict-minfreq or --dict-maxtokens).'
        )

    if 'bpe_vocab' not in opt:
        raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
    if 'bpe_merge' not in opt:
        raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

    self.vocab_path = opt['bpe_vocab']
    self.merge_path = opt['bpe_merge']

    if not self.vocab_path or not self.merge_path:
        raise IOError(
            '--bpe-vocab and --bpe-merge are mandatory with '
            '--dict-tokenizer bytelevelbpe'
        )

    if not os.path.isfile(self.vocab_path):
        raise IOError(
            f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
        )
    if not os.path.isfile(self.merge_path):
        raise IOError(
            f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
        )

    self.tokenizer = ByteLevelBPETokenizer(
        self.vocab_path, self.merge_path, self.add_prefix_space
    )
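This constructor belongs to a dictionary-agent class in a ParlAI-style codebase, so it cannot run standalone. A minimal sketch of the opt mapping it expects, with key names taken from the checks above and file paths that are assumptions:

opt = {
    'bpe_vocab': 'gpt2-vocab.json',   # assumed pretrained vocab path
    'bpe_merge': 'gpt2-merges.txt',   # assumed pretrained merges path
    'bpe_add_prefix_space': True,     # HF default, per the comment in the code
}
# agent = ByteLevelBPEDict(opt)            # hypothetical enclosing class
# ids = agent.tokenizer.encode('hi').ids   # underlying tokenizers API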