This page collects typical usage examples of transformers.PreTrainedTokenizer in Python. If you have been wondering what transformers.PreTrainedTokenizer is for, how it is called, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the transformers module it belongs to.
7 code examples of transformers.PreTrainedTokenizer are shown below, sorted by popularity by default.
Example 1: __init__
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
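# Also needed by this snippet: from typing import List, Optional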
def __init__(self, lang: str = 'en', n_components: Optional[int] = None,
             text_columns: List[str] = None, pooling_strategy: str = 'reduce_mean',
             use_cuda: bool = False, tokenizer: transformers.PreTrainedTokenizer = None,
             model=None, return_same_type: bool = True, column_format: str = '{col}_{idx}'):
    if tokenizer is not None:
        assert model is not None
        self.tokenizer = tokenizer
        self.model = model
    elif lang == 'en':
        self.tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = transformers.BertModel.from_pretrained('bert-base-uncased')
    elif lang == 'jp':
        self.tokenizer = transformers.BertJapaneseTokenizer.from_pretrained('bert-base-japanese-whole-word-masking')
        self.model = transformers.BertModel.from_pretrained('bert-base-japanese-whole-word-masking')
    else:
        raise ValueError('Specified language type ({}) is invalid.'.format(lang))
    self.lang = lang
    self.n_components = n_components
    self.text_columns = text_columns
    self.pooling_strategy = pooling_strategy
    self.use_cuda = use_cuda
    self.return_same_type = return_same_type
    self.svd = {}
    self.column_format = column_format
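Usage sketch: this __init__ belongs to a sentence-vectorizer class whose name the snippet does not show; BertVectorizer below is a hypothetical stand-in for it.

import transformers

vectorizer = BertVectorizer(lang='en', text_columns=['title', 'body'])  # defaults to English BERT
# Or inject a custom tokenizer/model pair (both must be passed together):
tok = transformers.BertTokenizer.from_pretrained('bert-base-cased')
mdl = transformers.BertModel.from_pretrained('bert-base-cased')
vectorizer = BertVectorizer(tokenizer=tok, model=mdl)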
Example 2: tokenizer_lowercases
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
def tokenizer_lowercases(tokenizer: PreTrainedTokenizer) -> bool:
    # Huggingface tokenizers have different ways of remembering whether they lowercase or not.
    # Detecting it this way seems like the least brittle way to do it.
    tokenized = tokenizer.tokenize("A")  # Use a single character that won't be cut into word pieces.
    detokenized = " ".join(tokenized)
    return "a" in detokenized
Example 3: get_tokenizer
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
def get_tokenizer(model_name: str, **kwargs) -> transformers.PreTrainedTokenizer:
    cache_key = (model_name, frozenset(kwargs.items()))
    global _tokenizer_cache
    tokenizer = _tokenizer_cache.get(cache_key, None)
    if tokenizer is None:
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, **kwargs)
        _tokenizer_cache[cache_key] = tokenizer
    return tokenizer
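Usage sketch: the snippet implies a module-level _tokenizer_cache dict that is not shown; assuming it is initialized as below, repeated calls with the same arguments return the same object.

_tokenizer_cache = {}  # module-level cache assumed by get_tokenizer above

tok1 = get_tokenizer('bert-base-uncased')
tok2 = get_tokenizer('bert-base-uncased')
assert tok1 is tok2  # the second call is served from the cache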
Example 4: __init__
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
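# Also needed by this snippet: import numpy as np; from typing import List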
def __init__(self, real_texts: List[str], fake_texts: List[str], tokenizer: PreTrainedTokenizer,
             max_sequence_length: int = None, min_sequence_length: int = None, epoch_size: int = None,
             token_dropout: float = None, seed: int = None):
    self.real_texts = real_texts
    self.fake_texts = fake_texts
    self.tokenizer = tokenizer
    self.max_sequence_length = max_sequence_length
    self.min_sequence_length = min_sequence_length
    self.epoch_size = epoch_size
    self.token_dropout = token_dropout
    self.random = np.random.RandomState(seed)
Example 5: mask_tokens
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
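# Also needed by this snippet: import torch; from typing import Tuple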
def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original."""
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token, which is necessary for masked language modeling. "
            "Set 'mlm' to False in args if you want to use this tokenizer."
        )
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, which defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens
    if args.model_type == "electra" and False:  # NOTE: this branch is deliberately disabled via 'and False'
        # For ELECTRA, we replace all masked input tokens with tokenizer.mask_token
        inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    else:
        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
        # 10% of the time, we replace masked input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        # The remaining 10% of the time, we keep the masked input tokens unchanged
    return inputs, labels
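Usage sketch: a minimal driver for mask_tokens, assuming a transformers version where tokenizers are callable (v3+) and an args namespace carrying the only two fields the function reads (mlm_probability and model_type). Note that inputs is also modified in place.

import transformers
from types import SimpleNamespace

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
args = SimpleNamespace(mlm_probability=0.15, model_type='bert')
batch = tokenizer(['hello world', 'masked language modeling'], padding=True, return_tensors='pt')
inputs, labels = mask_tokens(batch['input_ids'], tokenizer, args)
# labels is -100 everywhere except at the sampled (masked) positions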
Example 6: check_tokenizer_from_pretrained
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
def check_tokenizer_from_pretrained(self, tokenizer_class):
    s3_models = list(tokenizer_class.max_model_input_sizes.keys())
    for model_name in s3_models[:1]:
        tokenizer = tokenizer_class.from_pretrained(model_name)
        self.assertIsNotNone(tokenizer)
        self.assertIsInstance(tokenizer, tokenizer_class)
        self.assertIsInstance(tokenizer, PreTrainedTokenizer)
        for special_tok in tokenizer.all_special_tokens:
            self.assertIsInstance(special_tok, str)
            special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
            self.assertIsInstance(special_tok_id, int)
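Context sketch: the self.assert* calls mean this helper lives on a unittest.TestCase subclass; a hypothetical driver might look like the following.

import unittest
import transformers

class TokenizerCommonTest(unittest.TestCase):
    # ... check_tokenizer_from_pretrained from above would be defined here ...

    def test_bert_tokenizer(self):
        self.check_tokenizer_from_pretrained(transformers.BertTokenizer)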
Example 7: __init__
# Required import: import transformers [as alias]
# Or: from transformers import PreTrainedTokenizer [as alias]
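# Also needed by this snippet: from transformers import AutoTokenizer; from tqdm import tqdm; from typing import Iterable, Union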
def __init__(
    self,
    texts: Iterable[str],
    tokenizer: Union[str, PreTrainedTokenizer],
    max_seq_length: int = None,
    sort: bool = True,
    lazy: bool = False,
):
    """
    Args:
        texts (Iterable): Iterable object with text
        tokenizer (str or tokenizer): pre-trained
            huggingface tokenizer or model name
        max_seq_length (int): max sequence length to tokenize
        sort (bool): If True then sort all sequences by length
            for efficient padding
        lazy (bool): If True then tokenize and encode sequences
            in the __getitem__ method; otherwise they are tokenized
            in __init__. Note that sorting is unavailable when
            lazy is set to True
    """
    if sort and lazy:
        raise Exception(
            "lazy is set to True so we can't sort"
            " sequences by length.\n"
            "You should set sort=False and lazy=True"
            " if you want to encode text in the __getitem__ method"
        )
    if isinstance(tokenizer, str):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    elif isinstance(
        tokenizer, transformers.tokenization_utils.PreTrainedTokenizer
    ):
        self.tokenizer = tokenizer
    else:
        raise TypeError(
            "tokenizer argument should be a model name"
            + " or huggingface PreTrainedTokenizer"
        )
    self.max_seq_length = max_seq_length
    self.lazy = lazy
    if lazy:
        self.texts = texts
    else:
        pbar = tqdm(texts, desc="tokenizing texts")
        self.encoded = [
            self.tokenizer.encode(text, max_length=max_seq_length)
            for text in pbar
        ]
        if sort:
            self.encoded.sort(key=len)
    self.length = len(texts)
    self._getitem_fn = (
        self._getitem_lazy if lazy else self._getitem_encoded
    )
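Usage sketch: this __init__ belongs to a text dataset class whose name the snippet does not show; TextDataset below is a hypothetical stand-in. The tokenizer can be passed either as a model name or as an instance.

import transformers

texts = ['first document', 'second, slightly longer document']

# Pass a model name: AutoTokenizer.from_pretrained is called internally.
eager_ds = TextDataset(texts, 'bert-base-uncased', max_seq_length=128)

# Or pass an already-constructed tokenizer; lazy=True requires sort=False.
tok = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')
lazy_ds = TextDataset(texts, tok, sort=False, lazy=True)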