This article collects typical usage examples of the Python method transformers.AutoTokenizer.from_pretrained. If you have been asking yourself how AutoTokenizer.from_pretrained works, what it is used for, or what real code calling it looks like, the hand-picked examples below should help. You may also want to look at the usage examples for the class it belongs to, transformers.AutoTokenizer.
The following shows 15 code examples of the AutoTokenizer.from_pretrained method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
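Before the examples, here is the method at its most basic (a minimal sketch; the checkpoint name is chosen for illustration). from_pretrained downloads the vocabulary files for the named checkpoint, or loads them from the local cache, and returns the matching concrete tokenizer class:

from transformers import AutoTokenizer

# AutoTokenizer inspects the checkpoint and returns e.g. a BertTokenizer here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.tokenize("Hello, world!"))  # subword tokens
print(tokenizer.encode("Hello, world!"))    # token ids, including special tokens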
Example 1: seg
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
import os

def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=True
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    )
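The helper seg_file is defined elsewhere in the originating script; a plausible sketch (assuming CoNLL-style input with one token per line and blank lines between sentences) inserts a sentence break whenever the running subword count would exceed max_len:

def seg_file(file_path, tokenizer, max_len):
    # Hypothetical helper: writes to the path with the ".tmp" suffix stripped,
    # emitting a blank line once the accumulated subword count would pass max_len.
    subword_len_counter = 0
    output_path = file_path[: -len(".tmp")]
    with open(file_path, "r", encoding="utf8") as f_in, \
         open(output_path, "w", encoding="utf8") as f_out:
        for line in f_in:
            line = line.rstrip()
            if not line:
                f_out.write("\n")
                subword_len_counter = 0
                continue
            token = line.split()[0]
            current_subwords_len = len(tokenizer.tokenize(token))
            if subword_len_counter + current_subwords_len > max_len:
                f_out.write("\n")
                subword_len_counter = 0
            subword_len_counter += current_subwords_len
            f_out.write(line + "\n")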
Example 2: get_defaults
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def get_defaults(self, model, tokenizer, framework):
    task_defaults = SUPPORTED_TASKS[self.task]
    if model is None:
        if framework == "tf":
            model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
        elif framework == "pt":
            model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
        else:
            raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")
    if tokenizer is None:
        default_tokenizer = task_defaults["default"]["tokenizer"]
        if isinstance(default_tokenizer, tuple):
            # For a tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)
    return model, tokenizer
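For context, a SUPPORTED_TASKS entry consumed by get_defaults might look like the following (a sketch; the exact registry in transformers differs across versions). The point is the tokenizer default: it is either a plain checkpoint name or a (name, kwargs) tuple, which is exactly the branch the isinstance check above handles:

from transformers import (AutoModelForSequenceClassification,
                          TFAutoModelForSequenceClassification)

SUPPORTED_TASKS = {
    "sentiment-analysis": {
        "pt": AutoModelForSequenceClassification,   # model *classes*, not instances
        "tf": TFAutoModelForSequenceClassification,
        "default": {
            "model": {
                "pt": "distilbert-base-uncased-finetuned-sst-2-english",
                "tf": "distilbert-base-uncased-finetuned-sst-2-english",
            },
            # Either a plain name, or a (name, kwargs) tuple:
            "tokenizer": ("distilbert-base-uncased", {"use_fast": True}),
        },
    },
}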
Example 3: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(
    self,
    model_path,
    label_path,
    multi_label=False,
    model_type="bert",
    use_fast_tokenizer=True,
    do_lower_case=True,
):
    self.model_path = model_path
    self.label_path = label_path
    self.multi_label = multi_label
    self.model_type = model_type
    self.do_lower_case = do_lower_case
    # Use the auto-tokenizer so the right subclass is picked from the model path
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.model_path, use_fast=use_fast_tokenizer
    )
    self.learner = self.get_learner()
Example 4: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self,
             alias: str,
             cache_dir: Optional[str] = None,
             max_len_truncate: int = 500,
             add_special_tokens: bool = True, **kwargs) -> None:
    """Initialize a pretrained tokenizer.

    Parameters
    ----------
    alias : str
        Alias of a pretrained tokenizer.
    cache_dir : str, optional
        Directory in which to cache the downloaded vocabularies.
    max_len_truncate : int, default = 500
        Truncates the length of the tokenized sequence.
        Because several pretrained models crash on sequences
        longer than 500, it defaults to 500.
    add_special_tokens : bool, optional
        Add the special tokens to the inputs. Default ``True``.
    """
    self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
    self.max_len_truncate = max_len_truncate
    self.add_special_tokens = add_special_tokens
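Since **kwargs are forwarded straight to from_pretrained, extra loading options pass through unchanged; for example (class name hypothetical, as the snippet only shows __init__):

wrapper = PretrainedTokenizer(alias="bert-base-uncased", use_fast=True)
ids = wrapper._tokenizer.encode("some text",
                                add_special_tokens=wrapper.add_special_tokens)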
Example 5: test_is_running
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def test_is_running():
    """Test that perplexity training runs normally."""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    )
Example 6: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    from transformers import BertTokenizer, BertForSequenceClassification

    # Download the models, or load them from the model path
    path_emotion = download_model('bert.emotion', cache_dir,
                                  process_func=_unzip_process_func,
                                  verbose=verbose)
    path_emotion = os.path.join(path_emotion, 'bert.emotion')
    path_reject = download_model('bert.noemotion', cache_dir,
                                 process_func=_unzip_process_func,
                                 verbose=verbose)
    path_reject = os.path.join(path_reject, 'bert.noemotion')
    # Load the models
    self.tokenizer_reject = BertTokenizer.from_pretrained(path_reject)
    self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
    self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
    self.model = BertForSequenceClassification.from_pretrained(path_emotion)
    # Load the class-name mapping
    self.categories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                       0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                       1: 'Tillid/Accept',
                       4: 'Vrede/Irritation', 6: 'Sorg/trist',
                       7: 'Frygt/Bekymret'}
Example 7: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    sess_options = rt.SessionOptions()
    self.model_dir = glob.glob(os.path.join(self.model_dir, '*.onnx'))[0]
    # Set graph optimization level to ORT_ENABLE_EXTENDED to enable BERT optimization.
    sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    # Enable model serialization: store the optimized graph at the desired location.
    sess_options.optimized_model_filepath = self.model_dir
    self.session = rt.InferenceSession(self.model_dir, sess_options)
    if 'albert' in self.model_dir:
        self.tokenizer = AutoTokenizer.from_pretrained('albert-base-uncased')
    else:
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
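With the session and tokenizer in place, a later method of the same class might run inference like this (a sketch; the feed names must match those used when the model was exported to ONNX):

import numpy as np

encoded = self.tokenizer.encode_plus("an example sentence",
                                     max_length=128, pad_to_max_length=True)
ort_inputs = {
    "input_ids": np.array([encoded["input_ids"]], dtype=np.int64),
    "attention_mask": np.array([encoded["attention_mask"]], dtype=np.int64),
    "token_type_ids": np.array([encoded["token_type_ids"]], dtype=np.int64),
}
logits = self.session.run(None, ort_inputs)[0]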
Example 8: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self,
             model_dir: str = 'nboost/pt-tinybert-msmarco',
             verbose: bool = defaults.verbose,
             max_seq_len: int = defaults.max_seq_len,
             **kwargs):
    super().__init__(**kwargs)
    self.logger = set_logger(model_dir, verbose=verbose)
    self.max_seq_len = max_seq_len
    self.logger.info('Loading from checkpoint %s' % model_dir)
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if self.device == torch.device("cpu"):
        self.logger.info("RUNNING ON CPU")
    else:
        self.logger.info("RUNNING ON CUDA")
        torch.cuda.synchronize(self.device)
    self.rerank_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)
    self.rerank_model.to(self.device, non_blocking=True)
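Scoring a query–passage pair with the loaded model might then look like this (a sketch; argument names such as truncation vary slightly across transformers versions):

inputs = self.tokenizer.encode_plus("a query", "a candidate passage",
                                    max_length=self.max_seq_len,
                                    truncation=True,
                                    return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in inputs.items()}
with torch.no_grad():
    logits = self.rerank_model(**inputs)[0]  # (1, num_labels) relevance logits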
Example 9: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(
    self,
    model_name="bert-base-cased",
    to_lower=False,
    custom_tokenize=None,
    cache_dir=".",
):
    self.model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        do_lower_case=to_lower,
        cache_dir=cache_dir,
        output_loading_info=False,
    )
    self.do_lower_case = to_lower
    self.custom_tokenize = custom_tokenize
Example 10: __init__
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             model_args: Dict = {}, cache_dir: Optional[str] = None):
    super(Transformer, self).__init__()
    self.config_keys = ['max_seq_length']
    self.max_seq_length = max_seq_length
    config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
    self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
    self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
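Inside the module's forward pass, the tokenizer and wrapped model are then used together roughly like this (a sketch of the idea, not the class's actual forward; calling the tokenizer directly assumes transformers v3+, where tokenizers are callable):

features = self.tokenizer(["A first sentence", "A second one"],
                          padding=True, truncation=True,
                          max_length=self.max_seq_length, return_tensors="pt")
output_states = self.auto_model(**features)
token_embeddings = output_states[0]  # (batch, seq_len, hidden): last hidden state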
Example 11: run
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def run(self):
    from transformers import AutoModel, AutoTokenizer

    AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
    AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
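Note that passing force_download=True makes both from_pretrained calls re-fetch the files even when a cached copy already exists, so this run method doubles as a cache-refresh command.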
Example 12: tokenizer
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def tokenizer(self):
    if self._tokenizer is None:
        self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name)
    return self._tokenizer
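Fully spelled out, this lazy-loading pattern looks like the following (a self-contained sketch; the original presumably decorates the method with @property):

from transformers import AutoTokenizer

class TokenizerHolder:
    def __init__(self, pretrained_name: str):
        self.pretrained_name = pretrained_name
        self._tokenizer = None  # loaded on first access, then cached

    @property
    def tokenizer(self):
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name)
        return self._tokenizer

holder = TokenizerHolder("bert-base-uncased")
ids = holder.tokenizer.encode("lazy loading")  # triggers the download only once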
Example 13: train_model
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def train_model(self, data: TransfoDatabunch):
    self.set_seed("class")
    self.train_started = time.time()
    num_labels = data.num_labels
    config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels)  # , finetuning_task=args.task_name
    model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config)
    train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer)
    model.to("cpu")
    return model
Example 14: test_tokenizer_tokenizer
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def test_tokenizer_tokenizer():
    """Test initialization with a tokenizer."""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    assert dataset[0] is not None
    assert len(dataset) == 2
Example 15: test_exception_with_sort
# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def test_exception_with_sort():
    """Test the lazy=True, sort=True case."""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(  # noqa: F841
        texts, tok, lazy=True, sort=True
    )