

Python AutoTokenizer.from_pretrained Method Code Examples

This article collects typical usage examples of the Python method transformers.AutoTokenizer.from_pretrained. If you are unsure what this method does or how to call it, the curated examples below should help; you can also explore further usage examples of its parent class, transformers.AutoTokenizer.


The following presents 15 code examples of the AutoTokenizer.from_pretrained method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
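
Before the collected examples, here is a minimal sketch of the core call, assuming only the published "bert-base-uncased" checkpoint; the sample sentence is illustrative:

from transformers import AutoTokenizer

# Resolve and load the tokenizer matching a checkpoint name or local path,
# downloading and caching the vocabulary files if necessary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode a sentence into token ids; special tokens such as [CLS]/[SEP]
# are added by default.
ids = tokenizer.encode("Hello, world!")
print(ids)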

Example 1: seg

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
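# Note: this snippet also requires `import os`; `seg_file` is presumably
# defined elsewhere in preprocess.py.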
def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, do_lower_case=True
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    ) 
Developer: microsoft, Project: unilm, Lines: 21, Source: preprocess.py

Example 2: get_defaults

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def get_defaults(self, model, tokenizer, framework):
        task_defaults = SUPPORTED_TASKS[self.task]
        if model is None:
            if framework == "tf":
                model = task_defaults["tf"].from_pretrained(task_defaults["default"]["model"]["tf"])
            elif framework == "pt":
                model = task_defaults["pt"].from_pretrained(task_defaults["default"]["model"]["pt"])
            else:
                raise ValueError("Provided framework should be either 'tf' for TensorFlow or 'pt' for PyTorch.")

        if tokenizer is None:
            default_tokenizer = task_defaults["default"]["tokenizer"]
            if isinstance(default_tokenizer, tuple):
                # A tuple has the form (tokenizer name, {kwargs})
                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])
            else:
                tokenizer = AutoTokenizer.from_pretrained(default_tokenizer)

        return model, tokenizer 
Developer: bhoov, Project: exbert, Lines: 21, Source: pipelines.py
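
As a hedged illustration of the tuple convention handled above (the checkpoint name and kwargs here are made up, not read from an actual SUPPORTED_TASKS entry):

from transformers import AutoTokenizer

# A default_tokenizer entry in the (tokenizer name, {kwargs}) form.
default_tokenizer = ("distilbert-base-cased", {"use_fast": True})
tokenizer = AutoTokenizer.from_pretrained(default_tokenizer[0], **default_tokenizer[1])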

Example 3: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(
        self,
        model_path,
        label_path,
        multi_label=False,
        model_type="bert",
        use_fast_tokenizer=True,
        do_lower_case=True,
    ):
        self.model_path = model_path
        self.label_path = label_path
        self.multi_label = multi_label
        self.model_type = model_type
        self.do_lower_case = do_lower_case

        # Use auto-tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path, use_fast=use_fast_tokenizer
        )

        self.learner = self.get_learner() 
Developer: kaushaltrivedi, Project: fast-bert, Lines: 23, Source: prediction.py

Example 4: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self,
                 alias: str,
                 cache_dir: Optional[str] = None,
                 max_len_truncate: int = 500,
                 add_special_tokens: bool = True, **kwargs) -> None:
        """Initialize a pretrained tokenizer.

        Parameters
        ----------
        alias: str
            Alias of a pretrained tokenizer.
        cache_dir: str, optional
            A directory where to cache the downloaded vocabularies.
        max_len_truncate: int, default = 500
            Truncate tokenized sequences to this length. Several
            pretrained models crash on sequences longer than 500
            tokens, hence the default of 500.
        add_special_tokens: bool, optional
            Add the special tokens to the inputs. Default ``True``.

        """
        self._tokenizer = AutoTokenizer.from_pretrained(alias, cache_dir=cache_dir, **kwargs)
        self.max_len_truncate = max_len_truncate
        self.add_special_tokens = add_special_tokens 
Developer: asappresearch, Project: flambe, Lines: 26, Source: field.py

Example 5: test_is_running

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
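# Note: besides AutoTokenizer, this test relies on torch, transformers'
# AutoModelWithLMHead and DataCollatorForLanguageModeling, and on catalyst
# utilities (dl, LanguageModelingDataset, HuggingFaceRunner,
# PerplexityMetricCallback) imported or defined elsewhere in the test
# module; `texts` is fixture data.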
def test_is_running():
    """Test if perplexity is running normal"""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    ) 
Developer: catalyst-team, Project: catalyst, Lines: 22, Source: test_perplexity_callback.py

Example 6: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        from transformers import BertTokenizer, BertForSequenceClassification

        # download the model or load the model path
        path_emotion = download_model('bert.emotion', cache_dir,
                                      process_func=_unzip_process_func,
                                      verbose=verbose)
        path_emotion = os.path.join(path_emotion, 'bert.emotion')
        path_reject = download_model('bert.noemotion', cache_dir,
                                     process_func=_unzip_process_func,
                                     verbose=verbose)
        path_reject = os.path.join(path_reject, 'bert.noemotion')
        # load the models
        self.tokenizer_reject = BertTokenizer.from_pretrained(path_reject)
        self.model_reject = BertForSequenceClassification.from_pretrained(path_reject)
        
        self.tokenizer = BertTokenizer.from_pretrained(path_emotion)
        self.model = BertForSequenceClassification.from_pretrained(path_emotion)
        
        # load the class names mapping
        self.categories = {5: 'Foragt/Modvilje', 2: 'Forventning/Interrese',
                           0: 'Glæde/Sindsro', 3: 'Overasket/Målløs',
                           1: 'Tillid/Accept',
                           4: 'Vrede/Irritation', 6: 'Sorg/trist',
                           7: 'Frygt/Bekymret'} 
Developer: alexandrainst, Project: danlp, Lines: 27, Source: bert_models.py

Example 7: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        sess_options = rt.SessionOptions()

        self.model_dir = glob.glob(os.path.join(self.model_dir, '*.onnx'))[0]

        # Set graph optimization level to ORT_ENABLE_EXTENDED to enable bert optimization.
        sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

        # To enable model serialization and store the optimized graph to desired location.
        sess_options.optimized_model_filepath = self.model_dir
        self.session = rt.InferenceSession(self.model_dir, sess_options)
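        # NOTE: 'albert-base-uncased' below is not a published Hugging Face
        # checkpoint name (the standard ALBERT base checkpoints are
        # 'albert-base-v1' / 'albert-base-v2'), so the albert branch would
        # fail against the public model hub unless such a tokenizer is
        # available locally.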
        if 'albert' in self.model_dir:
            self.tokenizer = AutoTokenizer.from_pretrained('albert-base-uncased')
        else:
            self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') 
Developer: koursaros-ai, Project: nboost, Lines: 18, Source: onnxbert.py

Example 8: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self,
                 model_dir: str = 'nboost/pt-tinybert-msmarco',
                 verbose: bool = defaults.verbose,
                 max_seq_len: int = defaults.max_seq_len,
                 **kwargs):
        super().__init__(**kwargs)
        self.logger = set_logger(model_dir, verbose=verbose)
        self.max_seq_len = max_seq_len

        self.logger.info('Loading from checkpoint %s' % model_dir)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if self.device == torch.device("cpu"):
            self.logger.info("RUNNING ON CPU")
        else:
            self.logger.info("RUNNING ON CUDA")
            torch.cuda.synchronize(self.device)

        self.rerank_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

        self.rerank_model.to(self.device, non_blocking=True) 
Developer: koursaros-ai, Project: nboost, Lines: 24, Source: transformers.py

Example 9: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(
        self,
        model_name="bert-base-cased",
        to_lower=False,
        custom_tokenize=None,
        cache_dir=".",
    ):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            do_lower_case=to_lower,
            cache_dir=cache_dir,
            output_loading_info=False,
        )
        self.do_lower_case = to_lower
        self.custom_tokenize = custom_tokenize 
Developer: microsoft, Project: nlp-recipes, Lines: 18, Source: question_answering.py

Example 10: __init__

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def __init__(self, model_name_or_path: str, max_seq_length: int = 128, model_args: Dict = {}, cache_dir: Optional[str] = None ):
        super(Transformer, self).__init__()
        self.config_keys = ['max_seq_length']
        self.max_seq_length = max_seq_length

        config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
        self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir) 
Developer: UKPLab, Project: sentence-transformers, Lines: 10, Source: Transformer.py
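
As a hedged sketch of how such a module is typically composed into a full sentence encoder (the models.Transformer / models.Pooling / SentenceTransformer API is assumed from the sentence-transformers library; the checkpoint name is illustrative):

from sentence_transformers import SentenceTransformer, models

# Token embeddings from the wrapped transformer, mean pooling on top.
word_embedding = models.Transformer("bert-base-uncased", max_seq_length=128)
pooling = models.Pooling(word_embedding.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding, pooling])

embeddings = model.encode(["An example sentence."])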

Example 11: run

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def run(self):
        from transformers import AutoModel, AutoTokenizer

        AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
        AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) 
Developer: bhoov, Project: exbert, Lines: 7, Source: download.py

Example 12: tokenizer

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def tokenizer(self):
        if self._tokenizer is None:
            self._tokenizer = AutoTokenizer.from_pretrained(self.pretrained_name)
        return self._tokenizer 
Developer: paperswithcode, Project: axcell, Lines: 6, Source: transfo_experiment.py

Example 13: train_model

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def train_model(self, data: TransfoDatabunch):
        self.set_seed("class")
        self.train_started = time.time()
        num_labels = data.num_labels
        config = AutoConfig.from_pretrained(self.pretrained_name, num_labels=num_labels) #, finetuning_task=args.task_name
        model = AutoModelForSequenceClassification.from_pretrained(self.pretrained_name, config=config)
        train(self, data.train_ds, data.valid_ds, model.to(self.device), self._tokenizer)
        model.to("cpu")
        return model 
Developer: paperswithcode, Project: axcell, Lines: 11, Source: transfo_experiment.py

Example 14: test_tokenizer_tokenizer

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def test_tokenizer_tokenizer():
    """Test initialization with tokenizer"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    assert dataset[0] is not None
    assert len(dataset) == 2 
Developer: catalyst-team, Project: catalyst, Lines: 8, Source: test_language_modeling_dataset.py

Example 15: test_exception_with_sort

# Required import: from transformers import AutoTokenizer [as alias]
# Or: from transformers.AutoTokenizer import from_pretrained [as alias]
def test_exception_with_sort():
    """Test lazy=True sort=True case"""
    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = LanguageModelingDataset(  # noqa: F841
        texts, tok, lazy=True, sort=True
    ) 
Developer: catalyst-team, Project: catalyst, Lines: 8, Source: test_language_modeling_dataset.py


Note: The transformers.AutoTokenizer.from_pretrained examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are drawn from open-source projects contributed by their developers; copyright belongs to the original authors, and distribution and use must follow the corresponding project's License. Do not reproduce without permission.