

Python BertTokenizer.from_pretrained Method Code Examples

This article collects typical usage examples of the Python method pytorch_transformers.tokenization_bert.BertTokenizer.from_pretrained. If you are unsure what BertTokenizer.from_pretrained does, how to call it, or what real-world usage looks like, the selected code examples below may help. You can also explore further usage examples of the containing class, pytorch_transformers.tokenization_bert.BertTokenizer.


The following presents 15 code examples of BertTokenizer.from_pretrained, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
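Before the individual examples, here is a minimal, self-contained sketch of the most common call pattern. It is illustrative only: the checkpoint name "bert-base-uncased" and the sample sentence are arbitrary choices, not taken from the examples below.

# Minimal usage sketch for BertTokenizer.from_pretrained
# (checkpoint name and sample text are illustrative assumptions).
from pytorch_transformers.tokenization_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokens = tokenizer.tokenize(text)                    # WordPiece tokens
token_ids = tokenizer.convert_tokens_to_ids(tokens)  # vocabulary ids
print(tokens)
print(token_ids)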

Example 1: bertForPreTraining

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 25, Source: bert_hubconf.py

Example 2: __init__

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self, params):
        super(BiEncoderModule, self).__init__()
        ctxt_bert = BertModel.from_pretrained(params["bert_model"])
        cand_bert = BertModel.from_pretrained(params["bert_model"])
        self.context_encoder = BertEncoder(
            ctxt_bert,
            params["out_dim"],
            layer_pulled=params["pull_from_layer"],
            add_linear=params["add_linear"],
        )
        self.cand_encoder = BertEncoder(
            cand_bert,
            params["out_dim"],
            layer_pulled=params["pull_from_layer"],
            add_linear=params["add_linear"],
        )
        self.config = ctxt_bert.config 
Developer: facebookresearch, Project: BLINK, Lines: 19, Source: biencoder.py
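For context, the constructor above reads only four keys from params. A hypothetical sketch of such a dictionary follows; the concrete values are illustrative assumptions, not BLINK defaults.

# Hypothetical params for BiEncoderModule; all values are illustrative.
params = {
    "bert_model": "bert-base-uncased",  # checkpoint name or local path
    "out_dim": 100,                     # output dimension of each BertEncoder
    "pull_from_layer": -1,              # which hidden layer BertEncoder pulls from
    "add_linear": True,                 # add a linear projection on top
}
biencoder = BiEncoderModule(params)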

Example 3: bertModel

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertModel
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
        model.eval()
        # Predict hidden states features for each layer
        with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 28, Source: bert_hubconf.py

Example 4: bertForNextSentencePrediction

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForNextSentencePrediction(*args, **kwargs):
    """
    BERT model with next sentence prediction head.
    This module comprises the BERT model followed by the next sentence
    classification head.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForNextSentencePrediction
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
        model.eval()
        # Predict the next sentence classification logits
        with torch.no_grad():
                next_sent_classif_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 28, Source: bert_hubconf.py

Example 5: bertForMaskedLM

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        masked_index = 8
        tokenized_text[masked_index] = '[MASK]'
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        model.eval()
        # Predict all tokens
        with torch.no_grad():
                predictions = model(tokens_tensor, segments_tensors)
        predicted_index = torch.argmax(predictions[0, masked_index]).item()
        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        # expected output: 'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 32, Source: bert_hubconf.py

Example 6: bertForMultipleChoice

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel. Note that the multiple choice head is
    only initialized and has to be trained.

    Args:
    num_choices: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        model.eval()
        # Predict the multiple choice logits
        with torch.no_grad():
                multiple_choice_logits = model(tokens_tensor, segments_tensors)
        # Or get the multiple choice loss
        labels = torch.tensor([1])
        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 34, Source: bert_hubconf.py

Example 7: bertForQuestionAnswering

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with a token-level classifier on top of the full sequence of last hidden
    states. Note that the classification head is only initialized
    and has to be trained.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForQuestionAnswering
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
        model.eval()
        # Predict the start and end positions logits
        with torch.no_grad():
                start_logits, end_logits = model(tokens_tensor, segments_tensors)
        # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
        # set model.train() before if training this loss
        total_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 33, Source: bert_hubconf.py

Example 8: bertForTokenClassification

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertForTokenClassification(*args, **kwargs):
    """
    BertForTokenClassification is a fine-tuning model that includes BertModel
    and a token-level classifier on top of the BertModel. Note that the classification
    head is only initialized and has to be trained.

    The token-level classifier is a linear layer that takes as input the last
    hidden state of the sequence.

    Args:
    num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        import torch
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Load bertForTokenClassification
        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
        model.eval()
        # Predict the token classification logits
        with torch.no_grad():
                classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the token classification loss
        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
    """
    model = BertForTokenClassification.from_pretrained(*args, **kwargs)
    return model 
Developer: linhaow, Project: TextClassify, Lines: 37, Source: bert_hubconf.py

Example 9: get_model

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def get_model(parameters):
        model = BertForReranking.from_pretrained(
            parameters["path_to_model"],
            num_labels=parameters["top_k"],
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), "local"),
        )

        if parameters["dataparallel_bert"]:
            model.bert = torch.nn.DataParallel(model.bert)
            print("Data parallel Bert")

        return model 
Developer: facebookresearch, Project: BLINK, Lines: 14, Source: bert_reranking.py

Example 10: get_tokenizer

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def get_tokenizer(parameters):
        tokenizer = BertTokenizer.from_pretrained(
            parameters["path_to_model"], do_lower_case=parameters["lowercase_flag"]
        )
        return tokenizer 
Developer: facebookresearch, Project: BLINK, Lines: 7, Source: bert_reranking.py

Example 11: __init__

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self, params, tokenizer):
        super(CrossEncoderModule, self).__init__()
        model_path = params["bert_model"]
        if params.get("roberta"):
            encoder_model = RobertaModel.from_pretrained(model_path)
        else:
            encoder_model = BertModel.from_pretrained(model_path)
        encoder_model.resize_token_embeddings(len(tokenizer))
        self.encoder = BertEncoder(
            encoder_model,
            params["out_dim"],
            layer_pulled=params["pull_from_layer"],
            add_linear=params["add_linear"],
        )
        self.config = self.encoder.bert_model.config 
Developer: facebookresearch, Project: BLINK, Lines: 17, Source: crossencoder.py

Example 12: __init__

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
        self.client: CoreNLPClient = CoreNLPClient()
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer \
            = BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer 
Developer: yahshibu, Project: nested-ner-tacl2020-transformers, Lines: 9, Source: parse_ace2005.py
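To illustrate what keeping only the basic_tokenizer gives you, the following hedged sketch (checkpoint name and sample sentence are arbitrary) shows that BasicTokenizer splits on whitespace and punctuation but does not apply WordPiece.

# Sketch: basic whitespace/punctuation tokenization without WordPiece
# (checkpoint name and sentence are illustrative assumptions).
from pytorch_transformers.tokenization_bert import BertTokenizer

basic_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased", do_lower_case=False
).basic_tokenizer
print(basic_tokenizer.tokenize("Jim Henson's co-workers were puppeteers."))
# e.g. ['Jim', 'Henson', "'", 's', 'co', '-', 'workers', 'were', 'puppeteers', '.']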

Example 13: __init__

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def __init__(self, config, mode, *args, **params):
        super().__init__(config, mode, *args, **params)

        self.tokenizer = BertTokenizer.from_pretrained(config.get("model", "bert_path"))
        self.max_len = config.getint("data", "max_seq_length")
        self.mode = mode 
Developer: haoxizhong, Project: pytorch-worker, Lines: 8, Source: BasicBertFormatter.py

Example 14: bertTokenizer

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: an optional path to a specific directory to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_wordpiece_only=False
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_wordpiece_only=False
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        import torch
        sentence = 'Hello, World!'
        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        toks = tokenizer.tokenize(sentence)
        # ['Hello', '##,', 'World', '##!']
        ids = tokenizer.convert_tokens_to_ids(toks)
        # [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer 
Developer: linhaow, Project: TextClassify, Lines: 44, Source: bert_hubconf.py
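The keyword arguments documented above can be passed straight through BertTokenizer.from_pretrained. The sketch below is illustrative only; the cache directory path and the chosen values are assumptions, not recommended settings.

# Sketch: forwarding the documented keyword arguments through from_pretrained
# (cache_dir path and all values are illustrative assumptions).
from pytorch_transformers.tokenization_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(
    "bert-base-cased",
    cache_dir="/tmp/bert_cache",   # optional download/cache directory
    do_lower_case=False,           # cased checkpoint, so keep case
    do_basic_tokenize=True,
    max_len=512,                   # truncate encoded sequences to 512 tokens
    never_split=["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"],
)
print(tokenizer.tokenize("Hello, World!"))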

Example 15: main

# Required module import: from pytorch_transformers.tokenization_bert import BertTokenizer [as alias]
# Or: from pytorch_transformers.tokenization_bert.BertTokenizer import from_pretrained [as alias]
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")

    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.15,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")

    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open() as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                 "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                 "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                 "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                 "sections or paragraphs.")

        args.output_dir.mkdir(exist_ok=True)

        if args.num_workers > 1:
            writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch) 
Developer: guoday, Project: CCF-BDCI-Sentiment-Analysis-Baseline, Lines: 63, Source: pregenerate_training_data.py
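The script above requires blank lines between documents in --train_corpus so that negative next-sentence pairs can be drawn from different documents. A hedged sketch of the expected input format follows; the file name and sentences are illustrative.

# Sketch of the expected --train_corpus format: one sentence per line,
# blank lines mark document boundaries (file name and contents are illustrative).
sample_corpus = """\
Jim Henson was a puppeteer.
He created the Muppets.

The quick brown fox jumps over the lazy dog.
It then runs away.
"""
with open("train_corpus.txt", "w", encoding="utf-8") as f:
    f.write(sample_corpus)
# Then, for example:
#   python pregenerate_training_data.py --train_corpus train_corpus.txt \
#          --output_dir training_data/ --bert_model bert-base-uncased --do_lower_case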


Note: The pytorch_transformers.tokenization_bert.BertTokenizer.from_pretrained method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors. Please follow the license of the corresponding project when distributing or using this code, and do not reproduce it without permission.