

Python BertTokenizer.from_pretrained Method: Code Examples

This article collects typical usage examples of the Python method pytorch_pretrained_bert.tokenization.BertTokenizer.from_pretrained. If you are wondering what BertTokenizer.from_pretrained does, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of the containing class, pytorch_pretrained_bert.tokenization.BertTokenizer.


The sections below present 15 code examples of BertTokenizer.from_pretrained, ordered by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
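Before the examples, a minimal usage sketch may help. It assumes pytorch_pretrained_bert is installed and that the "bert-base-uncased" vocabulary can be downloaded or is already cached; the sample sentence is purely illustrative.

# Minimal sketch: load a pretrained vocabulary and tokenize a sentence.
from pytorch_pretrained_bert.tokenization import BertTokenizer

# Downloads (or reads from the local cache) the WordPiece vocabulary for the named model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

tokens = tokenizer.tokenize("Hello, world!")          # WordPiece tokens
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # vocabulary ids
print(tokens)
print(input_ids)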

Example 1: __init__

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, language=Language.ENGLISH, num_labels=2, cache_dir="."):
        """Initializes the classifier and the underlying pretrained model.

        Args:
            language (Language, optional): The pretrained model's language.
                                           Defaults to Language.ENGLISH.
            num_labels (int, optional): The number of unique labels in the
                training data. Defaults to 2.
            cache_dir (str, optional): Location of BERT's cache directory.
                Defaults to ".".
        """
        if num_labels < 2:
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
        self.cache_dir = cache_dir

        # create classifier
        self.model = BertForSequenceClassification.from_pretrained(
            language, cache_dir=cache_dir, num_labels=num_labels
        )
        self.has_cuda = self.cuda  # `cuda` is expected to be a property defined elsewhere on this class
Developer: interpretml, Project: interpret-text, Lines: 25, Source: utils_bert.py

Example 2: load_model

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def load_model(config, num_train_steps, label_list):
    # device = torch.device(torch.cuda.is_available())
    device = torch.device("cuda") 
    n_gpu = torch.cuda.device_count()
    model = BertTagger(config, num_labels=len(label_list)) 
    # model = BertForTagger.from_pretrained(config.bert_model, num_labels=13)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    # optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate) 
    optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion, t_total=num_train_steps, max_grad_norm=config.clip_grad) 

    return model, optimizer, device, n_gpu 
Developer: pranciskus, Project: mrc-for-flat-nested-ner, Lines: 25, Source: run_bert_tagger.py

Example 3: __init__

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, archive_file, model_file=None, use_cuda=False):
        if not os.path.isfile(archive_file):
            if not model_file:
                raise Exception("No model for DA-predictor is specified!")
            archive_file = cached_path(model_file)
        model_dir = os.path.dirname(os.path.abspath(__file__))
        if not os.path.exists(os.path.join(model_dir, 'checkpoints')):
            archive = zipfile.ZipFile(archive_file, 'r')
            archive.extractall(model_dir)
        
        load_dir = os.path.join(model_dir, "checkpoints/predictor/save_step_15120")
        if not os.path.exists(load_dir):
            archive = zipfile.ZipFile(f'{load_dir}.zip', 'r')
            archive.extractall(os.path.dirname(load_dir))
        
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)
        self.max_seq_length = 256
        self.domain = 'restaurant'
        self.model = BertForSequenceClassification.from_pretrained(load_dir, 
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(-1)), num_labels=44)
        self.device = 'cuda' if use_cuda else 'cpu'
        self.model.to(self.device) 
Developer: ConvLab, Project: ConvLab, Lines: 24, Source: predictor.py

Example 4: __init__

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, pretrained_model: str,
                 requires_grad: bool = False,
                 dropout: float = 0.1,
                 layer_dropout: float = 0.1,
                 combine_layers: str = "mix") -> None:
        model = BertModel.from_pretrained(pretrained_model)

        for param in model.parameters():
            param.requires_grad = requires_grad

        super().__init__(bert_model=model,
                         layer_dropout=layer_dropout,
                         combine_layers=combine_layers)

        self.model = model
        self.dropout = dropout
        self.set_dropout(dropout) 
Developer: Hyperparticle, Project: udify, Lines: 19, Source: bert_pretrained.py

Example 5: __init__

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, conversations, labels, conversation_length, sentence_length, data=None):

        # [total_data_size, max_conversation_length, max_sentence_length]
        # tokenized raw text of sentences
        self.conversations = conversations
        self.labels = labels

        # conversation length of each batch
        # [total_data_size]
        self.conversation_length = conversation_length

        # list of length of sentences
        # [total_data_size, max_conversation_length]
        self.sentence_length = sentence_length
        self.data = data
        self.len = len(conversations)

        # Prepare for BERT
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
        self.prepare_BERT() 
Developer: declare-lab, Project: conv-emotion, Lines: 22, Source: data_loader.py

Example 6: __init__

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def __init__(self, path, batch_size=32, gpu=True, labels=None,
                 has_labels=True, is_train=True, dropout_w=0.005, maxlen=128):
        self.batch_size = batch_size
        self.has_labels = has_labels
        self.gpu = gpu
        self.labels = labels
        self.is_train = is_train
        # Explicit cache dir required for some reason -- default doesn't exist in the docker
        # container, maybe?
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='/tmp')
        self.data = self.load(path, maxlen, has_labels)
        if self.is_train:
            indices = list(range(len(self.data)))
            random.shuffle(indices)
            self.data = [self.data[i] for i in indices]  # keep the shuffled order
        self.data = GobbliBatchGen.make_batches(self.data, batch_size)
        self.offset = 0
        self.dropout_w = dropout_w 
Developer: RTIInternational, Project: gobbli, Lines: 20, Source: gobbli_batcher.py

Example 7: main

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    args.use_gpu = use_gpu

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        cudnn.benchmark = True
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    bert_model = BertModel.from_pretrained("bert-base-chinese")

    if use_gpu:
        bert_model = bert_model.cuda()

    processor = Preprocess(args, tokenizer, bert_model)
    processor.do_preprocess() 
Developer: tracy-talent, Project: curriculum, Lines: 23, Source: preprocess_embedding.py

Example 8: _prepare_model

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def _prepare_model(self) -> BertPreTrainedModel:
        if self.args.cache_dir:
            cache_dir = self.args.cache_dir
        else:
            cache_dir = os.path.join(
                str(PYTORCH_PRETRAINED_BERT_CACHE),
                f"distributed_{self.args.local_rank}",
            )
        model = BertForSequenceClassification.from_pretrained(
            self.args.bert_model, cache_dir=cache_dir, num_labels=self.num_labels
        )
        model.to(self.device)
        return model 
Developer: microsoft, Project: botbuilder-python, Lines: 15, Source: bert_train_eval.py

Example 9: bertTokenizer

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertTokenizer(*args, **kwargs):
    """
    Instantiate a BertTokenizer from a pre-trained/customized vocab file
    Args:
    pretrained_model_name_or_path: Path to pretrained model archive
                                   or one of pre-trained vocab configs below.
                                       * bert-base-uncased
                                       * bert-large-uncased
                                       * bert-base-cased
                                       * bert-large-cased
                                       * bert-base-multilingual-uncased
                                       * bert-base-multilingual-cased
                                       * bert-base-chinese
    Keyword args:
    cache_dir: an optional path to a specific directory to download and cache
               the pre-trained model weights.
               Default: None
    do_lower_case: Whether to lower case the input.
                   Only has an effect when do_wordpiece_only=False
                   Default: True
    do_basic_tokenize: Whether to do basic tokenization before wordpiece.
                       Default: True
    max_len: An artificial maximum length to truncate tokenized sequences to;
             Effective maximum length is always the minimum of this
             value (if specified) and the underlying BERT model's
             sequence length.
             Default: None
    never_split: List of tokens which will never be split during tokenization.
                 Only has an effect when do_wordpiece_only=False
                 Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]

    Example:
        >>> sentence = 'Hello, World!'
        >>> tokenizer = torch.hub.load('ailzhang/pytorch-pretrained-BERT:hubconf', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False, force_reload=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
        [8667, 28136, 1291, 28125]
    """
    tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
    return tokenizer 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 43, Source: hubconf.py

Example 10: bertModel

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 10, Source: hubconf.py
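For context, here is a short, hedged sketch of how a model loaded this way might be run end to end. It assumes the pytorch_pretrained_bert API, in which BertModel's forward pass returns the per-layer hidden states and a pooled [CLS] output; the input sentence is only illustrative.

# Illustrative sketch only; assumes pytorch_pretrained_bert is installed and weights are downloadable.
import torch
from pytorch_pretrained_bert.modeling import BertModel
from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

tokens = tokenizer.tokenize("[CLS] Hello, world! [SEP]")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    # encoded_layers: one hidden-state tensor per layer; pooled_output: [CLS] summary vector.
    encoded_layers, pooled_output = model(input_ids, output_all_encoded_layers=True)
print(len(encoded_layers), pooled_output.shape)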

Example 11: bertForNextSentencePrediction

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForNextSentencePrediction(*args, **kwargs):
    """
    BERT model with next sentence prediction head.
    This module comprises the BERT model followed by the next sentence
    classification head.
    """
    model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
    return model 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 10, Source: hubconf.py

Example 12: bertForPreTraining

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForPreTraining(*args, **kwargs):
    """
    BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads
        - the masked language modeling head, and
        - the next sentence classification head.
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
    return model 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 11, Source: hubconf.py

Example 13: bertForMaskedLM

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 9, Source: hubconf.py
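As a worked illustration of the masked-LM head, here is a sketch under the assumption that the pytorch_pretrained_bert API is used; the example sentence and the predicted token are hypothetical.

# Illustrative sketch only; assumes pytorch_pretrained_bert is installed and weights are downloadable.
import torch
from pytorch_pretrained_bert.modeling import BertForMaskedLM
from pytorch_pretrained_bert.tokenization import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()

tokens = tokenizer.tokenize("[CLS] the man went to the [MASK] . [SEP]")
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
mask_index = tokens.index("[MASK]")

with torch.no_grad():
    # Without masked_lm_labels, the forward pass returns per-token vocabulary scores.
    predictions = model(input_ids)  # shape: [1, sequence_length, vocab_size]
predicted_id = predictions[0, mask_index].argmax().item()
print(tokenizer.convert_ids_to_tokens([predicted_id]))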

Example 14: bertForMultipleChoice

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForMultipleChoice(*args, **kwargs):
    """
    BertForMultipleChoice is a fine-tuning model that includes BertModel and a
    linear layer on top of the BertModel.
    """
    model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
    return model 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 9, Source: hubconf.py

Example 15: bertForQuestionAnswering

# Required module import: from pytorch_pretrained_bert.tokenization import BertTokenizer [as alias]
# Or alternatively: from pytorch_pretrained_bert.tokenization.BertTokenizer import from_pretrained [as alias]
def bertForQuestionAnswering(*args, **kwargs):
    """
    BertForQuestionAnswering is a fine-tuning model that includes BertModel
    with a token-level classifiers on top of the full sequence of last hidden
    states.
    """
    model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
    return model 
Developer: martiansideofthemoon, Project: squash-generation, Lines: 10, Source: hubconf.py


Note: The pytorch_pretrained_bert.tokenization.BertTokenizer.from_pretrained examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Please consult the corresponding project's license before redistributing or using the code, and do not reproduce this article without permission.