

Python GPT2Tokenizer.from_pretrained Method Code Examples

This article collects typical usage examples of the Python method transformers.GPT2Tokenizer.from_pretrained. If you are trying to work out what GPT2Tokenizer.from_pretrained does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the class it belongs to, transformers.GPT2Tokenizer.


The following presents 7 code examples of the GPT2Tokenizer.from_pretrained method, sorted by popularity by default.
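
Before diving into the excerpts, the sketch below shows the most common call pattern for the method itself: loading the pretrained "gpt2" vocabulary and encoding/decoding a sentence. It is a minimal illustration that relies only on the public transformers API; the sample sentence is arbitrary.

from transformers import GPT2Tokenizer

# Download (or load from the local cache) the byte-level BPE vocabulary of the "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Round-trip a sentence through token ids.
ids = tokenizer.encode("Hello, my dog is cute")
print(ids)                    # a list of integer token ids
print(tokenizer.decode(ids))  # "Hello, my dog is cute"

# from_pretrained also accepts a local directory containing vocab.json and merges.txt,
# e.g. one produced earlier by tokenizer.save_pretrained("my_tokenizer_dir").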

Example 1: __init__

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def __init__(
        self,
        pretrained_model=None,
        vocab_file=None,
        merges_file=None,
        errors='replace',
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
    ):
        if pretrained_model:
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.vocab_size = self.tokenizer.vocab_size
        special_tokens_dict = {}
        if self.tokenizer.unk_token is None:
            self.tokenizer.unk_token = "<|unk|>"
            special_tokens_dict["unk_token"] = "<|unk|>"
        if self.tokenizer.bos_token is None:
            special_tokens_dict["bos_token"] = bos_token
        if self.tokenizer.eos_token is None:
            special_tokens_dict["eos_token"] = eos_token
        if self.tokenizer.pad_token is None:
            special_tokens_dict["pad_token"] = "<|pad|>"
        self.tokenizer.add_special_tokens(special_tokens_dict) 
Developer: NVIDIA, Project: NeMo, Lines of code: 25, Source file: gpt2_tokenizer.py
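
The lines shown above only cover the pretrained_model branch; the declared vocab_file, merges_file and errors parameters suggest a fallback that builds the tokenizer from local files, and any special tokens added afterwards must be reflected in the embedding matrix of whatever model shares the tokenizer. The sketch below illustrates both points under those assumptions; it is not the NeMo implementation itself, and the file names are placeholders.

from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Presumed fallback when no pretrained model name is given: build the tokenizer
# directly from local vocabulary and merges files.
local_tokenizer = GPT2Tokenizer(vocab_file="vocab.json", merges_file="merges.txt", errors="replace")

# After add_special_tokens() grows the vocabulary (here with "<|pad|>" and "<|unk|>"),
# a model sharing the tokenizer needs its embedding table resized to match.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<|pad|>", "unk_token": "<|unk|>"})
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))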

Example 2: get_tokenizer

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def get_tokenizer(self, opt):
        """
        Instantiate tokenizer.
        """
        model_sz = opt['gpt2_size']
        fle_key = 'gpt2' if model_sz == 'small' else f'gpt2-{model_sz}'
        return GPT2Tokenizer.from_pretrained(fle_key) 
Developer: facebookresearch, Project: ParlAI, Lines of code: 9, Source file: dict.py
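
The key construction above maps ParlAI's gpt2_size option onto Hugging Face checkpoint names. As a standalone sketch outside the ParlAI dictionary class (the function name and the hard-coded size are illustrative only):

from transformers import GPT2Tokenizer

def tokenizer_for_size(gpt2_size):
    # "small" maps to the bare "gpt2" checkpoint; other sizes are suffixed,
    # e.g. "medium" -> "gpt2-medium", "large" -> "gpt2-large", "xl" -> "gpt2-xl".
    key = "gpt2" if gpt2_size == "small" else f"gpt2-{gpt2_size}"
    return GPT2Tokenizer.from_pretrained(key)

tokenizer = tokenizer_for_size("medium")  # loads the gpt2-medium vocabulary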

Example 3: __init__

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
        super().__init__()
        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
        self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
        self.embed_size = self.encoder.transformer.config.hidden_size
        self.classifier_head = ClassificationHead(class_size=class_size, embed_size=self.embed_size)
        self.cached_mode = cached_mode
        self.device = device 
Developer: bhoov, Project: exbert, Lines of code: 10, Source file: run_pplm_discrim_train.py
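
For context, one plausible way such a discriminator turns text into class logits is to encode the sentence, run the GPT-2 transformer body, mean-pool the hidden states, and pass the pooled vector to the classification head. The sketch below assumes the tuple-style outputs of the transformers 2.x releases this script targets and reuses only the attributes set in the constructor above; it is not the actual forward pass of the exbert script.

import torch

def classify(discriminator, text):
    # Encode the sentence and add the batch dimension.
    ids = discriminator.tokenizer.encode(text)
    input_ids = torch.tensor([ids], device=discriminator.device)
    with torch.no_grad():
        # encoder.transformer is the GPT-2 body; in transformers 2.x its first
        # return value is the last hidden state of shape (batch, seq_len, hidden_size).
        hidden = discriminator.encoder.transformer(input_ids)[0]
    pooled = hidden.mean(dim=1)                   # average over the sequence
    return discriminator.classifier_head(pooled)  # (batch, class_size) logits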

Example 4: call

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def call(self, inputs, **kwargs):
        r"""
    Return:
        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[tf.Tensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import tensorflow as tf
        from transformers import GPT2Tokenizer, TFGPT2Model

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = TFGPT2Model.from_pretrained('gpt2')
        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
        outputs = model(input_ids)
        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

    """
        outputs = self.transformer(inputs, **kwargs)
        return outputs 
Developer: bhoov, Project: exbert, Lines of code: 38, Source file: modeling_tf_gpt2.py
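
The practically important part of that docstring is the past cache for incremental decoding: the second element of the output tuple holds the attention keys and values, and feeding it back means later calls only need the new token ids. A minimal sketch under the same transformers 2.x TF API (where the cache is passed via the past keyword) might look like this; in newer releases the argument is named past_key_values instead.

import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2Model

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2Model.from_pretrained("gpt2")

# First pass over the prompt: keep the returned key/value cache.
input_ids = tf.constant([tokenizer.encode("Hello, my dog")])
last_hidden, past = model(input_ids)[:2]

# Later passes feed only the new token ids; the cache supplies the earlier context.
next_ids = tf.constant([tokenizer.encode(" is cute")])
last_hidden, past = model(next_ids, past=past)[:2]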

Example 5: __init__

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def __init__(
            self,
            class_size=None,
            pretrained_model="gpt2-medium",
            classifier_head=None,
            cached_mode=False,
            device='cpu'
    ):
        super(Discriminator, self).__init__()
        if pretrained_model.startswith("gpt2"):
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
            self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
            self.embed_size = self.encoder.transformer.config.hidden_size
        elif pretrained_model.startswith("bert"):
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            self.encoder = BertModel.from_pretrained(pretrained_model)
            self.embed_size = self.encoder.config.hidden_size
        else:
            raise ValueError(
                "{} model not yet supported".format(pretrained_model)
            )
        if classifier_head:
            self.classifier_head = classifier_head
        else:
            if not class_size:
                raise ValueError("must specify class_size")
            self.classifier_head = ClassificationHead(
                class_size=class_size,
                embed_size=self.embed_size
            )
        self.cached_mode = cached_mode
        self.device = device 
Developer: uber-research, Project: PPLM, Lines of code: 34, Source file: run_pplm_discrim_train.py
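
Because the constructor dispatches on the checkpoint-name prefix, instantiation is the main thing a caller controls. A brief usage sketch based only on the excerpt above (the training and forward logic of the script is not shown here):

# GPT-2 backbone: a 5-class head on top of gpt2-medium hidden states.
gpt2_discrim = Discriminator(class_size=5, pretrained_model="gpt2-medium")

# BERT backbone: the same head, with embed_size taken from the BERT config.
bert_discrim = Discriminator(class_size=5, pretrained_model="bert-base-uncased")

# Any other prefix is rejected up front.
try:
    Discriminator(class_size=5, pretrained_model="xlnet-base-cased")
except ValueError as err:
    print(err)  # "xlnet-base-cased model not yet supported"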

Example 6: main

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def main():
    parser = argparse.ArgumentParser(
        description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids)."
    )
    parser.add_argument("--file_path", type=str, default="data/dump.txt", help="The path to the data.")
    parser.add_argument("--tokenizer_type", type=str, default="bert", choices=["bert", "roberta", "gpt2"])
    parser.add_argument("--tokenizer_name", type=str, default="bert-base-uncased", help="The tokenizer to use.")
    parser.add_argument("--dump_file", type=str, default="data/dump", help="The dump file prefix.")
    args = parser.parse_args()

    logger.info(f"Loading Tokenizer ({args.tokenizer_name})")
    if args.tokenizer_type == "bert":
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `[CLS]`
        sep = tokenizer.special_tokens_map["sep_token"]  # `[SEP]`
    elif args.tokenizer_type == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["cls_token"]  # `<s>`
        sep = tokenizer.special_tokens_map["sep_token"]  # `</s>`
    elif args.tokenizer_type == "gpt2":
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map["bos_token"]  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map["eos_token"]  # `<|endoftext|>`

    logger.info(f"Loading text from {args.file_path}")
    with open(args.file_path, "r", encoding="utf8") as fp:
        data = fp.readlines()

    logger.info(f"Start encoding")
    logger.info(f"{len(data)} examples to process.")

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f"{bos} {text.strip()} {sep}"
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f"{iter} examples processed. - {(end-start):.2f}s/{interval}expl")
            start = time.time()
    logger.info("Finished binarization")
    logger.info(f"{len(data)} examples processed.")

    dp_file = f"{args.dump_file}.{args.tokenizer_name}.pickle"
    vocab_size = tokenizer.vocab_size
    if vocab_size < (1 << 16):
        rslt_ = [np.uint16(d) for d in rslt]
    else:
        rslt_ = [np.int32(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f"Dump to {dp_file}")
    with open(dp_file, "wb") as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL) 
Developer: bhoov, Project: exbert, Lines of code: 60, Source file: binarized_data.py
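
The dump written above is simply a pickled list of numpy integer arrays, one per input line. A small sketch of loading it back and decoding one example with the same tokenizer (the file name follows the script's defaults and is otherwise arbitrary):

import pickle
from transformers import GPT2Tokenizer

# Path pattern from the script: f"{dump_file}.{tokenizer_name}.pickle"
with open("data/dump.gpt2.pickle", "rb") as handle:
    sequences = pickle.load(handle)  # list of uint16/int32 arrays of token ids

print(f"{len(sequences)} binarized examples")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(tokenizer.decode(sequences[0].tolist()))  # e.g. "<|endoftext|> first line ... <|endoftext|>"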

Example 7: main

# Required import: from transformers import GPT2Tokenizer [as alias]
# Or: from transformers.GPT2Tokenizer import from_pretrained [as alias]
def main():
    parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
    parser.add_argument('--file_path', type=str, default='data/dump.txt',
                        help='The path to the data.')
    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2', 'kobert'])
    parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                        help="The tokenizer to use.")
    parser.add_argument('--dump_file', type=str, default='data/dump',
                        help='The dump file prefix.')
    args = parser.parse_args()

    logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
    if args.tokenizer_type == 'bert':
        tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `[CLS]`
        sep = tokenizer.special_tokens_map['sep_token']  # `[SEP]`
    elif args.tokenizer_type == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['cls_token']  # `<s>`
        sep = tokenizer.special_tokens_map['sep_token']  # `</s>`
    elif args.tokenizer_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
        bos = tokenizer.special_tokens_map['bos_token']  # `<|endoftext|>`
        sep = tokenizer.special_tokens_map['eos_token']  # `<|endoftext|>`
    elif args.tokenizer_type == 'kobert':
        tokenizer = KoBertTokenizer.from_pretrained('kobert')
        bos = tokenizer.special_tokens_map['cls_token']
        sep = tokenizer.special_tokens_map['sep_token']

    logger.info(f'Loading text from {args.file_path}')
    with open(args.file_path, 'r', encoding='utf8') as fp:
        data = fp.readlines()

    logger.info(f'Start encoding')
    logger.info(f'{len(data)} examples to process.')

    rslt = []
    iter = 0
    interval = 10000
    start = time.time()
    for text in data:
        text = f'{bos} {text.strip()} {sep}'
        token_ids = tokenizer.encode(text, add_special_tokens=False)
        rslt.append(token_ids)

        iter += 1
        if iter % interval == 0:
            end = time.time()
            logger.info(f'{iter} examples processed. - {(end-start)/interval:.2f}s/expl')
            start = time.time()
    logger.info('Finished binarization')
    logger.info(f'{len(data)} examples processed.')

    dp_file = f'{args.dump_file}.{args.tokenizer_name}.pickle'
    rslt_ = [np.uint16(d) for d in rslt]
    random.shuffle(rslt_)
    logger.info(f'Dump to {dp_file}')
    with open(dp_file, 'wb') as handle:
        pickle.dump(rslt_, handle, protocol=pickle.HIGHEST_PROTOCOL) 
Developer: monologg, Project: DistilKoBERT, Lines of code: 61, Source file: binarized_data.py
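
One difference from Example 6 worth noting: this variant always casts the token ids to np.uint16, which silently wraps around for vocabularies with more than 65,535 entries. The guard used in Example 6 can be factored into a small helper, sketched below (the helper name is illustrative):

import numpy as np

def cast_token_ids(sequences, vocab_size):
    # Pick the narrowest dtype that can represent every token id, as Example 6 does.
    dtype = np.uint16 if vocab_size < (1 << 16) else np.int32
    return [np.array(ids, dtype=dtype) for ids in sequences]

# e.g. rslt_ = cast_token_ids(rslt, tokenizer.vocab_size) right before shuffling and pickling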


Note: The transformers.GPT2Tokenizer.from_pretrained examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors; copyright in the source code remains with the original authors, and any distribution or use should follow the corresponding project's License. Do not reproduce without permission.