当前位置: 首页>>代码示例>>Python>>正文


Python tokenization.BasicTokenizer类代码示例

本文整理汇总了Python中bert.tokenization.BasicTokenizer类的典型用法代码示例。如果您正苦于以下问题:Python tokenization.BasicTokenizer类的具体用法?Python tokenization.BasicTokenizer怎么用?Python tokenization.BasicTokenizer使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块bert.tokenization的用法示例。


在下文中一共展示了tokenization.BasicTokenizer类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def __init__(self):
    """Attach a ``BasicTokenizer`` built with ``do_lower_case=False`` to the instance."""
    basic_tokenizer = BasicTokenizer(do_lower_case=False)
    self._tokenizer = basic_tokenizer
开发者ID:thunlp,项目名称:XQA,代码行数:4,代码来源:evidence_corpus.py

示例2: test_chinese

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_chinese(self):
    """Check tokenization of ASCII letters mixed with two CJK characters.

    U+535A and U+63A8 are each expected as a token of their own, while the
    surrounding ASCII runs "ah" and "zz" stay whole.
    """
    basic_tokenizer = tokenization.BasicTokenizer()
    mixed_text = u"ah\u535A\u63A8zz"

    actual_tokens = basic_tokenizer.tokenize(mixed_text)
    expected_tokens = [u"ah", u"\u535A", u"\u63A8", u"zz"]
    self.assertAllEqual(actual_tokens, expected_tokens)
开发者ID:a414351664,项目名称:Bert-TextClassification,代码行数:8,代码来源:tokenization_test.py

示例3: test_basic_tokenizer_lower

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_lower(self):
    """Check ``BasicTokenizer`` output when built with ``do_lower_case=True``.

    The expected tokens are lower-cased, whitespace runs are dropped, "!" and
    "?" come out as separate tokens, and U+00E9 is expected to become "e".
    """
    lowercasing_tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

    noisy_tokens = lowercasing_tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  ")
    self.assertAllEqual(noisy_tokens, ["hello", "!", "how", "are", "you", "?"])

    accented_tokens = lowercasing_tokenizer.tokenize(u"H\u00E9llo")
    self.assertAllEqual(accented_tokens, ["hello"])
开发者ID:a414351664,项目名称:Bert-TextClassification,代码行数:9,代码来源:tokenization_test.py

示例4: test_basic_tokenizer_no_lower

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_no_lower(self):
    """Check ``BasicTokenizer`` output when built with ``do_lower_case=False``.

    The expected tokens keep their original letter case; whitespace runs are
    dropped and "!" / "?" come out as separate tokens.
    """
    cased_tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    raw_text = u" \tHeLLo!how  \n Are yoU?  "

    produced = cased_tokenizer.tokenize(raw_text)
    self.assertAllEqual(produced, ["HeLLo", "!", "how", "Are", "yoU", "?"])
开发者ID:a414351664,项目名称:Bert-TextClassification,代码行数:8,代码来源:tokenization_test.py

示例5: test_chinese

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_chinese(self):
    """Verify that CJK characters embedded in ASCII text become single tokens.

    The input "ah" + U+535A + U+63A8 + "zz" must tokenize into four pieces:
    the two ASCII runs and one token per CJK character.
    """
    default_tokenizer = tokenization.BasicTokenizer()

    pieces = default_tokenizer.tokenize(u"ah\u535A\u63A8zz")
    wanted = [u"ah", u"\u535A", u"\u63A8", u"zz"]
    self.assertAllEqual(pieces, wanted)
开发者ID:ZhangShiyue,项目名称:QGforQA,代码行数:8,代码来源:tokenization_test.py

示例6: test_basic_tokenizer_lower

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_lower(self):
    """Verify tokenization with ``do_lower_case=True``.

    Two inputs are checked: a mixed-case sentence with irregular whitespace
    and punctuation, and a word containing U+00E9 that is expected to come
    back as plain "hello".
    """
    tok = tokenization.BasicTokenizer(do_lower_case=True)

    sentence = u" \tHeLLo!how  \n Are yoU?  "
    sentence_tokens = tok.tokenize(sentence)
    self.assertAllEqual(
        sentence_tokens, ["hello", "!", "how", "are", "you", "?"])

    accented_word = u"H\u00E9llo"
    self.assertAllEqual(tok.tokenize(accented_word), ["hello"])
开发者ID:ZhangShiyue,项目名称:QGforQA,代码行数:9,代码来源:tokenization_test.py

示例7: test_basic_tokenizer_no_lower

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_no_lower(self):
    """Verify tokenization with ``do_lower_case=False``.

    The mixed-case sentence must be split on whitespace and punctuation with
    every token keeping the casing it had in the input.
    """
    tok = tokenization.BasicTokenizer(do_lower_case=False)

    sentence_tokens = tok.tokenize(u" \tHeLLo!how  \n Are yoU?  ")
    wanted = ["HeLLo", "!", "how", "Are", "yoU", "?"]
    self.assertAllEqual(sentence_tokens, wanted)
开发者ID:ZhangShiyue,项目名称:QGforQA,代码行数:8,代码来源:tokenization_test.py

示例8: build_wiki_corpus

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_wiki_corpus(n_processes):
    """Build the "wiki" dataset from the Wikipedia question files.

    Calls ``build_dataset`` with a ``BasicTokenizer`` created with
    ``do_lower_case=True``, the verified/dev/train/test question-file paths
    under ``TRIVIA_QA/qa`` and a ``FastNormalizedAnswerDetector``.

    Args:
        n_processes: Forwarded unchanged to ``build_dataset`` (presumably the
            number of worker processes; confirm against ``build_dataset``).
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    question_files = {
        "verified": join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
        "dev": join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
        "train": join(TRIVIA_QA, "qa", "wikipedia-train.json"),
        "test": join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json"),
    }
    detector = FastNormalizedAnswerDetector()
    build_dataset("wiki", tokenizer, question_files, detector, n_processes)
开发者ID:huminghao16,项目名称:RE3QA,代码行数:11,代码来源:build_span_corpus.py

示例9: build_web_corpus

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_web_corpus(n_processes):
    """Build the "web" dataset from the web question files.

    Calls ``build_dataset`` with a ``BasicTokenizer`` created with
    ``do_lower_case=True``, the verified/dev/train/test question-file paths
    under ``TRIVIA_QA/qa`` and a ``FastNormalizedAnswerDetector``.

    Args:
        n_processes: Forwarded unchanged to ``build_dataset`` (presumably the
            number of worker processes; confirm against ``build_dataset``).
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    question_files = {
        "verified": join(TRIVIA_QA, "qa", "verified-web-dev.json"),
        "dev": join(TRIVIA_QA, "qa", "web-dev.json"),
        "train": join(TRIVIA_QA, "qa", "web-train.json"),
        "test": join(TRIVIA_QA, "qa", "web-test-without-answers.json"),
    }
    detector = FastNormalizedAnswerDetector()
    build_dataset("web", tokenizer, question_files, detector, n_processes)
开发者ID:huminghao16,项目名称:RE3QA,代码行数:11,代码来源:build_span_corpus.py

示例10: build_unfiltered_corpus

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_unfiltered_corpus(n_processes):
    """Build the "unfiltered" dataset from the unfiltered web question files.

    Calls ``build_dataset`` with a ``BasicTokenizer`` created with
    ``do_lower_case=True``, the dev/train/test question-file paths under
    ``TRIVIA_QA_UNFILTERED`` and a ``FastNormalizedAnswerDetector``.

    Args:
        n_processes: Forwarded unchanged to ``build_dataset`` (presumably the
            number of worker processes; confirm against ``build_dataset``).
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    question_files = {
        "dev": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
        "train": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
        "test": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json"),
    }
    detector = FastNormalizedAnswerDetector()
    build_dataset("unfiltered", tokenizer, question_files, detector, n_processes)
开发者ID:huminghao16,项目名称:RE3QA,代码行数:10,代码来源:build_span_corpus.py

示例11: build_wiki_sample_corpus

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_wiki_sample_corpus(n_processes):
    """Build the "wiki-sample" dataset from the Wikipedia question files.

    Calls ``build_dataset`` with a ``BasicTokenizer`` created with
    ``do_lower_case=True``, the verified/dev/train/test question-file paths
    under ``TRIVIA_QA/qa``, a ``FastNormalizedAnswerDetector`` and
    ``sample=20``.

    Args:
        n_processes: Forwarded unchanged to ``build_dataset`` (presumably the
            number of worker processes; confirm against ``build_dataset``).
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    question_files = {
        "verified": join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
        "dev": join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
        "train": join(TRIVIA_QA, "qa", "wikipedia-train.json"),
        "test": join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json"),
    }
    detector = FastNormalizedAnswerDetector()
    build_dataset("wiki-sample", tokenizer, question_files, detector,
                  n_processes, sample=20)
开发者ID:huminghao16,项目名称:RE3QA,代码行数:11,代码来源:build_span_corpus.py

示例12: build_unfiltered_sample_corpus

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_unfiltered_sample_corpus(n_processes):
    """Build the "unfiltered-sample" dataset from the unfiltered web files.

    Calls ``build_dataset`` with a ``BasicTokenizer`` created with
    ``do_lower_case=True``, the dev/train/test question-file paths under
    ``TRIVIA_QA_UNFILTERED``, a ``FastNormalizedAnswerDetector`` and
    ``sample=20``.

    Args:
        n_processes: Forwarded unchanged to ``build_dataset`` (presumably the
            number of worker processes; confirm against ``build_dataset``).
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    question_files = {
        "dev": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
        "train": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
        "test": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json"),
    }
    detector = FastNormalizedAnswerDetector()
    build_dataset("unfiltered-sample", tokenizer, question_files, detector,
                  n_processes, sample=20)
开发者ID:huminghao16,项目名称:RE3QA,代码行数:10,代码来源:build_span_corpus.py

示例13: main

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def main():
    """Command-line entry point that pre-tokenizes the TriviaQA evidence corpus.

    Parses the command-line options, then hands a ``BasicTokenizer`` created
    with ``do_lower_case=True`` and a ``MergeParagraphs`` splitter built from
    ``--max_tokens`` to ``build_tokenized_corpus`` together with the source
    and output directories.
    """
    parser = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus")
    parser.add_argument(
        "-o", "--output_dir",
        type=str,
        default=join("data", "triviaqa", "evidence"),
    )
    parser.add_argument(
        "-s", "--source",
        type=str,
        default=join(TRIVIA_QA, "evidence"),
    )
    # Tokenizing the corpus is slow; running with several processes is advised.
    parser.add_argument(
        "-n", "--n_processes",
        type=int,
        default=1,
        help="Number of processes to use",
    )
    parser.add_argument(
        "--max_tokens",
        type=int,
        default=200,
        help="Number of maximal tokens in each merged paragraph",
    )
    parser.add_argument("--wiki_only", action="store_true")
    options = parser.parse_args()

    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    paragraph_merger = MergeParagraphs(options.max_tokens)
    build_tokenized_corpus(
        options.source,
        basic_tokenizer,
        paragraph_merger,
        options.output_dir,
        n_processes=options.n_processes,
        wiki_only=options.wiki_only,
    )
开发者ID:huminghao16,项目名称:RE3QA,代码行数:16,代码来源:evidence_corpus.py

示例14: main

# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def _str2bool(value):
    """Convert a command-line string such as "true", "False", "1" or "0" to a bool.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including "False") becomes True. This converter gives
    boolean options the meaning users expect.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays False, as it was under ``type=bool``.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "Expected a boolean value, got {!r}".format(value))


def main():
    """Command-line entry point that pre-tokenizes the SQuAD open dev file.

    For every example loaded from ``--input_file`` the document text is split
    into paragraphs, tokenized, merged with ``MergeParagraphs`` and scored
    against the question via ``rank``. The first ``--n_to_select`` paragraphs
    in ascending score order are concatenated into the document tokens of a
    ``DocumentAndQuestion``. The collected examples are pickled to
    ``data/squad/eval_open_{n_to_select}paras_examples.pkl`` and summary
    statistics are printed.
    """
    parse = argparse.ArgumentParser("Pre-tokenize the SQuAD open dev file")
    parse.add_argument("--input_file", type=str, default=join("data", "squad", "squad_dev_open.pkl"))
    # This can be slow: the loop below handles the examples one after another.
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--n_to_select", type=int, default=30, help="Number of paragraphs to retrieve")
    # _str2bool instead of bool, so that "--sort_passage False" really disables sorting.
    parse.add_argument("--sort_passage", type=_str2bool, default=True, help="Sort passage according to order")
    parse.add_argument("--debug", type=_str2bool, default=False, help="Whether to run in debug mode")
    args = parse.parse_args()

    # NOTE(security): pickle executes arbitrary code on load; only use trusted input files.
    with open(args.input_file, 'rb') as input_handle:
        dev_examples = pickle.load(input_handle)

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words)
    detector = FastNormalizedAnswerDetector()

    ir_count, total_doc_length, pruned_doc_length = 0, 0, 0
    out = []
    for example_ix, example in tqdm(enumerate(dev_examples), total=len(dev_examples)):
        # Skip empty lines so they do not turn into empty paragraphs.
        paras = [x for x in example.doc_text.split("\n") if x]
        paragraphs = [tokenizer.tokenize(x) for x in paras]
        merged_paragraphs = splitter.merge(paragraphs)

        scores = rank(tfidf, [example.question_text], [" ".join(x) for x in merged_paragraphs])
        para_scores = scores[0]
        # np.argsort is ascending, so this keeps the paragraphs with the smallest
        # scores. NOTE(review): presumes `rank` returns a distance-like score
        # (lower is better) - confirm against `rank`.
        para_ranks = np.argsort(para_scores)
        selection = list(para_ranks[:args.n_to_select])

        if args.sort_passage:
            # Restore the original document order of the selected paragraphs.
            selection = np.sort(selection)

        doc_tokens = []
        for idx in selection:
            doc_tokens.extend(merged_paragraphs[idx])

        tokenized_answers = [tokenizer.tokenize(x) for x in example.answer_texts]
        detector.set_question(tokenized_answers)
        if len(detector.any_found(doc_tokens)) > 0:
            ir_count += 1

        total_doc_length += sum(len(para) for para in merged_paragraphs)
        pruned_doc_length += len(doc_tokens)

        out.append(DocumentAndQuestion(example_ix, example.qas_id, example.question_text, doc_tokens,
                                       '', 0, 0, True))
        # Debug mode stops after a handful of examples.
        if args.debug and example_ix > 5:
            break

    n_examples = len(out)
    if n_examples:
        print("Recall of answer existence in documents: {:.3f}".format(ir_count / n_examples))
        print("Average length of documents: {:.3f}".format(total_doc_length / n_examples))
        print("Average pruned length of documents: {:.3f}".format(pruned_doc_length / n_examples))
    else:
        # An empty dev set would otherwise raise ZeroDivisionError above.
        print("No examples were processed; statistics are unavailable.")

    output_file = join("data", "squad", "eval_open_{}paras_examples.pkl".format(args.n_to_select))
    with open(output_file, 'wb') as output_handle:
        pickle.dump(out, output_handle)
开发者ID:huminghao16,项目名称:RE3QA,代码行数:56,代码来源:convert_squad_open.py


注:本文中的bert.tokenization.BasicTokenizer类示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。