

Python tokenization.BasicTokenizer Method Code Examples

This article collects and summarizes typical usage examples of the bert.tokenization.BasicTokenizer method in Python. If you are wondering how to use tokenization.BasicTokenizer, how it works in practice, or what real examples of it look like, the curated code examples below may help. You can also explore further usage examples from the bert.tokenization module it belongs to.


The following presents 14 code examples of the tokenization.BasicTokenizer method, sorted by popularity by default.
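All of the examples below follow the same basic pattern: import bert.tokenization, construct a BasicTokenizer with the do_lower_case flag, and call tokenize on raw text. Here is a minimal, self-contained sketch of that pattern, assuming the Google BERT code (for example the bert-tensorflow package or the google-research/bert repository) is installed so that bert.tokenization can be imported; the expected outputs are taken from the test cases shown further down.

# Minimal usage sketch (assumes the bert-tensorflow package or the
# google-research/bert repository is on the Python path).
from bert import tokenization

# Lower-casing tokenizer: strips surrounding whitespace, splits off
# punctuation, lower-cases the text and strips accents.
tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
print(tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "))
# -> ["hello", "!", "how", "are", "you", "?"]

# Cased tokenizer: the original casing is preserved.
cased_tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
print(cased_tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "))
# -> ["HeLLo", "!", "how", "Are", "yoU", "?"]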

Example 1: __init__

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def __init__(self):
        self._tokenizer = BasicTokenizer(do_lower_case=False) 
Developer ID: thunlp, Project: XQA, Lines of code: 4, Source file: evidence_corpus.py

Example 2: test_chinese

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def test_chinese(self):
        tokenizer = tokenization.BasicTokenizer()

        self.assertAllEqual(
            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
            [u"ah", u"\u535A", u"\u63A8", u"zz"]) 
Developer ID: a414351664, Project: Bert-TextClassification, Lines of code: 8, Source file: tokenization_test.py

Example 3: test_basic_tokenizer_lower

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def test_basic_tokenizer_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

        self.assertAllEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["hello", "!", "how", "are", "you", "?"])
        self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 
Developer ID: a414351664, Project: Bert-TextClassification, Lines of code: 9, Source file: tokenization_test.py

Example 4: test_basic_tokenizer_no_lower

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def test_basic_tokenizer_no_lower(self):
        tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

        self.assertAllEqual(
            tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
            ["HeLLo", "!", "how", "Are", "yoU", "?"]) 
Developer ID: a414351664, Project: Bert-TextClassification, Lines of code: 8, Source file: tokenization_test.py

Example 5: test_chinese

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def test_chinese(self):
    tokenizer = tokenization.BasicTokenizer()

    self.assertAllEqual(
        tokenizer.tokenize(u"ah\u535A\u63A8zz"),
        [u"ah", u"\u535A", u"\u63A8", u"zz"]) 
Developer ID: ZhangShiyue, Project: QGforQA, Lines of code: 8, Source file: tokenization_test.py

Example 6: test_basic_tokenizer_lower

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def test_basic_tokenizer_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
        ["hello", "!", "how", "are", "you", "?"])
    self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 
Developer ID: ZhangShiyue, Project: QGforQA, Lines of code: 9, Source file: tokenization_test.py

Example 7: test_basic_tokenizer_no_lower

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def test_basic_tokenizer_no_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=False)

    self.assertAllEqual(
        tokenizer.tokenize(u" \tHeLLo!how  \n Are yoU?  "),
        ["HeLLo", "!", "how", "Are", "yoU", "?"]) 
Developer ID: ZhangShiyue, Project: QGforQA, Lines of code: 8, Source file: tokenization_test.py

Example 8: build_wiki_corpus

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def build_wiki_corpus(n_processes):
    build_dataset("wiki", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                      test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 11, Source file: build_span_corpus.py

Example 9: build_web_corpus

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def build_web_corpus(n_processes):
    build_dataset("web", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                      test=join(TRIVIA_QA, "qa", "web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 11, Source file: build_span_corpus.py

Example 10: build_unfiltered_corpus

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def build_unfiltered_corpus(n_processes):
    build_dataset("unfiltered", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 10, Source file: build_span_corpus.py

Example 11: build_wiki_sample_corpus

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def build_wiki_sample_corpus(n_processes):
    build_dataset("wiki-sample", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                      test=join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes, sample=20) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 11, Source file: build_span_corpus.py

Example 12: build_unfiltered_sample_corpus

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def build_unfiltered_sample_corpus(n_processes):
    build_dataset("unfiltered-sample", tokenization.BasicTokenizer(do_lower_case=True),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")
                  ),
                  FastNormalizedAnswerDetector(), n_processes, sample=20) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 10, Source file: build_span_corpus.py

Example 13: main

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def main():
    parse = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus")
    parse.add_argument("-o", "--output_dir", type=str, default=join("data", "triviaqa", "evidence"))
    parse.add_argument("-s", "--source", type=str, default=join(TRIVIA_QA, "evidence"))
    # This is slow, using more processes is recommended
    parse.add_argument("-n", "--n_processes", type=int, default=1, help="Number of processes to use")
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--wiki_only", action="store_true")
    args = parse.parse_args()

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    build_tokenized_corpus(args.source, tokenizer, splitter, args.output_dir,
                           n_processes=args.n_processes, wiki_only=args.wiki_only) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 16, Source file: evidence_corpus.py

Example 14: main

# Required import: from bert import tokenization [as alias]
# Or: from bert.tokenization import BasicTokenizer [as alias]
def main():
    parse = argparse.ArgumentParser("Pre-tokenize the SQuAD open dev file")
    parse.add_argument("--input_file", type=str, default=join("data", "squad", "squad_dev_open.pkl"))
    # This is slow, using more processes is recommended
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--n_to_select", type=int, default=30, help="Number of paragraphs to retrieve")
    parse.add_argument("--sort_passage", type=bool, default=True, help="Sort passage according to order")
    parse.add_argument("--debug", type=bool, default=False, help="Whether to run in debug mode")
    args = parse.parse_args()

    dev_examples = pickle.load(open(args.input_file, 'rb'))

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words)
    detector = FastNormalizedAnswerDetector()

    ir_count, total_doc_length, pruned_doc_length = 0, 0, 0
    out = []
    for example_ix, example in tqdm(enumerate(dev_examples), total=len(dev_examples)):
        paras = [x for x in example.doc_text.split("\n") if len(x) > 0]
        paragraphs = [tokenizer.tokenize(x) for x in paras]
        merged_paragraphs = splitter.merge(paragraphs)

        scores = rank(tfidf, [example.question_text], [" ".join(x) for x in merged_paragraphs])
        para_scores = scores[0]
        para_ranks = np.argsort(para_scores)
        selection = [i for i in para_ranks[:args.n_to_select]]

        if args.sort_passage:
            selection = np.sort(selection)

        doc_tokens = []
        for idx in selection:
            current_para = merged_paragraphs[idx]
            doc_tokens += current_para

        tokenized_answers = [tokenizer.tokenize(x) for x in example.answer_texts]
        detector.set_question(tokenized_answers)
        if len(detector.any_found(doc_tokens)) > 0:
            ir_count += 1

        total_doc_length += sum(len(para) for para in merged_paragraphs)
        pruned_doc_length += len(doc_tokens)

        out.append(DocumentAndQuestion(example_ix, example.qas_id, example.question_text, doc_tokens,
                                       '', 0, 0, True))
        if args.debug and example_ix > 5:
            break
    print("Recall of answer existence in documents: {:.3f}".format(ir_count / len(out)))
    print("Average length of documents: {:.3f}".format(total_doc_length / len(out)))
    print("Average pruned length of documents: {:.3f}".format(pruned_doc_length / len(out)))
    output_file = join("data", "squad", "eval_open_{}paras_examples.pkl".format(args.n_to_select))
    pickle.dump(out, open(output_file, 'wb')) 
Developer ID: huminghao16, Project: RE3QA, Lines of code: 56, Source file: convert_squad_open.py


Note: The bert.tokenization.BasicTokenizer method examples in this article were compiled by 純淨天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and distribution or use should follow the corresponding project's license. Do not reproduce without permission.