本文整理汇总了Python中bert.tokenization.BasicTokenizer类的典型用法代码示例。如果您正苦于以下问题:Python tokenization.BasicTokenizer类的具体用法?Python tokenization.BasicTokenizer怎么用?Python tokenization.BasicTokenizer使用的例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块bert.tokenization的用法示例。
在下文中一共展示了tokenization.BasicTokenizer类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def __init__(self):
    """Create the wrapped ``BasicTokenizer`` with lower-casing disabled."""
    case_preserving_tokenizer = BasicTokenizer(do_lower_case=False)
    self._tokenizer = case_preserving_tokenizer
示例2: test_chinese
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_chinese(self):
    """Check that each of the two Chinese characters becomes its own token.

    The Latin runs on either side ("ah" and "zz") are expected to stay
    intact as single tokens.
    """
    text = u"ah\u535A\u63A8zz"
    expected_tokens = [u"ah", u"\u535A", u"\u63A8", u"zz"]

    basic = tokenization.BasicTokenizer()
    self.assertAllEqual(basic.tokenize(text), expected_tokens)
示例3: test_basic_tokenizer_lower
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_lower(self):
    """Check tokenization with ``do_lower_case=True``.

    The first case expects lower-cased words with the surrounding
    whitespace dropped and punctuation marks as separate tokens. The second
    case expects the accented e (U+00E9) to come out as a plain "e".
    """
    basic = tokenization.BasicTokenizer(do_lower_case=True)

    mixed_case_text = u" \tHeLLo!how \n Are yoU? "
    mixed_case_tokens = ["hello", "!", "how", "are", "you", "?"]
    self.assertAllEqual(basic.tokenize(mixed_case_text), mixed_case_tokens)

    accented_text = u"H\u00E9llo"
    self.assertAllEqual(basic.tokenize(accented_text), ["hello"])
示例4: test_basic_tokenizer_no_lower
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_no_lower(self):
    """Check tokenization with ``do_lower_case=False``.

    The expected tokens keep their original casing; whitespace is dropped
    and punctuation marks are separate tokens.
    """
    text = u" \tHeLLo!how \n Are yoU? "
    expected_tokens = ["HeLLo", "!", "how", "Are", "yoU", "?"]

    basic = tokenization.BasicTokenizer(do_lower_case=False)
    self.assertAllEqual(basic.tokenize(text), expected_tokens)
示例5: test_chinese
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_chinese(self):
    """Verify that the Chinese characters are emitted one token each.

    The surrounding Latin text ("ah", "zz") is expected as whole tokens.
    """
    basic_tokenizer = tokenization.BasicTokenizer()
    actual_tokens = basic_tokenizer.tokenize(u"ah\u535A\u63A8zz")
    self.assertAllEqual(
        actual_tokens,
        [u"ah", u"\u535A", u"\u63A8", u"zz"])
示例6: test_basic_tokenizer_lower
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_lower(self):
    """Verify the lower-casing tokenizer on mixed-case and accented input.

    Expected: words are lower-cased, punctuation is split off, whitespace
    is discarded, and U+00E9 is reduced to a plain "e".
    """
    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=True)

    # Each entry pairs an input text with the tokens it must produce.
    cases = [
        (u" \tHeLLo!how \n Are yoU? ", ["hello", "!", "how", "are", "you", "?"]),
        (u"H\u00E9llo", ["hello"]),
    ]
    for text, expected_tokens in cases:
        self.assertAllEqual(basic_tokenizer.tokenize(text), expected_tokens)
示例7: test_basic_tokenizer_no_lower
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def test_basic_tokenizer_no_lower(self):
    """Verify that casing is preserved when ``do_lower_case=False``."""
    basic_tokenizer = tokenization.BasicTokenizer(do_lower_case=False)
    actual_tokens = basic_tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? ")
    self.assertAllEqual(
        actual_tokens,
        ["HeLLo", "!", "how", "Are", "yoU", "?"])
示例8: build_wiki_corpus
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_wiki_corpus(n_processes):
    """Build the "wiki" dataset from the Wikipedia question files.

    Args:
        n_processes: Passed through unchanged to ``build_dataset``.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    # Split name -> question file under TRIVIA_QA/qa.
    question_files = {
        "verified": join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
        "dev": join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
        "train": join(TRIVIA_QA, "qa", "wikipedia-train.json"),
        "test": join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json"),
    }
    answer_detector = FastNormalizedAnswerDetector()
    build_dataset("wiki", tokenizer, question_files, answer_detector, n_processes)
示例9: build_web_corpus
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_web_corpus(n_processes):
    """Build the "web" dataset from the web question files.

    Args:
        n_processes: Passed through unchanged to ``build_dataset``.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    # Split name -> question file under TRIVIA_QA/qa.
    question_files = {
        "verified": join(TRIVIA_QA, "qa", "verified-web-dev.json"),
        "dev": join(TRIVIA_QA, "qa", "web-dev.json"),
        "train": join(TRIVIA_QA, "qa", "web-train.json"),
        "test": join(TRIVIA_QA, "qa", "web-test-without-answers.json"),
    }
    answer_detector = FastNormalizedAnswerDetector()
    build_dataset("web", tokenizer, question_files, answer_detector, n_processes)
示例10: build_unfiltered_corpus
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_unfiltered_corpus(n_processes):
    """Build the "unfiltered" dataset from the unfiltered web question files.

    Args:
        n_processes: Passed through unchanged to ``build_dataset``.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    # Split name -> question file under TRIVIA_QA_UNFILTERED (no "verified" split here).
    question_files = {
        "dev": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
        "train": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
        "test": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json"),
    }
    answer_detector = FastNormalizedAnswerDetector()
    build_dataset("unfiltered", tokenizer, question_files, answer_detector, n_processes)
示例11: build_wiki_sample_corpus
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_wiki_sample_corpus(n_processes):
    """Build the "wiki-sample" dataset, calling ``build_dataset`` with ``sample=20``.

    Uses the same Wikipedia question files as the full "wiki" build.

    Args:
        n_processes: Passed through unchanged to ``build_dataset``.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    # Split name -> question file under TRIVIA_QA/qa.
    question_files = {
        "verified": join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
        "dev": join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
        "train": join(TRIVIA_QA, "qa", "wikipedia-train.json"),
        "test": join(TRIVIA_QA, "qa", "wikipedia-test-without-answers.json"),
    }
    answer_detector = FastNormalizedAnswerDetector()
    build_dataset("wiki-sample", tokenizer, question_files, answer_detector,
                  n_processes, sample=20)
示例12: build_unfiltered_sample_corpus
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def build_unfiltered_sample_corpus(n_processes):
    """Build the "unfiltered-sample" dataset, calling ``build_dataset`` with ``sample=20``.

    Uses the same unfiltered web question files as the full "unfiltered" build.

    Args:
        n_processes: Passed through unchanged to ``build_dataset``.
    """
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    # Split name -> question file under TRIVIA_QA_UNFILTERED.
    question_files = {
        "dev": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
        "train": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
        "test": join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json"),
    }
    answer_detector = FastNormalizedAnswerDetector()
    build_dataset("unfiltered-sample", tokenizer, question_files, answer_detector,
                  n_processes, sample=20)
示例13: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def main():
    """Parse the command line and pre-tokenize the TriviaQA evidence corpus.

    Builds a lower-casing ``BasicTokenizer`` and a ``MergeParagraphs``
    splitter limited to ``--max_tokens``, then hands both to
    ``build_tokenized_corpus`` together with the source/output directories.
    """
    parser = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus")

    # (flags, keyword arguments) for every command-line option, in registration order.
    option_specs = [
        (("-o", "--output_dir"), dict(type=str, default=join("data", "triviaqa", "evidence"))),
        (("-s", "--source"), dict(type=str, default=join(TRIVIA_QA, "evidence"))),
        # This is slow, using more processes is recommended
        (("-n", "--n_processes"), dict(type=int, default=1, help="Number of processes to use")),
        (("--max_tokens",), dict(type=int, default=200,
                                 help="Number of maximal tokens in each merged paragraph")),
        (("--wiki_only",), dict(action="store_true")),
    ]
    for flags, settings in option_specs:
        parser.add_argument(*flags, **settings)
    options = parser.parse_args()

    word_tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    paragraph_merger = MergeParagraphs(options.max_tokens)
    build_tokenized_corpus(
        options.source,
        word_tokenizer,
        paragraph_merger,
        options.output_dir,
        n_processes=options.n_processes,
        wiki_only=options.wiki_only,
    )
示例14: main
# 需要导入模块: from bert import tokenization [as 别名]
# 或者: from bert.tokenization import BasicTokenizer [as 别名]
def main():
    """Pre-tokenize the SQuAD open dev file and keep the top-ranked paragraphs.

    Steps:
      1. Load the pickled dev examples from ``--input_file``.
      2. For every example, tokenize each non-empty line of ``doc_text`` and
         merge the results with ``MergeParagraphs(--max_tokens)``.
      3. Score the merged paragraphs against the question with ``rank`` and
         keep the first ``--n_to_select`` indices of the ascending
         ``np.argsort`` order (optionally re-sorted into document order).
      4. Record whether any answer text is found in the kept tokens and
         append a ``DocumentAndQuestion`` to the output list.
      5. Print summary statistics and pickle the output list to
         ``data/squad/eval_open_<n_to_select>paras_examples.pkl``.
    """

    def _str2bool(value):
        """Convert a command-line string into a bool.

        ``type=bool`` makes argparse call ``bool("False")``, which is True
        for every non-empty string, so such an option can never be switched
        off from the command line.
        """
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays False, as it was under ``type=bool``.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            "Expected a boolean value, got {!r}".format(value))

    parse = argparse.ArgumentParser("Pre-tokenize the SQuAD open dev file")
    parse.add_argument("--input_file", type=str, default=join("data", "squad", "squad_dev_open.pkl"))
    parse.add_argument("--max_tokens", type=int, default=200, help="Number of maximal tokens in each merged paragraph")
    parse.add_argument("--n_to_select", type=int, default=30, help="Number of paragraphs to retrieve")
    parse.add_argument("--sort_passage", type=_str2bool, default=True, help="Sort passage according to order")
    parse.add_argument("--debug", type=_str2bool, default=False, help="Whether to run in debug mode")
    args = parse.parse_args()

    # NOTE: unpickling can execute arbitrary code; only point --input_file at
    # trusted data.
    with open(args.input_file, 'rb') as input_handle:
        dev_examples = pickle.load(input_handle)

    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
    splitter = MergeParagraphs(args.max_tokens)
    tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=stop_words)
    detector = FastNormalizedAnswerDetector()

    ir_count, total_doc_length, pruned_doc_length = 0, 0, 0
    out = []
    for example_ix, example in tqdm(enumerate(dev_examples), total=len(dev_examples)):
        paras = [x for x in example.doc_text.split("\n") if x]
        paragraphs = [tokenizer.tokenize(x) for x in paras]
        merged_paragraphs = splitter.merge(paragraphs)

        scores = rank(tfidf, [example.question_text], [" ".join(x) for x in merged_paragraphs])
        para_scores = scores[0]
        # NOTE(review): ascending argsort keeps the LOWEST scores, so this
        # presumably expects `rank` to return distances -- confirm.
        para_ranks = np.argsort(para_scores)
        selection = list(para_ranks[:args.n_to_select])
        if args.sort_passage:
            # Restore the paragraphs' original order within the document.
            selection = np.sort(selection)

        doc_tokens = []
        for idx in selection:
            doc_tokens.extend(merged_paragraphs[idx])

        tokenized_answers = [tokenizer.tokenize(x) for x in example.answer_texts]
        detector.set_question(tokenized_answers)
        if len(detector.any_found(doc_tokens)) > 0:
            ir_count += 1

        total_doc_length += sum(len(para) for para in merged_paragraphs)
        pruned_doc_length += len(doc_tokens)

        out.append(DocumentAndQuestion(example_ix, example.qas_id, example.question_text, doc_tokens,
                                       '', 0, 0, True))
        if args.debug and example_ix > 5:
            break

    n_docs = len(out)
    if n_docs:
        print("Recall of answer existence in documents: {:.3f}".format(ir_count / n_docs))
        print("Average length of documents: {:.3f}".format(total_doc_length / n_docs))
        print("Average pruned length of documents: {:.3f}".format(pruned_doc_length / n_docs))
    else:
        # Avoid a ZeroDivisionError when the input file holds no examples.
        print("No examples were processed; skipping statistics")

    output_file = join("data", "squad", "eval_open_{}paras_examples.pkl".format(args.n_to_select))
    # The context manager guarantees the pickle is flushed and closed.
    with open(output_file, 'wb') as output_handle:
        pickle.dump(out, output_handle)