This article collects typical usage examples of Python's flashtext.KeywordProcessor. If you are unsure how flashtext.KeywordProcessor is used in practice, the curated examples below should help; you can also explore the flashtext module for further usage examples.
Below are 15 code examples of flashtext.KeywordProcessor, sorted by popularity by default.
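Before the examples, a minimal self-contained quickstart for the core flashtext API (based on flashtext's documented usage):

from flashtext import KeywordProcessor

keyword_processor = KeywordProcessor()  # case-insensitive by default
keyword_processor.add_keyword('Big Apple', 'New York')  # surface form -> clean name
keyword_processor.add_keyword('NYC', 'New York')
print(keyword_processor.extract_keywords('I love Big Apple and NYC.'))
# ['New York', 'New York']
print(keyword_processor.replace_keywords('I love Big Apple.'))
# 'I love New York.'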
Example 1: _build_kp
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def _build_kp(self, case_sensitive=True):
    ## Prepare the flashtext keyword processor
keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
id_to_key_dict = load_keyword_dict_v1_3(
config.DATA_ROOT / "id_dict.jsonl", filtering=True)
exact_match_rule_dict = set_priority(id_to_key_dict, priority=5.0)
noisy_key_dict = id_dict_key_word_expand(id_to_key_dict,
create_new_key_word_dict=True)
noisy_parenthese_rule_dict = set_priority(noisy_key_dict, priority=1.0)
    build_processor(keyword_processor, exact_match_rule_dict)
    build_processor(keyword_processor, noisy_parenthese_rule_dict)
    ## Remove pure digits, ordinals, and stop words from the processor
KeywordRuleBuilder.eliminate_pure_digits_in_place(keyword_processor)
KeywordRuleBuilder.eliminate_ordinals_in_place(keyword_processor)
KeywordRuleBuilder.eliminate_stop_words_in_place(keyword_processor)
return keyword_processor
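The KeywordRuleBuilder helpers are project-specific. As a rough illustration, a rule like eliminate_pure_digits_in_place might simply drop pure-digit surface forms; a hypothetical re-implementation using only standard flashtext calls:

from flashtext import KeywordProcessor

def eliminate_pure_digits_in_place(keyword_processor: KeywordProcessor) -> None:
    # Hypothetical sketch, not the project's code: drop every registered
    # surface form that consists solely of digits.
    for term in list(keyword_processor.get_all_keywords()):
        if term.isdigit():
            keyword_processor.remove_keyword(term)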
Example 2: __init__
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def __init__(self, keywords_list=None, keywords_dict=None, keywords_file=None,
             label='', case_sensitive=False,
             attrs=('has_entities', 'is_entity', 'entity_desc', 'entities', 'canonical')):
    """Initialise the pipeline component."""
    self._has_entities, self._is_entity, self._entity_desc, self._entities, self.canonical = attrs
    # Set up the KeywordProcessor (avoid mutable default arguments)
    self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
    self.keyword_processor.add_keywords_from_list(keywords_list or [])
    self.keyword_processor.add_keywords_from_dict(keywords_dict or {})
    if keywords_file:
        self.keyword_processor.add_keyword_from_file(keywords_file)
self.label = label
    # Register attributes on the Doc and Span
Doc.set_extension(self._has_entities, getter=self.has_entities, force=True)
Doc.set_extension(self._entities, getter=self.iter_entities, force=True)
Span.set_extension(self._has_entities, getter=self.has_entities, force=True)
Span.set_extension(self._entities, getter=self.iter_entities, force=True)
    # Register attributes on the Token
Token.set_extension(self._is_entity, default=False, force=True)
Token.set_extension(self._entity_desc, getter=self.get_entity_desc, force=True)
Token.set_extension(self.canonical, default=None, force=True)
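A hedged usage sketch, assuming this __init__ belongs to a spaCy v2-style pipeline component (the class name EntityMatcher is hypothetical, and a real component would also need a __call__ that marks matched tokens):

import spacy

nlp = spacy.load('en_core_web_sm')
matcher = EntityMatcher(keywords_dict={'New York': ['Big Apple', 'NYC']}, label='GPE')
nlp.add_pipe(matcher, last=True)  # spaCy v2-style pipe registration
doc = nlp('I love the Big Apple.')
print(doc._.has_entities)  # custom Doc extension registered in __init__ above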
Example 3: __init__
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def __init__(self,
language='english',
preprocess_type='nltk',
stopwords_remove=True,
length_limit=10,
debug=False):
self.language = language
self.preprocess_type = preprocess_type
self.stopwords_remove = stopwords_remove
self.length_limit = length_limit
self.debug = debug
if stopwords_remove:
stopword_remover = flashtext.KeywordProcessor()
for stopword in stopwords.words(self.language):
stopword_remover.add_keyword(stopword, '')
self.stopword_remover = stopword_remover
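One caveat, hedged against flashtext's current behaviour: add_keyword treats a falsy clean name as "use the keyword itself", so registering stopwords with '' (as above) may make replace_keywords a no-op; mapping each stopword to a single space is the safer way to blank them out. A standalone sketch:

import flashtext
from nltk.corpus import stopwords  # requires the NLTK stopwords corpus

remover = flashtext.KeywordProcessor()
for stopword in stopwords.words('english'):
    # Use ' ' rather than '': a falsy clean name makes flashtext keep the keyword.
    remover.add_keyword(stopword, ' ')
print(remover.replace_keywords('this is a fast keyword matcher'))
# stopwords are blanked out, leaving extra whitespace behind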
Example 4: get_kwterm_matching
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def get_kwterm_matching(kw_terms, d_list, chunk_size=10_000_000):
    kw_terms = list(kw_terms)
    kw_terms_total_size = len(kw_terms)
    for start in range(0, kw_terms_total_size, chunk_size):
        print(start, start + chunk_size)
        current_kw_terms = kw_terms[start:start + chunk_size]
keyword_processor = KeywordProcessor(case_sensitive=True)
for word in tqdm(current_kw_terms):
keyword_processor.add_keyword(word)
for item in tqdm(d_list):
query = item['question']
terms = query_get_terms(query, keyword_processor)
if 'kw_matches' not in item:
item['kw_matches'] = []
item['kw_matches'].extend(terms)
del keyword_processor
return d_list
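query_get_terms is project code; a plausible minimal stand-in (an assumption, not the original implementation) would simply run extract_keywords over the question:

def query_get_terms(query, keyword_processor):
    # Hypothetical stand-in: return every keyword match found in the query.
    return keyword_processor.extract_keywords(query)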
Example 5: test_remove_keywords_dictionary_compare
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def test_remove_keywords_dictionary_compare(self):
"""For each of the test case initialize a new KeywordProcessor.
Add the keywords the test case to KeywordProcessor.
Remove the keywords in remove_keyword_dict
Extract keywords and check if they match the expected result for the test case.
"""
for test_id, test_case in enumerate(self.test_cases):
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
keyword_processor.remove_keywords_from_dict(test_case['remove_keyword_dict'])
keyword_trie_dict = keyword_processor.keyword_trie_dict
new_dictionary = defaultdict(list)
for key, values in test_case['keyword_dict'].items():
for value in values:
                if not (key in test_case['remove_keyword_dict'] and value in test_case['remove_keyword_dict'][key]):
new_dictionary[key].append(value)
keyword_processor_two = KeywordProcessor()
keyword_processor_two.add_keywords_from_dict(new_dictionary)
keyword_trie_dict_two = keyword_processor_two.keyword_trie_dict
self.assertTrue(keyword_trie_dict == keyword_trie_dict_two,
"keywords_extracted don't match the expected results for test case: {}".format(test_id))
Example 6: test_extract_keywords
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def test_extract_keywords(self):
"""For each of the test case initialize a new KeywordProcessor.
Add the keywords the test case to KeywordProcessor.
Extract keywords and check if they match the expected result for the test case.
"""
for test_id, test_case in enumerate(self.test_cases):
keyword_processor = KeywordProcessor()
for key in test_case['keyword_dict']:
keyword_processor.add_keywords_from_list(test_case['keyword_dict'][key])
keywords_extracted = keyword_processor.extract_keywords(test_case['sentence'], span_info=True)
for kwd in keywords_extracted:
            # the lowercased keyword should match the span from the sentence
self.assertEqual(
kwd[0].lower(), test_case['sentence'].lower()[kwd[1]:kwd[2]],
"keywords span don't match the expected results for test case: {}".format(test_id))
Example 7: test_correct_keyword_on_deletion
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def test_correct_keyword_on_deletion(self):
"""
Test for simple deletions using the levensthein function
We ensure we end up on the right node in the trie when starting from the current node
"""
keyword_proc = KeywordProcessor()
keyword_proc.add_keyword('skype')
current_dict = {'y': {'p': {'e': {'_keyword_': 'skype'}}}}
closest_node, cost, depth = next(
keyword_proc.levensthein('pe', max_cost=1, start_node=current_dict),
({}, 0, 0),
)
self.assertDictEqual(closest_node, current_dict['y']['p']['e'])
self.assertEqual(cost, 1)
self.assertEqual(depth, 3)
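levensthein is the internal building block for fuzzy matching; at the public level, recent flashtext releases (2.7+) accept a max_cost argument. A sketch assuming such a version is installed:

from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keyword('skype')
print(kp.extract_keywords('call me on skpe', max_cost=1))
# ['skype'], since 'skpe' is within edit distance 1 of 'skype'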
Example 8: test_correct_keyword_on_substitution
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def test_correct_keyword_on_substitution(self):
"""
Test for simple substitions using the levensthein function
We ensure we end up on the right node in the trie when starting from the current node
"""
keyword_proc = KeywordProcessor()
for keyword in (('skype', 'messenger'),):
keyword_proc.add_keyword(*keyword)
current_dict = keyword_proc.keyword_trie_dict['s']['k']
closest_node, cost, depth = next(
keyword_proc.levensthein('ope', max_cost=1, start_node=current_dict),
({}, 0, 0)
)
self.assertDictEqual(closest_node, current_dict['y']['p']['e'])
self.assertEqual(cost, 1)
self.assertEqual(depth, 3)
Example 9: test_term_in_dictionary
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def test_term_in_dictionary(self):
keyword_processor = KeywordProcessor()
keyword_processor.add_keyword('j2ee', 'Java')
keyword_processor.add_keyword('colour', 'color')
    self.assertEqual(keyword_processor.get_keyword('j2ee'),
                     'Java',
                     "get_keyword didn't return expected Keyword")
self.assertEqual(keyword_processor['colour'],
'color',
"get_keyword didn't return expected Keyword")
self.assertEqual(keyword_processor['Test'],
None,
"get_keyword didn't return expected Keyword")
self.assertTrue('colour' in keyword_processor,
"get_keyword didn't return expected Keyword")
self.assertFalse('Test' in keyword_processor,
"get_keyword didn't return expected Keyword")
Example 10: test_remove_keywords_dictionary_len
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def test_remove_keywords_dictionary_len(self):
"""For each of the test case initialize a new KeywordProcessor.
Add the keywords the test case to KeywordProcessor.
Remove the keywords in remove_keyword_dict
Extract keywords and check if they match the expected result for the test case.
"""
for test_id, test_case in enumerate(self.test_cases):
keyword_processor = KeywordProcessor()
keyword_processor.add_keywords_from_dict(test_case['keyword_dict'])
keyword_processor.remove_keywords_from_dict(test_case['remove_keyword_dict'])
kp_len = len(keyword_processor)
new_dictionary = defaultdict(list)
for key, values in test_case['keyword_dict'].items():
for value in values:
                if not (key in test_case['remove_keyword_dict'] and value in test_case['remove_keyword_dict'][key]):
new_dictionary[key].append(value)
keyword_processor_two = KeywordProcessor()
keyword_processor_two.add_keywords_from_dict(new_dictionary)
kp_len_two = len(keyword_processor_two)
        self.assertEqual(kp_len, kp_len_two,
                         "keyword processor length doesn't match for test case: {}".format(test_id))
Example 11: used_func_for_fast_key_word_matching_expanded_kw
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def used_func_for_fast_key_word_matching_expanded_kw():
"""
Added on July 1.
:return:
"""
# Load tokenizer
path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
#
keyword_processor = KeywordProcessor(case_sensitive=True)
id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")
id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=False)
# exit(-2)
# Write this in a for loop to keep track of the progress
build_flashtext_processor_wit(keyword_processor, id_to_key_dict)
# Load data for predicting
d_list = load_data(config.FEVER_DEV_JSONL)
sample_answer(d_list, tok, keyword_p=keyword_processor)
    # Save the results for evaluation
out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
save_intermidiate_results(d_list, out_filename=out_fname)
# Evaluating
# out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
# out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
# d_list = load_data(out_fname)
eval_mode = {'check_doc_id_correct': True, 'standard': False}
# print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
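build_flashtext_processor is project code, but the commented-out loop in Example 12 below shows its likely shape: each surface keyword is registered with its canonical page id as the clean name.

from tqdm import tqdm

def build_flashtext_processor(keyword_processor, id_to_key_dict):
    # Mirrors the inline loop shown (commented out) in Example 12.
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))
        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)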
Example 12: used_func_for_fast_key_word_matching_expanded_kw
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def used_func_for_fast_key_word_matching_expanded_kw():
"""
Added on July 1.
:return:
"""
# Load tokenizer
# path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
# drqa.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
# tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
#
# keyword_processor = KeywordProcessor(case_sensitive=True)
# id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")
# id_dict_key_word_expand(id_to_key_dict)
# exit(-2)
# Write this in a for loop to keep track of the progress
# build_flashtext_processor(keyword_processor, id_to_key_dict)
# for clean_name, keywords in tqdm(id_to_key_dict.items()):
# if not isinstance(keywords, list):
# raise AttributeError("Value of key {} should be a list".format(clean_name))
#
# for keyword in keywords:
# keyword_processor.add_keyword(keyword, clean_name)
# Load data for predicting
# d_list = load_data(config.FEVER_DEV_JSONL)
# sample_answer(d_list, tok, keyword_p=keyword_processor)
    # Save the results for evaluation
# out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
# save_intermidiate_results(d_list, out_filename=out_fname)
# Evaluating
out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
# out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
d_list = load_data(out_fname)
eval_mode = {'check_doc_id_correct': True, 'standard': False}
# print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
Example 13: used_func_for_fast_key_word_matching_expanded_kw
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def used_func_for_fast_key_word_matching_expanded_kw():
"""
Added on July 1.
:return:
"""
# Load tokenizer
path_stanford_corenlp_full_2017_06_09 = str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
drqa_yixin.tokenizers.set_default('corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])
#
keyword_processor = KeywordProcessor(case_sensitive=True)
id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")
id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=False)
# exit(-2)
# Write this in a for loop to keep track of the progress
build_flashtext_processor(keyword_processor, id_to_key_dict)
# Load data for predicting
d_list = load_data(config.FEVER_DEV_JSONL)
sample_answer(d_list, tok, keyword_p=keyword_processor)
    # Save the results for evaluation
out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
save_intermidiate_results(d_list, out_filename=out_fname)
# Evaluating
# out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
# out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
# d_list = load_data(out_fname)
eval_mode = {'check_doc_id_correct': True, 'standard': False}
# print(fever_score(d_list, d_list, mode=eval_mode, error_analysis_file=Path(out_fname).parent / "analysis.log"))
print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))
Example 14: __init__
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def __init__(self, substrings, text_datasource=None, case_sensitive=False,
exclusions=None, name=None):
self.word_processor = KeywordProcessor(case_sensitive=case_sensitive)
    if exclusions is not None:
        # Drop excluded substrings before registering the keywords
        substrings = list(set(substrings).difference(set(exclusions)))
self.word_processor.add_keywords_from_list(substrings)
name = self._format_name(name, [substrings, text_datasource])
super().__init__(name, self.process, depends_on=[text_datasource])
Example 15: toy_init_results
# Required import: import flashtext [as alias]
# Or: from flashtext import KeywordProcessor [as alias]
def toy_init_results():
ner_set = get_title_entity_set()
dev_fullwiki_list = common.load_json(config.DEV_FULLWIKI_FILE)
print(len(dev_fullwiki_list))
keyword_processor = KeywordProcessor(case_sensitive=True)
print("Build Processor")
for kw in tqdm(ner_set):
        if kw.lower() in STOPWORDS or filter_document_id(kw):
            continue  # skip stopwords and titles filtered out by filter_document_id
        keyword_processor.add_keyword(kw, {kw})
doc_pred_dict = {'sp_doc': dict()}
for item in tqdm(dev_fullwiki_list):
question = item['question']
qid = item['_id']
finded_keys = keyword_processor.extract_keywords(question)
finded_keys_set = set()
if isinstance(finded_keys, list) and len(finded_keys) != 0:
finded_keys_set = set.union(*finded_keys)
        # Add-on: keep only the top-2 retrieved documents (longest titles first)
finded_keys_set = sorted(list(finded_keys_set), key=lambda x: len(x), reverse=True)
top_n = 2
finded_keys_set = finded_keys_set[:top_n]
doc_pred_dict['sp_doc'][qid] = list(finded_keys_set)
common.save_json(doc_pred_dict, "toy_doc_rm_stopword_top2_pred_file.json")
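Because each keyword's clean name is stored as a set ({kw}), extract_keywords returns a list of sets that can be merged with set.union, exactly as done above. A minimal standalone sketch:

from flashtext import KeywordProcessor

kp = KeywordProcessor(case_sensitive=True)
kp.add_keyword('Barack Obama', {'Barack Obama'})  # the clean name may be any object, here a set
matches = kp.extract_keywords('Barack Obama visited Paris')
merged = set.union(*matches) if matches else set()
print(merged)  # {'Barack Obama'}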