This page collects typical usage examples of the Python method SolrClient.SolrClient.query_indexed_terms_by_docId. If you are unsure what SolrClient.query_indexed_terms_by_docId does, how to use it, or want to see it in context, the curated code examples below may help. You can also explore the usage of the class it belongs to, SolrClient.SolrClient.
The following presents 1 code example of the SolrClient.query_indexed_terms_by_docId method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: TaggingProcessor
# Required import: from SolrClient import SolrClient [as alias]
# Or: from SolrClient.SolrClient import query_indexed_terms_by_docId [as alias]
#......... part of the code is omitted here .........
            elif node_b[1] == 'DT':
                # only append a determiner if it is in the middle of the phrase, e.g., "ratio of the tensile"
                term_tokens.append('' if len(term_tokens) == 0 else node_b[0])
                # continue
            else:
                term_tokens.append('' if len(term_tokens) == 0 else ' ')
                term_tokens.append(node_b[0])
        term_candidates.add(''.join(term_tokens))
    return term_candidates
def sentence_split(self, content):
    """
    Heuristic/pattern-based sentence splitting (e.g., on '\r\n' or '\t') followed by NLTK's recommended sentence tokenizer.
    return list, sentence list
    """
    # split on carriage returns, newlines and tabs before sentence tokenisation
    pattern_split = re.compile(r"[\r\n\t]")
    sent_list = pattern_split.split(content.strip())
    sent_list = [sent_tokenize(sent.strip()) for sent in sent_list if sent.strip()]
    # flatten the nested sentence lists
    sent_list = [item for sublist in sent_list for item in sublist]
    return sent_list
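    # Illustrative note (not part of the original source): for an input such as
    #   "Alloy was heated. Tensile tests followed.\r\nResults are below."
    # the method first splits on \r, \n and \t, then applies sent_tokenize to each
    # fragment, yielding
    #   ['Alloy was heated.', 'Tensile tests followed.', 'Results are below.']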
def term_dictionary_tagging(self, doc_id):
    """
    Tag content against the statistical term dictionary.
    return set, term set to be indexed
    """
    self._logger.debug("term dictionary tagging for single document ...")
    indexed_terms = self.solrClient.query_indexed_terms_by_docId(doc_id, self.solr_field_content)
    indexed_terms = set(indexed_terms.keys())
    with MultiprocPool(processes=int(self.parallel_workers)) as pool:
        tagged_terms = pool.starmap(term_async_comparison,
                                    [(indexed_term, self.dict_terms, self.dict_tagger_fuzzy_matching,
                                      self.dict_terms_trie, self.dict_tagger_sim_threshold)
                                     for indexed_term in indexed_terms])
    tagged_terms = set(filter(None, set(tagged_terms)))
    self._logger.debug("final dictionary tagged terms size: [%s]", str(len(tagged_terms)))
    self._logger.debug("Term candidate extraction for current doc is completed.")
    return tagged_terms
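    # Note (illustrative, not part of the original class): term_async_comparison is a
    # module-level worker defined elsewhere in this project. Judging from the arguments
    # passed above, it compares one indexed term against the dictionary terms
    # (optionally with fuzzy/trie-based matching under a similarity threshold) and
    # returns the matched dictionary term, or None when nothing matches. A minimal,
    # hypothetical worker with a comparable role could look like:
    #
    #   from difflib import SequenceMatcher
    #
    #   def _example_term_comparison(indexed_term, dict_terms, fuzzy_matching,
    #                                dict_terms_trie, sim_threshold):
    #       if indexed_term in dict_terms:
    #           return indexed_term
    #       if fuzzy_matching:
    #           for dict_term in dict_terms:
    #               if SequenceMatcher(None, indexed_term, dict_term).ratio() >= sim_threshold:
    #                   return dict_term
    #       return None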
def term_candidate_extraction(self, content):
    """
    Sentence-based term candidate extraction. The content needs to be tokenised and sentence-split before parsing.
    params:
        content: content string to be analysed
    return set, term candidates extracted from the content
    """
    self._logger.debug("term candidate extraction for single document...")
    term_candidates = set()
    # wrap each grammar rule (skipping comment lines) into a TermCandidate chunk pattern
    grammars = ['TermCandidate: {' + item + '}' for item in self.load_grammars() if not item.startswith('#')]
    sent_tokenize_list = self.sentence_split(content)
    for sent_content in sent_tokenize_list:
        pos_sent_content = self.linguistic_processor.customised_preprocessing(sent_content)
        # print(pos_sent_content)
        for candidate_grammar in grammars:
            pos_filter_candidates = self.parsing_candidates_regexp(pos_sent_content, candidate_grammar)
            term_candidates.update(pos_filter_candidates)
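The helper parsing_candidates_regexp used above is not shown in this excerpt. As a rough, hypothetical sketch, assuming pos_sent_content is a list of (token, POS-tag) tuples and candidate_grammar is one of the 'TermCandidate: {...}' strings built above, it could be implemented with NLTK's RegexpParser along these lines (simplified: it joins the tokens of each matched chunk, whereas the omitted fragment earlier in the class assembles term_tokens with special handling for determiners):

import nltk

def parsing_candidates_regexp_sketch(pos_sent_content, candidate_grammar):
    """Hypothetical sketch: chunk POS-tagged tokens with a TermCandidate grammar."""
    parser = nltk.RegexpParser(candidate_grammar)
    tree = parser.parse(pos_sent_content)
    candidates = set()
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'TermCandidate'):
        # join the surface tokens of each matched chunk into a candidate phrase
        candidates.add(' '.join(token for token, pos in subtree.leaves()))
    return candidates

# example usage with a simple adjective/noun grammar
tagged = [('the', 'DT'), ('tensile', 'JJ'), ('strength', 'NN'), ('increased', 'VBD')]
print(parsing_candidates_regexp_sketch(tagged, 'TermCandidate: {<JJ>*<NN.*>+}'))
# -> {'tensile strength'}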