本文整理匯總了Python中SolrClient.SolrClient.totaltermfreq方法的典型用法代碼示例。如果您正苦於以下問題:Python SolrClient.totaltermfreq方法的具體用法?Python SolrClient.totaltermfreq怎麼用?Python SolrClient.totaltermfreq使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類SolrClient.SolrClient的用法示例。
在下文中一共展示了SolrClient.totaltermfreq方法的1個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: TaggingProcessor
# 需要導入模塊: from SolrClient import SolrClient [as 別名]
# 或者: from SolrClient.SolrClient import totaltermfreq [as 別名]
#.........這裏部分代碼省略.........
term_candidates=set()
grammars=['TermCandidate: {'+item+'}' for item in self.load_grammars() if not item.startswith('#')]
sent_tokenize_list = self.sentence_split(content)
for sent_content in sent_tokenize_list:
pos_sent_content=self.linguistic_processor.customised_preprocessing(sent_content)
# print(pos_sent_content)
for candidate_grammar in grammars:
pos_filter_candidates=self.parsing_candidates_regexp(pos_sent_content,candidate_grammar)
term_candidates.update(pos_filter_candidates)
self._logger.debug("term_candidates size after PoS filtering: [%s]", len(term_candidates))
term_candidates = self.linguistic_filter(term_candidates)
# print(term_candidates)
term_candidates = self.frequency_filtering(term_candidates)
self._logger.debug("Term candidate extraction for current doc is completed.")
return term_candidates
def frequency_filtering(self, term_candidates):
    """
    Corpus (whole index) based frequency filtering.

    Looks up the total term frequency (ttf) of every candidate through
    ``self.solrClient.totaltermfreq`` on the field ``self.solr_field_content``
    and keeps the candidates whose ttf is greater than
    ``self._min_term_freq``.

    When ``self._min_term_freq`` is 1 or less there is nothing to filter,
    so all candidates are returned and the index is not queried.

    params:
        term_candidates: set() of candidate terms

    return set, filtered term candidates (always a new set; the input set
    is never modified)
    """
    self._logger.debug("term frequency filtering for candidates [%s] by min frequency [%s] ...", str(len(term_candidates)), str(self._min_term_freq))

    if self._min_term_freq <= 1:
        # No effective threshold: keep every candidate. Leaving the result
        # set empty here would silently discard all terms of the document.
        filtered_term_candidates = set(term_candidates)
    else:
        # totaltermfreq returns two mappings: ttf values keyed by the
        # normalised term, and the normalised form keyed by the raw candidate.
        terms_ttf_dict, normed_terms_dict = self.solrClient.totaltermfreq(self.solr_field_content, term_candidates)

        filtered_term_candidates = set()
        for term in term_candidates:
            tc_ttf = self.get_term_ttf(normed_terms_dict[term], terms_ttf_dict)
            if tc_ttf == 0:
                # A zero ttf means the candidate could not be matched in the
                # index; it is reported and then dropped by the check below.
                self._logger.warning("Error!! term [%s] has no ttf value. Please check tokenisation method for irregular text or the shingling range for the min and max value.", term)
            # NOTE(review): the comparison is strict, so a term whose ttf
            # equals the minimum is dropped - confirm whether ">=" is intended.
            if tc_ttf > self._min_term_freq:
                filtered_term_candidates.add(term)

    self._logger.debug("current term candidate size after frequency filtering [%s]", str(len(filtered_term_candidates)))
    return filtered_term_candidates
def get_term_ttf(self, term, ttf_dict):
    """
    Look up the total term frequency (ttf) of a single term.

    params:
        term: key to look up in ttf_dict
        ttf_dict: ttf dictionary as returned from SolrClient.totaltermfreq

    return the value stored for the term; a KeyError is raised when the
    term is not present in the dictionary
    """
    ttf_value = ttf_dict[term]
    return ttf_value
def check_min_char_limit(self, multiword_term):
"""
return True if none of term unit length less than minimum char length
"""
is_exist_min_char=0
for token in multiword_term.split(' '):
if len(token) < self._min_char_length:
is_exist_min_char+=1
if is_exist_min_char > 0:
return False