

Python Tokenizer.split_query Method Code Examples

This article collects typical usage examples of the Python method tokenizer.Tokenizer.split_query. If you are wondering how Tokenizer.split_query works, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore other usage examples of the containing class, tokenizer.Tokenizer.


One code example of the Tokenizer.split_query method is shown below; examples are sorted by popularity by default. You can upvote the examples you find useful, and your feedback helps the system recommend better Python code examples.
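Judging from how the Search._search method below unpacks each token as token[0] (surface string) and token[1] (position in the query), split_query appears to return a list of (token, index) pairs. A minimal sketch of calling it directly under that assumption (the "ma" analyzer name is taken from the example's constructor; the return format itself is inferred here, not documented by the project):

# Hypothetical direct usage of split_query; the (surface, index) pair
# format is inferred from how Search._search consumes the result below.
from tokenizer import Tokenizer

tokenizer = Tokenizer("ma")
for surface, query_index in tokenizer.split_query("全文検索エンジン"):
  print(surface, query_index)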

Example 1: __init__

# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import split_query [as alias]
# The snippet also uses collections.Counter, the zenhan module, the project-local
# DocID and Content classes, and the STOPWORDS_FILE, NEWWORD_FACTOR and DEBUG
# constants, presumably defined in the omitted parts of search.py.
class Search:
  def __init__(self, ngram, dir):
    self.docID = DocID()
    self.tokenizer = Tokenizer("ma")
    self.content = Content()
    self.ngram = ngram
    self.docID.load(dir + "docid.pickle")
    self.content.load(dir + "content.pickle")
    self.stopwords = self._load_stopwords(STOPWORDS_FILE)
  
  def zenhan_search(self, statement, numOfResult):
    han_statement = zenhan.z2h(statement)  #convert full-width characters to half-width
    zen_statement = zenhan.h2z(statement)  #convert half-width characters to full-width
    
    han_list = self.tokenizer.split_query(han_statement)
    zen_list = self.tokenizer.split_query(zen_statement)
    
    #if the two conversions differ, search the tokens of both variants;
    #otherwise the statement has no convertible characters and one pass is enough
    if han_statement != zen_statement:
      to_search = han_list + zen_list
    else:
      to_search = self.tokenizer.split_query(statement)

    return self._search(to_search, numOfResult)

  def normal_search(self, statement, numOfResult):
    tokenized_list = self.tokenizer.split_query(statement)
    return self._search(tokenized_list, numOfResult)

  def _search(self, tokenList, numOfResult):
    frequency_hash = Counter()  #return value: {document_id: score}
    frequency_memoize = dict()  #memoizes query offsets and document offsets used to calculate the score
    doc_tok_map = []            #memoizes (document_id, query_token_index) pairs so the same word is not scored twice

    token_search_index = 0
    
    #<<<search loop
    for token in tokenList:
      token_content = token[0]  #token surface string
      token_id = token[1]       #index of the token in the original query statement

      content_list = self.docID.get(token_content)

      for content_data in content_list:
        already_searched = False 

        content_id = content_data[0]
        token_doc_index = content_data[1]
        
        #if this (document, query-token) pair was already seen, mark it so that
        #it only gets the smaller score increment below
        pair = (content_id, token_id)
        if pair in doc_tok_map:
          already_searched = True
        else:
          doc_tok_map.append(pair)

        #calculate score --> customize here
        #entries of frequency_memoize[content_id] have the form
        #(token_content, token_id, token_doc_index, token_search_index)
        if content_id in frequency_hash:
          if token_content in self.stopwords: continue  #skip stop words on repeat hits

          #otherwise increase the score
          if not self._exist_freq_memoize(token_id, frequency_memoize[content_id]):  #record each query token only once per document
            frequency_memoize[content_id].append((token_content, token_id, token_doc_index, token_search_index))
          
          #a pair that was already seen gets the smaller increment; a new pair is boosted by NEWWORD_FACTOR
          if already_searched:
            frequency_hash[content_id] += 1
          else:
            frequency_hash[content_id] += NEWWORD_FACTOR

        else:
          frequency_memoize[content_id] = [(token_content, token_id, token_doc_index, token_search_index)]
          frequency_hash[content_id] = 1
      token_search_index += 1
    #>>>endloop

    #offset-based score adjustment from frequency_memoize (currently disabled)
    if False:
      #self._print_freq_memoize(frequency_memoize)
      self._cal_score_by_freq_memoize(frequency_memoize, frequency_hash)
    
    if DEBUG:
      print(frequency_hash.most_common(20))

    #cap the number of returned results at numOfResult
    frequency_hash_len = len(frequency_hash)

    if numOfResult == "all":
      max_num = frequency_hash_len
    else:
      max_num = frequency_hash_len if numOfResult > frequency_hash_len else numOfResult
    
    return frequency_hash.most_common(max_num)
  
  def _exist_freq_memoize(self, token_id, frequency_memoize_item):
    #True if this query-token index is already recorded for the document
    for token_item in frequency_memoize_item:
      if token_id == token_item[1]:
        return True
    return False
    

#......... part of the code is omitted here .........
Developer: huydx, Project: fulltext_engine, Lines of code: 103, Source file: search.py
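
For context, a rough sketch of how the Search class above might be driven (the index directory path, the ngram value, and the query strings are assumptions for illustration; the class expects docid.pickle and content.pickle files produced elsewhere in the project):

# Hypothetical usage sketch; assumes an index directory containing
# docid.pickle and content.pickle built by the rest of fulltext_engine.
searcher = Search(2, "./index/")

# plain search over the tokenized query, top 10 documents
results = searcher.normal_search("東京 天気", 10)

# search that also tries full-width/half-width variants of the query
zenhan_results = searcher.zenhan_search("ABC123", "all")

# each result is a (document_id, score) pair from Counter.most_common
for document_id, score in results:
  print(document_id, score)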


Note: The tokenizer.Tokenizer.split_query examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs, and the snippets are taken from open-source projects contributed by various developers. Copyright of the source code remains with the original authors; please refer to the corresponding project's License before redistributing or using it. Do not reproduce without permission.