This article collects typical usage examples of the Python method tokenizer.Tokenizer.split_query. If you have been wondering how to use Tokenizer.split_query in Python, what it does, or what calling it looks like in practice, the hand-picked code examples below may help. You can also explore further usage examples of the enclosing class, tokenizer.Tokenizer.
One code example of the Tokenizer.split_query method is shown below; examples are sorted by popularity by default. You can upvote the examples you like or find useful, and your votes help the site recommend better Python code examples.
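Judging from the example that follows, Tokenizer.split_query appears to take a query string and return a list of (token_text, position_in_query) pairs, and Tokenizer("ma") appears to select a morphological-analysis mode; both are inferences from the example rather than documented behaviour. A minimal sketch under those assumptions:

from tokenizer import Tokenizer  # project-local module used by the example below

# Assumption: split_query(statement) yields [(token_text, position_in_query), ...]
tokenizer = Tokenizer("ma")
for token_text, position in tokenizer.split_query("full text search query"):
    print(position, token_text)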
Example 1: __init__
# Required import: from tokenizer import Tokenizer [as alias]
# Or: from tokenizer.Tokenizer import split_query [as alias]
from collections import Counter

import zenhan

from tokenizer import Tokenizer

# DocID, Content, STOPWORDS_FILE, NEWWORD_FACTOR and DEBUG are defined elsewhere
# in the project and are omitted from this excerpt.


class Search:
    def __init__(self, ngram, dir):
        self.docID = DocID()
        self.tokenizer = Tokenizer("ma")
        self.content = Content()
        self.ngram = ngram
        self.docID.load(dir + "docid.pickle")
        self.content.load(dir + "content.pickle")
        self.stopwords = self._load_stopwords(STOPWORDS_FILE)

    def zenhan_search(self, statement, numOfResult):
        # search with both the half-width (hankaku) and full-width (zenkaku) forms of the query
        han_statement = zenhan.z2h(statement)
        zen_statement = zenhan.h2z(statement)
        han_list = self.tokenizer.split_query(han_statement)
        zen_list = self.tokenizer.split_query(zen_statement)
        if han_statement != zen_statement:
            to_search = han_list + zen_list
        else:
            to_search = self.tokenizer.split_query(statement)
        return self._search(to_search, numOfResult)

    def normal_search(self, statement, numOfResult):
        tokenized_list = self.tokenizer.split_query(statement)
        return self._search(tokenized_list, numOfResult)

    def _search(self, tokenList, numOfResult):
        frequency_hash = Counter()  # return value: {document_id: score}
        frequency_memoize = dict()  # memoizes query/document offsets used to compute the score
        doc_tok_map = []  # memoizes (document, query token) pairs so the same word is not scored twice
        token_search_index = 0
        # <<< search loop
        for token in tokenList:
            token_content = token[0]  # token text
            token_id = token[1]  # real index of the token in the query statement
            content_list = self.docID.get(token_content)
            for content_data in content_list:
                already_searched = False
                content_id = content_data[0]
                token_doc_index = content_data[1]
                # if the same query token was already seen for this document, mark it so it scores lower
                doc_tok_key = (content_id, token_id)
                if doc_tok_key in doc_tok_map:
                    already_searched = True
                else:
                    doc_tok_map.append(doc_tok_key)
                # calculate score --> customize here
                # format of frequency_memoize entries: (token_content, token_id, token_doc_index, token_search_index)
                if content_id in frequency_hash:
                    if token_content in self.stopwords:
                        continue  # skip stop words
                    # otherwise increase the score
                    if not self._exist_freq_memoize(token_id, frequency_memoize[content_id]):  # token not memoized yet
                        frequency_memoize[content_id].append((token_content, token_id, token_doc_index, token_search_index))
                    # if this word was already searched, increase with a smaller score
                    if already_searched:
                        frequency_hash[content_id] += 1
                    else:
                        frequency_hash[content_id] += NEWWORD_FACTOR
                else:
                    frequency_memoize[content_id] = [(token_content, token_id, token_doc_index, token_search_index)]
                    frequency_hash[content_id] = 1
            token_search_index += 1
        # >>> end of loop
        # increase the score by checking offsets from frequency_memoize (disabled in this version)
        if False:
            # self._print_freq_memoize(frequency_memoize)
            self._cal_score_by_freq_memoize(frequency_memoize, frequency_hash)
        if DEBUG:
            print(frequency_hash.most_common(20))
        # return at most numOfResult results
        frequency_hash_len = len(frequency_hash)
        if numOfResult == "all":
            max_num = frequency_hash_len
        else:
            max_num = frequency_hash_len if numOfResult > frequency_hash_len else numOfResult
        return frequency_hash.most_common(max_num)

    def _exist_freq_memoize(self, token_id, frequency_memoize_item):
        for token_item in frequency_memoize_item:
            if token_id == token_item[1]:
                return True
        return False
#......... part of the code is omitted here .........
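A hypothetical way to drive the Search class above might look like the sketch below; the index directory, the ngram value, the "all" sentinel and the result shape (a list of (document_id, score) pairs from Counter.most_common) are all assumptions drawn from the example rather than a documented API.

# Hypothetical usage of the Search class shown above (paths and arguments are assumptions).
search = Search(ngram=2, dir="./index/")

# normal_search tokenizes the query with Tokenizer.split_query and scores matching documents.
for document_id, score in search.normal_search("検索 エンジン", numOfResult=10):
    print(document_id, score)

# zenhan_search additionally tries the half-width/full-width variants of the query.
results = search.zenhan_search("テスト", numOfResult="all")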