This article collects typical usage examples of the Python method org.apache.lucene.search.IndexSearcher.setSimilarity. If you are wondering how to use IndexSearcher.setSimilarity in Python, or are looking for concrete examples of it, the curated snippets below should help. You can also read further about the containing class, org.apache.lucene.search.IndexSearcher.
The following shows 14 code examples of IndexSearcher.setSimilarity, sorted by popularity by default.
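All 14 examples share the same core pattern: open an IndexSearcher over an index and install a Similarity before searching. Here is a minimal, self-contained sketch of that pattern; the index path is a placeholder, and the imports assume the PyLucene 4.x-era API used throughout the examples below.

import lucene
from java.io import File
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
# Open a reader over an existing index directory (placeholder path).
directory = SimpleFSDirectory(File('/path/to/index'))
searcher = IndexSearcher(DirectoryReader.open(directory))
# Install BM25 scoring; call this before searcher.search().
searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # Lucene defaults: k1=1.2, b=0.75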
Example 1: getIndexSearcher
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def getIndexSearcher(self):
    indexSearcher = IndexSearcher(self.mIndexReader)
    if self.mSimilarity is not None:
        indexSearcher.setSimilarity(self.mSimilarity)
    return indexSearcher
Example 2: IndexAndTaxonomy
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class IndexAndTaxonomy(object):

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher
        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])
        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
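A hedged usage sketch for the class above; the `Settings` fields (similarity, numberOfConcurrentTasks, multithreaded) are inferred from the attributes the class reads, and the directory objects `indexDir`/`taxoDir` are assumed to exist:

class Settings(object):
    # assumed shape, inferred from what IndexAndTaxonomy.__init__ reads
    similarity = BM25Similarity(1.2, 0.75)
    numberOfConcurrentTasks = 4
    multithreaded = False

idx = IndexAndTaxonomy(Settings(), indexDirectory=indexDir, taxoDirectory=taxoDir)
searcher = idx.searcher  # built lazily; setSimilarity() is applied here
# Changing settings marks the searcher stale, so the next access rebuilds it:
idx._readerSettingsWrapper.set(similarity={"k1": 1.5, "b": 0.75})
searcher = idx.searcher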
Example 3: config
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1=2.0, b=0.75 (Lucene's defaults are k1=1.2, b=0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer
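A hedged usage sketch for config(); the field name 'text' and the query string are illustrative assumptions, not part of the original project:

searcher, analyzer = config()
query = QueryParser(Version.LUCENE_CURRENT, 'text', analyzer).parse(u'query terms')
for hit in searcher.search(query, 10).scoreDocs:
    print searcher.doc(hit.doc), hit.score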
Example 4: lucene_retrieval_multifield
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string for each field,
    i.e. not the same words on every field
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return document+score pairs sorted by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()
    # BooleanClause.Occur:
    # MUST means the clause must occur in matching documents;
    # SHOULD means the clause should (but need not) occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # todo: tune BM25 parameters k1, b
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Author: rarezhang, project: allen-ai-science-challenge, lines: 59, source: question_classification_subject_feature.py
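For contrast with the two SHOULD clauses above, a short hedged sketch of a stricter combination (same names as in the example; this variant is an illustration, not part of the original project):

query = BooleanQuery()
query.add(text_query, BooleanClause.Occur.MUST)       # question text must match
query.add(subject_query, BooleanClause.Occur.SHOULD)  # subject match only boosts the score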
Example 5: lucene_retrieval
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return document+score pairs sorted by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via the escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # todo: tune BM25 parameters k1, b
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example 6: __init__
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def __init__(self, index_dir, index_file, rawQuery):
    self.indexFile = os.path.join(index_dir, index_file)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # uncomment when running Retrieve separately
    directory = SimpleFSDirectory(File(self.indexFile))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # set BM25 as the similarity metric, k1=1.2, b=0.75
    if 'Standard' in self.indexFile:
        print "Use the StandardAnalyzer"
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # a standard analyzer with default stop words
    if 'Porter' in self.indexFile:
        print "Use the PorterStemmer analyzer"
        analyzer = PorterStemmerAnalyzer()
    self.run(searcher, analyzer, rawQuery)
    del searcher
Example 7: lucene_retrieval
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            # score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'
    # escape special characters via the escape function
    if q_string and q_string.strip():  # when pre-processing answers, `none of the above` -> '' causes an error here
        # print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # todo: tune BM25 parameters k1, b
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hits
        result = doc_text(hs)
        # reader.close()
    return result  # text: also nodes
Author: rarezhang, project: allen-ai-science-challenge, lines: 48, source: network_feature_index_retrieval_nodes.py
Example 8: get_sorted_results
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def get_sorted_results(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)
    reader = IndexReader.open(self.directory)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    topDocs = searcher.search(parsed_query, 10)
    j = 0
    for i in topDocs.scoreDocs:
        d = searcher.doc(i.doc)
        print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
        j += 1
Example 9: LuceneRetrieval
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class LuceneRetrieval(BaseRetrieval):
    """
    Encapsulates the Lucene retrieval engine
    """
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path = index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)
        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser = QueryParser
        else:
            self.query_parser = FieldAgnosticQueryParser
        if use_default_similarity:
            similarity = DefaultSimilarity()
            self.useExplainQuery = False
        else:
            similarity = FieldAgnosticSimilarity()
            self.useExplainQuery = True
            # by default, FieldAgnosticSimilarity uses the coord factor; it can be disabled:
            ## similarity.useCoord = False
        self.searcher.setSimilarity(similarity)
        self.method = method  # never used?
        self.logger = logger

    def runQueryViaExplain(self, query, max_results):
        """
        Really crappy solution to make sure that explanations and searches are the same
        while I fix Lucene
        """
        Hit = namedtuple("Hit", ["doc", "score"])
        results = []
        for index in range(self.reader.numDocs()):
            explanation = self.searcher.explain(query, index)
            score = explanation.getValue()
            ## match = re.search(r"(.*?)\s=", explanation.toString(), re.IGNORECASE | re.DOTALL)
            ## if match:
            ##     score = float(match.group(1))
            ## heapq.heappush(results, hit)
            results.append(Hit(doc=index, score=score))
        results.sort(key=lambda x: x.score, reverse=True)
        if max_results < self.reader.numDocs():
            results = results[:max_results]
        return results

    def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
        """
        LOTS OF SWEET LUCENE
        """
        original_query = structured_query
        if not structured_query or len(structured_query) == 0:
            return []
        self.last_query = structured_query
        query_text = self.rewriteQuery(structured_query["structured_query"], ["text"])
        try:
            query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
        except:
            print("Lucene exception:", sys.exc_info()[:2])
            return None
        structured_query["lucene_query"] = query_text
        if self.useExplainQuery:
            # this should only exist until I fix the lucene bulkScorer to give the same results
            hits = self.runQueryViaExplain(query, max_results)
        else:
            collector = TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs
        ## print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
        res = []
        ## if len(hits.scoreDocs) == 0:
        ##     print "Original query:", original_query
        ##     print "Query:", query
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata = json.loads(doc.get("metadata"))
            res.append((hit.score, metadata))
        return res
#......... remaining code omitted .........
Example 10: __init__
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class SearchIndex:

    def __init__(self, indexPath):
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        print 'lucene', lucene.VERSION
        # initialize the index
        self.INDEX_DIR = indexPath  # "Clue_Index"
        self.results = None
        self.searcher = IndexSearcher(DirectoryReader.open(
            SimpleFSDirectory(File(self.INDEX_DIR))))
        self.searcher.setSimilarity(BM25Similarity())

    def initializeAnalyzer(self):
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT, JavaSet(stopSet))
        sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
        for entry in stopSet:
            sSet.add(entry)
        self.stopSet = sSet
        # self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, sSet)
        self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    def getTopDocuments(self, query, limit, sfield, dfield):
        queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                               self.analyzer).parse(query)
        print queryObj
        scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
        print '%s total matching documents.' % len(scoreDocs)
        self.results = scoreDocs
        rresults = []
        i = 0
        # reader = self.searcher.getIndexReader()
        # print type(reader)
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            rresults.append((doc.get(dfield), scoreDoc.score))
            # rresults.append(doc.get(dfield))
            i += 1
            if i == limit:
                break
        return rresults
        # print 'path:', doc.get("URL"), 'name:', doc.get("id"), 'title:', doc.get("title")

    def getTopDocumentsWithExpansion(self, query, expTerms, limit, sfield, dfield):
        print expTerms
        query = query + ' ' + ' '.join('{0}^{1}'.format(x[0], round(x[1], 2))
                                       for x in expTerms)
        sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
        for entry in expTerms:
            sSet.add(entry[0])
        analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, self.stopSet, sSet)
        queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                               analyzer).parse(query)
        scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
        print '%s total matching documents.' % len(scoreDocs), queryObj
        self.results = scoreDocs
        rresults = []
        i = 0
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            # rresults.append(doc.get(dfield))
            rresults.append((doc.get(dfield), scoreDoc.score))
            i += 1
            if i == limit:
                break
        return rresults

    def getField(self, dfield, name, limit):
        toReturn = []
        i = 0
        for scoreDoc in self.results:
            doc = self.searcher.doc(scoreDoc.doc)
            toReturn.append((doc.get(dfield), doc.get(name)))
            i += 1
            if i == limit:
                break
        return toReturn

    def close(self):
        del self.searcher
示例11: Lucene
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import setSimilarity [as 别名]
#.........这里部分代码省略.........
raise Exception("There is no open IndexReader to close")
def open_searcher(self):
"""
Open IndexSearcher. Automatically opens an IndexReader too,
if it is not already open. There is no close method for the
searcher.
"""
if self.searcher is None:
self.open_reader()
self.searcher = IndexSearcher(self.reader)
def get_searcher(self):
"""Returns index searcher (opens it if needed)."""
self.open_searcher()
return self.searcher
def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1):
"""
Set searcher to use LM similarity.
:param method: LM similarity ("jm" or "dirichlet")
:param smoothing_param: smoothing parameter (lambda or mu)
"""
if method == "jm":
similarity = LMJelinekMercerSimilarity(smoothing_param)
elif method == "dirichlet":
similarity = LMDirichletSimilarity(smoothing_param)
else:
raise Exception("Unknown method")
if self.searcher is None:
raise Exception("Searcher has not been created")
self.searcher.setSimilarity(similarity)
def open_writer(self):
"""Open IndexWriter."""
if self.writer is None:
config = IndexWriterConfig(Lucene.get_version(), self.get_analyzer())
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
self.writer = IndexWriter(self.dir, config)
else:
raise Exception("IndexWriter is already open")
def close_writer(self):
"""Close IndexWriter."""
if self.writer is not None:
self.writer.close()
self.writer = None
else:
raise Exception("There is no open IndexWriter to close")
def add_document(self, contents):
"""
Adds a Lucene document with the specified contents to the index.
See LuceneDocument.create_document() for the explanation of contents.
"""
if self.ldf is None: # create a single LuceneDocument object that will be reused
self.ldf = LuceneDocument()
self.writer.addDocument(self.ldf.create_document(contents))
def get_lucene_document_id(self, doc_id):
"""Loads a document from a Lucene index based on its id."""
self.open_searcher()
query = TermQuery(Term(self.FIELDNAME_ID, doc_id))
tophit = self.searcher.search(query, 1).scoreDocs
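A hedged usage sketch for set_lm_similarity_jm(); the constructor call is an assumption, since the fragment does not show how a Lucene object is created:

lucene_index = Lucene("/path/to/index")  # hypothetical constructor
lucene_index.open_searcher()
# Dirichlet-smoothed language-model scoring with mu = 2000
lucene_index.set_lm_similarity_jm(method="dirichlet", smoothing_param=2000)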
Example 12: len
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
        total_recall += recall
        total_precision += precision
        total_FB += FB
        print '%3s Recall: %.6f Precision: %.6f FB: %.6f' % (qid, recall, precision, FB)
    query_data_length = len(query_data)
    avg_recall = total_recall / query_data_length
    avg_precision = total_precision / query_data_length
    avg_FB = total_FB / query_data_length
    print 'Avg Recall: %.6f Avg Precision: %.6f Avg FB: %.6f' % (avg_recall, avg_precision, avg_FB)

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(similarities.BM25Similarity())
    # Available similarities: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
    # analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    analyzer = MyAnalyzer(Version.LUCENE_CURRENT)
    fs = FileSearcher(searcher, analyzer)
    if len(sys.argv) < 2:
        fs.perform_user_query(searcher, analyzer)
    else:
        fs.results_comparison(searcher, analyzer, sys.argv[1])
    del searcher
Example 13: LuceneCorpus
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class LuceneCorpus(object):
    # To init a LuceneCorpus we need the output dir (passed as index_dir),
    # the filenames of one or more corpus files we just created,
    # and a parser that implements 'parse' and knows how to split and stem.
    def __init__(self, index_dir, filenames, parser, similarity=None):
        """
        :param index_dir: where to store the Lucene index
        :param filenames: the corpus files created previously; note that their format is consistent
        :param parser: SimpleWordParser in Parser.py, where we can apply functions such as stemming
        :param similarity: None (then the default Vector Space Model with TF-IDF is used) or e.g. a BM25 similarity for indexing
        :return:
        """
        self._index_dir = index_dir
        self._filenames = filenames
        self._parser = parser
        self._similarity = similarity
        lucene.initVM()
        # the WhitespaceAnalyzer splits the text on whitespace
        self._analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        self._store = SimpleFSDirectory(File(self._index_dir))
        self._searcher = None

    def prp_index(self):
        '''
        Prepare the index given our "corpus" file(s)
        '''
        print '=> Preparing Lucene index %s' % self._index_dir
        writer = self._get_writer(create=True)
        print '   Currently %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        num_pages, num_sections = 0, 0
        page_name, section_name = None, None
        num_lines = 0
        for ifname, fname in enumerate(self._filenames):
            print '   Adding lines to index from file #%d: %s' % (ifname, fname)
            with open(fname, 'rt') as infile:
                for text in infile:
                    if len(text) == 0:
                        print 'Reached EOF'
                        break  # EOF
                    # CorpusReader.PAGE_NAME_PREFIX is <Page>;
                    # all our corpus files were written with this tag at the start of a page
                    if text.startswith(CorpusReader.PAGE_NAME_PREFIX):
                        page_name = text[len(CorpusReader.PAGE_NAME_PREFIX):].strip()
                        section_name = None
                        num_pages += 1
                    elif text.startswith(CorpusReader.SECTION_NAME_PREFIX):
                        section_name = text[len(CorpusReader.SECTION_NAME_PREFIX):].strip()
                        num_sections += 1
                    else:
                        assert (page_name is not None) and (section_name is not None)
                        if self._parser is None:
                            luc_text = text
                        else:
                            # note that in our case we always have a SimpleWordParser
                            section_words = self._parser.parse(text, calc_weights=False)  # True)
                            luc_text = ' '.join(section_words)
                        # each section is added to the Lucene index as a whole; the text is stored and searchable.
                        # Pages are not needed here since we add documents section by section, not page by page.
                        doc = Document()
                        # there is only one field per document, the text field;
                        # section_name is not used as a field
                        doc.add(Field("text", luc_text, Field.Store.YES, Field.Index.ANALYZED))
                        writer.addDocument(doc)
                    num_lines += 1
                    if num_lines % 100000 == 0:
                        print '   read %d lines so far: %d pages, %d sections' % (num_lines, num_pages, num_sections)
        print '   Finished - %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        writer.close()

    def search(self, words, max_docs, weight_func=lambda n: np.ones(n), score_func=lambda s: s):
        '''
        Search the index for the given words, return the total score
        '''
        searcher = self._get_searcher()
        if type(words) == str:
            search_text = words
            search_text = AsciiConvertor.convert(search_text)
            for c in '/+-&|!(){}[]^"~*?:':
                search_text = search_text.replace('%s' % c, '\%s' % c)
        else:
            search_text = ' '.join(words)
        print 'search_text: %s' % search_text
        # note that whatever parser we used for indexing, query words are eventually split by Lucene's query parser here
        query = QueryParser(Version.LUCENE_CURRENT, "text", self._analyzer).parse(search_text)
        hits = searcher.search(query, max_docs)
        score_sum = 0.0
        weights = weight_func(len(hits.scoreDocs))
        for hit, weight in zip(hits.scoreDocs, weights):
            score_sum += weight * score_func(hit.score)
        return score_sum

    def _get_writer(self, analyzer=None, create=False):
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self._analyzer)
        if create:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        if self._similarity is not None:
            config.setSimilarity(self._similarity)
        writer = IndexWriter(self._store, config)
#......... remaining code omitted .........
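A hedged usage sketch for LuceneCorpus; the paths, filenames, and search terms are placeholders:

corpus = LuceneCorpus(index_dir='lucene_idx',               # placeholder path
                      filenames=['corpus_part0.txt'],       # placeholder corpus file
                      parser=None,                          # or a SimpleWordParser()
                      similarity=BM25Similarity(1.2, 0.75)) # None would keep the default TF-IDF
corpus.prp_index()
print corpus.search(['magnetic', 'field'], max_docs=20)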
Example 14: __init__
# Required import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def __init__(self, tweets, storeDir, analyzer):
    # first, index the tweets
    if not path.exists(storeDir):
        mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.index_docs(tweets, writer)
    writer.commit()
    writer.close()
    # set up the IndexSearcher
    reader = IndexReader.open(store)
    n_docs = reader.numDocs()
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())
    queryparser = QueryParser(Version.LUCENE_CURRENT, "contents", StandardAnalyzer(Version.LUCENE_CURRENT))
    # create document vectors
    doc_vectors = self.get_doc_vectors(reader, tweets, n_docs)
    cs_scorer = CosineSimilarityScorer(doc_vectors, reader, searcher, tweets)
    bm25_scorer = BM25Scorer(doc_vectors, searcher, queryparser)
    # find relevant tweets
    for fav_doc in (1, 26, 51):
        cs_scores = cs_scorer.get_scores(fav_doc)
        bm25_scores = bm25_scorer.get_scores(fav_doc)
        top_cs_scores = dict(sorted(cs_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])
        top_bm25_scores = dict(sorted(bm25_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])
        # print "top_cs_scores", top_cs_scores
        # print "top_bm25_scores", top_bm25_scores
        # calculate a composite score by multiplying cs scores by 100 and keeping bm25 scores as is:
        # cs is bounded to 0.0-1.0, while the bm25 score is actually idf * bm25_similarity_score, so values
        # above 10.0 are not uncommon
        top_blended_scores = {}
        for key, value in top_cs_scores.iteritems():
            top_blended_scores[key] = value * 100.0
        for key, value in top_bm25_scores.iteritems():
            if key not in top_blended_scores:
                top_blended_scores[key] = 0.0
            top_blended_scores[key] += value
        top_score = dict(sorted(top_blended_scores.iteritems(), key=itemgetter(1), reverse=True)[:1])
        # print "\n"
        # print "results for", fav_doc
        # print tweets[fav_doc]
        print searcher.doc(fav_doc).get("contents")
        print top_score
        # if the top score fails to reach 10.0, the result is probably not of high quality,
        # so onlyworthy will decline to identify a relevant match
        if top_score.values()[0] < 10.0:
            print "skipping"
            continue
        # print tweets[top_score.keys()[0]]
        print searcher.doc(top_score.keys()[0]).get("contents")
        print "\n"