本文整理汇总了Python中org.apache.lucene.search.IndexSearcher.search方法的典型用法代码示例。如果您正苦于以下问题:Python IndexSearcher.search方法的具体用法?Python IndexSearcher.search怎么用?Python IndexSearcher.search使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.search.IndexSearcher
的用法示例。
在下文中一共展示了IndexSearcher.search方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: lucene_retrieval_multifield
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    Multifield retrieval: a different query string for each indexed field
    ('text' vs. 'corpus_name'), not the same words on both fields.

    :param q_string: query string matched against the 'text' field
    :param q_class: subject/class string matched against the 'corpus_name' field
    :param feature_type: list of functions, each mapping the list of hit scores
                         to one feature value
    :param use_BM25: when True, score with BM25 instead of the default similarity
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        Return one feature value per function in feature_type, computed
        from the hit scores (sorted document+score by score).
        :param hists:
        """
        def doc_score(hists):
            """
            Yield the score of each hit.
            :param hists:
            """
            for h in hists:
                yield h.score
        doc_score_list = list(doc_score(hists))
        # Guard against zero hits, mirroring lucene_retrieval(): without this,
        # feature functions such as max()/avg() fail on an empty list.
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)  # feature_type is a list of function

    # escape special characters so user text cannot break the query syntax
    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()
    # BooleanClause.Occur:
    #   MUST   implies that the keyword must occur
    #   SHOULD implies that the keyword SHOULD occur (optional, boosts score)
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
开发者ID:rarezhang,项目名称:allen-ai-science-challenge,代码行数:59,代码来源:question_classification_subject_feature.py
示例2: SearchIndex
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
class SearchIndex(object):
    """Full-text search over a Lucene index, with hit highlighting and an
    optional filter that hides documents flagged as duplicates."""

    def __init__(self):
        # Attach the current thread to the already-running JVM before any Lucene call.
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

    def search(self, q, page = 1, duplicates = False):
        """Run query string q and return one page of highlighted results.

        :param q: raw query string, parsed against the 'contents' field
        :param page: 1-based page number; 10 results per page
        :param duplicates: when False, documents stored with duplicate='true'
                           are excluded from the results
        :return: (totalPages, docs) where docs is a list of dicts with
                 'title', 'url', 'duplicate' and 'highlight' keys
        """
        query = self.parser.parse(q)
        if not duplicates:
            query = self.addDuplicatesQuery(query)
        perPage = 10
        start = (page - 1) * perPage
        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)
        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            # up to 3 highlighted fragments per document, joined with "..."
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight}
            )
        # NOTE(review): deleting the shared searcher makes this instance
        # single-use — a second search() on the same object would fail on a
        # missing attribute; confirm this is intended.
        del self.searcher
        totalPages = int(math.ceil(results.getTotalHits()/float(perPage)))
        return totalPages, docs

    def addDuplicatesQuery(self, query):
        """Wrap query so only documents stored with duplicate='false' match."""
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
示例3: lucene_retrieval
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    Score a query against the index and reduce the hit scores to features.

    :param q_string: query text
    :param feature_type: list of functions, each applied to the hit-score list
    :param use_BM25: when True, switch the searcher to BM25 similarity
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hits):
        """Return one feature value per function in feature_type."""
        score_list = [hit.score for hit in hits]
        if len(score_list) == 0:
            # no hits: one zero per requested feature
            return [0] * len(feature_type)
        return map(lambda fn: fn(score_list), feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    parsed_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    # search
    index_reader = IndexReader.open(index)
    index_searcher = IndexSearcher(index_reader)
    if use_BM25:
        index_searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    index_searcher.search(parsed_query, collector)
    top_hits = collector.topDocs().scoreDocs
    # reader.close()
    return retrieval_scores(top_hits)  # retrieval_scores for each question-answer pair
示例4: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def search(self, field, text):
    """
    Search text within the indexed data.

    input:
        field   field name of the value that was indexed
        text    text to search for
    output:
        results list of (score, url, title) tuples, one per hit
    """
    idx_reader = DirectoryReader.open(self.directory)
    idx_searcher = IndexSearcher(idx_reader)
    # parse the raw text into a Lucene query over the requested field
    parsed_query = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser).parse(text)
    # run the search (up to 1000 hits) and unpack each hit
    results = []
    for hit in idx_searcher.search(parsed_query, 1000).scoreDocs.tolist():
        matched = idx_searcher.doc(hit.doc)
        results.append((hit.score, matched.get("url"), matched.get(field)))
    return results
示例5: query
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def query(self, data):
    """
    Run the query string in data['query'] against the 'id' field of the index.

    :param data: dict holding the Lucene query string under the 'query' key
    :return: dict with 'totalHits' and a 'hits' mapping of doc id -> record,
             where each record holds the hit 'score' plus every stored field
             except 'id'; implicitly returns None when the index file is absent
    """
    if self.fil.exists():
        searcher = IndexSearcher(DirectoryReader.open(self.d))
        query = QueryParser(
            Version.LUCENE_30,
            "id",
            self.analyzer).parse(
            data['query'])
        hits = searcher.search(query, 100000)
        results = {}
        results['totalHits'] = hits.totalHits
        results['hits'] = {}
        for hit in hits.scoreDocs:
            record = {}
            doc = searcher.doc(hit.doc)
            fields = doc.getFields()
            record['score'] = hit.score
            # copy every stored field except the id itself into the record
            for field in fields:
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            results['hits'][doc.get('id')] = record
        # release the underlying reader before returning
        searcher.getIndexReader().close()
        return results
示例6: get_image_pmcid
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def get_image_pmcid(pmcid, classes = ""):
    """
    Fetch the figure documents indexed for the given article id(s).

    :param pmcid: iterable of PMC article-id strings, OR-ed into one query
    :param classes: figure-class filter appended as an AND clause;
                    the special value "all" disables class filtering
    :return: list of the image Documents that belong to a pmcid (article)
    """
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    # attach this thread to the already-initialized JVM (initVM was done elsewhere)
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    MAX = 10000
    # build a single query string: pmcid:(id1 id2 ...) [AND class:<classes>]
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer)  # needed to build a custom query
    q = query.parse(queryStr)
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:  # should only be one per article
        docs.append(searcher.doc(hit.doc))
    return docs  # This will return the image documents that belong to a pmcid(article)
示例7: buscar
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def buscar(indexDir, args,options = None):
    """
    Search the index at indexDir for the terms in args (all terms required).

    :param indexDir: path to the directory holding the Lucene index
    :param args: list of query terms; joined with ' +' so every term must match
    :param options: unused  # NOTE(review): parameter is never read — confirm
    :return: scoreDocs of up to 200 hits
    """
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir
    # Create a searcher over the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    # Analyzer used to filter/normalize the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer
    # QueryParser using "keywords" as the default field,
    # holding the search constraints
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser
    # AND operator: all terms are required
    parser.setDefaultOperator(QueryParser.Operator.AND)
    #print args
    # Join the supplied terms into a single '+'-separated query string
    command = ' +'.join(args)
    #print command
    query = parser.parse(command)
    print query
    # Return a JArray with the query results
    return searcher.search(query, 200).scoreDocs
示例8: Searcher
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
class Searcher(object):
    """Query facade over a Lucene index stored on the local filesystem."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Searcher
        :param count: The number of counts to return from a query
        :param output: The output directory of the underlying index
        """
        self.count = kwargs.get("count", 100)
        # NOTE(review): docstring says 'output' but the kwarg actually read
        # is 'root' — confirm which name callers use.
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(self.store))

    def search(self, query):
        """ Given a query, apply it against the existing index.
        :param query: The query to apply to the index
        :returns: A generator of the matching documents as (path, score) pairs
        """
        # parse against the 'data' field, cap results at self.count
        query = QueryParser(Version.LUCENE_30, "data", self.analyzer).parse(query)
        results = self.searcher.search(query, self.count)
        for result in results.scoreDocs or []:
            # logger.debug("%s %s %s", hit.score, hit.doc, hit.toString())
            document = self.searcher.doc(result.doc)
            yield document.get("path"), result.score
示例9: get_query_results
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def get_query_results(reader,query,n,field):
    """Run `query` over `reader` and print the top-n hits' `field` values."""
    searcher = IndexSearcher(reader)
    score_docs = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(score_docs))
    for rank, score_doc in enumerate(score_docs, start=1):
        matched = searcher.doc(score_doc.doc)
        print("%d. %s" % (rank, matched.get(field)))
示例10: LuceneSearcher
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
class LuceneSearcher(object):
    """Search a Lucene index and return hits as plain dicts of stored fields."""

    # stored fields copied into each result dict
    fields = ['id', 'text', 'types']

    def __init__(self, db_path):
        """Open the index at db_path and log how many documents it holds."""
        directory = SimpleFSDirectory(File(db_path))
        reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(reader)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        logger.info("Loaded DB from %s with %d documents: ",
                    db_path, reader.numDocs())

    def search(self, query, max_matches=1000):
        """Query the 'text' field; return up to max_matches result dicts."""
        # strip characters the query parser would choke on
        query = VALID_CHARS_PATTERN.sub(' ', query)
        logger.debug("Searching for %s", query)
        query = QueryParser(Version.LUCENE_CURRENT, "text",
                            self.analyzer).parse(query)
        score_docs = self.searcher.search(query, max_matches).scoreDocs
        logger.debug("%s total matching documents.",
                     len(score_docs))
        docs = [self.searcher.doc(d.doc) for d in score_docs]
        return [self.convert_to_dict(doc) for doc in docs]

    def convert_to_dict(self, doc):
        """Map a Lucene Document to {field: stored value} for self.fields."""
        return {field: doc.get(field) for field in self.fields}
示例11: get_candidates
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def get_candidates(qatp):
    """
    Retrieve candidate document ids for each (question, answer, t, p) tuple.

    :param qatp: iterable of (q, a, t, p) tuples; only q is used for retrieval
    :return: list with one entry per input tuple, each a list of candidate
             document ids (the stored 'id' field of the top hits)
    """
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q,a,t,p in qatp:
        # progress logging every 100 samples
        if n % 100 == 0:
            print 'finding candidates sample', n
        n+=1
        # backslash-prefix the boolean operators before the general escape,
        # so AND/OR/NOT in the question text are treated as plain terms
        q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
示例12: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def search(self, input_query=None, max_answers=10):
    ''' Searches the given query in the index

    :param input_query: query string; None returns None immediately
    :param max_answers: maximum number of hits to return
    :return: list of dicts (stored field name -> value), one per hit,
             or None when no query was given
    '''
    if input_query is None:
        return None
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
    # search across both the post fields and the answer fields
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer)
    query = MultiFieldQueryParser.parse(parser, input_query)
    scoreDocs = searcher.search(query, max_answers).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # flatten every stored field of the hit into a plain dict
        doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
        docs.append(doc_dict)
        # print doc
    return docs
示例13: retrieve
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def retrieve(indexdir, queries):
    """
    Run each query against the index and write TREC-style result lines
    ("<query-id> Q0 <doc> <rank> <score> G17R3") to results_lucene.txt.

    :param indexdir: path to the Lucene index directory
    :param queries: dict mapping query id -> query string
    """
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)
    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        # Porter-stem each query word before handing it to Lucene
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        # OR semantics over the three searched fields
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            # doc ids and ranks are written 1-based
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc+1, i+1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
示例14: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def search(self):
    ''' Searches the given query in the index

    Interactive loop: reads a query from stdin, prints the matching
    documents, and returns when the user enters an empty line.
    '''
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath('.'))
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        print
        print "Searching for:", command
        # single-field query over 'title'; top 50 hits
        query = QueryParser(Version.LUCENE_CURRENT, "title",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # print 'path:', doc.get("path"), 'name:', doc.get("name")
            print doc
示例15: lucene_retrieval
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import search [as 别名]
def lucene_retrieval(q_string, use_BM25=False):
    """
    Retrieve the stored text of a document matching the query.

    :param q_string: query string (may be empty/blank, which is handled)
    :param use_BM25: when True, score with BM25 similarity
    :return: the retrieved document text, or '_NONE_' when the query is
             blank or nothing is found
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        Return the stored 'text' field of a hit.
        NOTE(review): the loop overwrites `text` each iteration, so this
        returns the text of the LAST hit in `hists` — confirm that is the
        intent (e.g. hitsPerPage == 1).
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'
    # escape special characters via escape function
    if q_string and q_string.strip():  # when pre-process answers, `none of the above` -> '' cause error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters
        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)
        # reader.close()
    return result  # text: also nodes
开发者ID:rarezhang,项目名称:allen-ai-science-challenge,代码行数:48,代码来源:network_feature_index_retrieval_nodes.py