本文整理汇总了Python中org.apache.lucene.search.IndexSearcher.doc方法的典型用法代码示例。如果您正苦于以下问题:Python IndexSearcher.doc方法的具体用法?Python IndexSearcher.doc怎么用?Python IndexSearcher.doc使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.search.IndexSearcher的用法示例。
在下文中一共展示了IndexSearcher.doc方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: get_candidates
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def get_candidates(qatp):
    """Collect candidate document ids from the Lucene index for each sample.

    Args:
        qatp: Iterable of 4-tuples ``(q, a, t, p)``. Only ``q`` (the query
            text) is used here; the other three items are ignored.

    Returns:
        A list with one entry per sample. Each entry is the list of stored
        ``"id"`` values of the top ``prm.max_candidates`` hits obtained by
        searching the ``"text"`` field with that sample's query.
    """
    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    try:
        searcher = IndexSearcher(reader)
        # The parser does not depend on the sample, so it is built once
        # instead of once per query.
        parser = QueryParser(Version.LUCENE_4_10_1, "text", analyzer)

        candidates = []
        for n, (q, _a, _t, _p) in enumerate(qatp):
            if n % 100 == 0:
                print('finding candidates sample %s' % n)

            # NOTE(review): the backslash prefix is presumably meant to stop
            # the upper-case boolean keywords from acting as operators once
            # the whole string has been escaped - confirm against the parser.
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = parser.parse(QueryParser.escape(q))
            hits = searcher.search(query, prm.max_candidates)
            candidates.append(
                [searcher.doc(hit.doc).get("id") for hit in hits.scoreDocs])
        return candidates
    finally:
        # Release the index files even when parsing or searching fails.
        reader.close()
示例2: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def search(self, input_query=None, max_answers=10):
    """Search the index for ``input_query`` across the post and answer fields.

    Args:
        input_query: Query string handed to the multi-field query parser.
            ``None`` means there is nothing to search.
        max_answers: Maximum number of hits to retrieve.

    Returns:
        ``None`` when ``input_query`` is ``None``; otherwise a list with one
        dict per hit, mapping each stored field name to its string value.
    """
    if input_query is None:
        return None

    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        parser = MultiFieldQueryParser(
            Version.LUCENE_CURRENT,
            (self._posts_fields + self._answer_fields),
            analyzer)
        # NOTE(review): parse is invoked through the class with the parser
        # as first argument, presumably to select the instance overload
        # under PyLucene - confirm before simplifying.
        query = MultiFieldQueryParser.parse(parser, input_query)

        score_docs = searcher.search(query, max_answers).scoreDocs
        print("%s total matching documents." % len(score_docs))

        docs = []
        for score_doc in score_docs:
            doc = searcher.doc(score_doc.doc)
            docs.append(
                dict((field.name(), field.stringValue())
                     for field in doc.getFields()))
        return docs
    finally:
        # The results are plain dicts by now, so the reader can be released.
        reader.close()
示例3: Searcher
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
class Searcher(object):
    """Run text queries against a Lucene index stored on disk."""

    def __init__(self, **kwargs):
        """ Open the index and prepare the analyzer and the searcher.

        :param count: Maximum number of hits fetched per query (default 100)
        :param root: Directory of the underlying index (default "index")
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        index_reader = DirectoryReader.open(self.store)
        self.searcher = IndexSearcher(index_reader)

    def search(self, query):
        """ Parse ``query`` against the "data" field and run it on the index.

        :param query: The query string to apply to the index
        :returns: A generator of ``(path, score)`` pairs, one per hit
        """
        parser = QueryParser(Version.LUCENE_30, "data", self.analyzer)
        parsed = parser.parse(query)
        top_docs = self.searcher.search(parsed, self.count)

        score_docs = top_docs.scoreDocs
        if not score_docs:
            # Nothing to iterate over: finish the generator immediately.
            return

        for entry in score_docs:
            stored = self.searcher.doc(entry.doc)
            yield stored.get("path"), entry.score
示例4: LuceneSearcher
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
class LuceneSearcher(object):
    """Query a Lucene index and return the hits as plain dictionaries."""

    # Stored field names copied into every result dictionary.
    fields = ['id', 'text', 'types']

    def __init__(self, db_path):
        """Open the index located at ``db_path`` and log its document count."""
        index_reader = DirectoryReader.open(SimpleFSDirectory(File(db_path)))
        self.searcher = IndexSearcher(index_reader)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        logger.info("Loaded DB from %s with %d documents: ",
                    db_path, index_reader.numDocs())

    def search(self, query, max_matches=1000):
        """Search the "text" field and return at most ``max_matches`` hits.

        Every match of ``VALID_CHARS_PATTERN`` in ``query`` is replaced by a
        space before the string is parsed.

        Returns:
            A list of dictionaries as built by :meth:`convert_to_dict`.
        """
        cleaned = VALID_CHARS_PATTERN.sub(' ', query)
        logger.debug("Searching for %s", cleaned)

        parser = QueryParser(Version.LUCENE_CURRENT, "text", self.analyzer)
        top_docs = self.searcher.search(parser.parse(cleaned), max_matches)
        hits = top_docs.scoreDocs
        logger.debug("%s total matching documents.", len(hits))

        # Load every document first, then convert them in a second pass.
        loaded = []
        for hit in hits:
            loaded.append(self.searcher.doc(hit.doc))
        return [self.convert_to_dict(item) for item in loaded]

    def convert_to_dict(self, doc):
        """Return ``{name: doc.get(name)}`` for every name in ``fields``."""
        return dict((name, doc.get(name)) for name in self.fields)
示例5: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def search(self):
    """Run an interactive query loop against the "title" field of the index.

    Prompts for a query, prints the number of hits retrieved (at most 50)
    followed by each matching document, and repeats until an empty line is
    entered. Returns ``None``.
    """
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    # Work with both the Python 2 and the Python 3 prompt built-in.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    reader = DirectoryReader.open(directory)
    try:
        searcher = IndexSearcher(reader)
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # The parser is the same for every query, so it is built once.
        parser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer)

        while True:
            print("")
            print("Hit enter with no input to quit.")
            command = read_line("Query:")
            if command == '':
                return

            print("")
            print("Searching for: %s" % command)
            query = parser.parse(command)
            score_docs = searcher.search(query, 50).scoreDocs
            print("%s total matching documents." % len(score_docs))

            for score_doc in score_docs:
                print(searcher.doc(score_doc.doc))
    finally:
        # Release the index files when the loop ends or an error escapes.
        reader.close()
示例6: get_image_pmcid
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def get_image_pmcid(pmcid, classes=""):
    """Return the figure documents that belong to the given articles.

    Args:
        pmcid: Iterable of PMC id strings; they are joined with spaces into
            a single ``pmcid:(...)`` clause.
        classes: Value required in the ``class`` field. ``"all"`` or an
            empty value disables the class filter.

    Returns:
        A list of Lucene documents from the figures index (at most 10000);
        an empty list when ``pmcid`` is empty.
    """
    # An empty id list would produce the unparseable query "pmcid:()".
    if not pmcid:
        return []

    location = web.__path__[0] + "/static/web/files/index/index.figures"
    # The JVM is expected to be running already; only attach this thread.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)

    MAX = 10000
    query_str = "pmcid:(" + ' '.join(pmcid) + ")"
    # An empty class used to yield a dangling "AND class:" clause, which the
    # query parser rejects; treat it like "all".
    if classes and classes != "all":
        query_str += " AND class:" + classes

    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    try:
        searcher = IndexSearcher(reader)
        # A parser is needed because the query string is built by hand.
        parser = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer)
        hits = searcher.search(parser.parse(query_str), MAX)
        return [searcher.doc(hit.doc) for hit in hits.scoreDocs]
    finally:
        # Documents are loaded by now, so the reader can be released.
        reader.close()
示例7: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def search(self, field, text):
    """
    Search text within indexed data.

    input:
        field   name of the field to search; its stored value is returned
                as the title of each hit
        text    text to search

    output:
        list of (score, url, title) tuples, one per hit (at most 1000)
    """
    idx_reader = DirectoryReader.open(self.directory)
    try:
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
        query = parser.parse(text)

        # search; the hits are iterated directly, no intermediate copy needed
        results = []
        for hit in idx_searcher.search(query, 1000).scoreDocs:
            doc = idx_searcher.doc(hit.doc)
            results.append((hit.score, doc.get("url"), doc.get(field)))
        return results
    finally:
        # Release the index files even when parsing or searching fails.
        idx_reader.close()
示例8: query
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def query(self, data):
    """Run ``data['query']`` against the "id" field and collect the hits.

    Args:
        data: Mapping that holds the query string under the key ``'query'``.

    Returns:
        ``None`` when ``self.fil`` does not exist. Otherwise a dict with
        ``'totalHits'`` (total hit count reported by the search) and
        ``'hits'``, which maps each document's ``id`` to a record holding
        its ``'score'`` plus every stored field other than ``id``.
    """
    if not self.fil.exists():
        return None

    reader = DirectoryReader.open(self.d)
    try:
        searcher = IndexSearcher(reader)
        parsed = QueryParser(
            Version.LUCENE_30, "id", self.analyzer).parse(data['query'])
        hits = searcher.search(parsed, 100000)

        records = {}
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            # The score goes in first; a stored field named "score" would
            # overwrite it, exactly as before.
            record = {'score': hit.score}
            for field in doc.getFields():
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            records[doc.get('id')] = record

        return {'totalHits': hits.totalHits, 'hits': records}
    finally:
        # Close the reader on every path, including parse/search errors.
        reader.close()
示例9: get_query_results
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def get_query_results(reader,query,n,field):
    """Print the top ``n`` hits of ``query`` as a numbered list.

    Args:
        reader: Index reader the searcher is built on.
        query: Query object passed to the searcher as-is.
        n: Maximum number of hits to retrieve.
        field: Stored field whose value is printed for each hit.
    """
    index_searcher = IndexSearcher(reader)
    top_docs = index_searcher.search(query, n)
    matches = top_docs.scoreDocs
    print("Found %d hits:" % len(matches))

    # Ranks are shown to the user starting at 1.
    for rank, match in enumerate(matches, 1):
        stored = index_searcher.doc(match.doc)
        print("%d. %s" % (rank, stored.get(field)))
示例10: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def search():
    """Handle the search form and render ``busca.html`` with the hits.

    On POST, every non-empty form value becomes a query argument: ``ies``,
    ``area`` and ``professor`` as ``+field:value`` clauses, and ``conceito``
    as one clause for each of the fields ``m``, ``d`` and ``f``. The
    arguments are passed to ``mansearch.buscar`` and each returned hit is
    turned into a dict of its stored fields.

    Returns:
        The rendered template; ``table`` is empty when no argument was
        given or the request is not a POST.
    """
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    args = []
    if request.method == 'POST':
        form = request.form
        for name in ('ies', 'area', 'professor'):
            if form[name]:
                args.append('+%s:%s' % (name, form[name]))
        if form['conceito']:
            for prefix in ('m', 'd', 'f'):
                args.append('%s:%s' % (prefix, form['conceito']))

    table = []
    if args:
        # NOTE(review): the hits come from a search on 'indexer/' while the
        # documents are read from ``indexDir`` - confirm both name the same
        # index, otherwise the document numbers do not line up.
        score_docs = mansearch.buscar('indexer/', args)
        reader = DirectoryReader.open(SimpleFSDirectory(File(indexDir)))
        try:
            searcher = IndexSearcher(reader)
            for score_doc in score_docs:
                doc = searcher.doc(score_doc.doc)
                table.append(
                    dict((field.name(), field.stringValue())
                         for field in doc.getFields()))
        finally:
            # The rows are plain dicts by now, so the reader can be released.
            reader.close()

    return render_template('busca.html', table=table)
示例11: search_docs
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def search_docs(self, value, field="general_info"):
    """Return the documents matching ``value`` in ``field``.

    Args:
        value: Query string handed to the query parser.
        field: Default field the query is parsed against.

    Returns:
        A list of Lucene documents, at most 1000.
    """
    MAX_RESULTS = 1000
    index_searcher = IndexSearcher(DirectoryReader.open(self.store))
    parser = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer)
    parsed = parser.parse(value)

    found = []
    for hit in index_searcher.search(parsed, MAX_RESULTS).scoreDocs:
        found.append(index_searcher.doc(hit.doc))
    return found
示例12: perform_search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def perform_search(self, searchterm, results_per_page, page):
    """Search title, description and content and return one page of hits.

    The search term is parsed once per field and the three resulting
    queries are combined as optional (SHOULD) clauses.

    Args:
        searchterm: Query string handed to each field's query parser.
        results_per_page: Number of hits on one page.
        page: Zero-based page index; the first ``page * results_per_page``
            hits are skipped.

    Returns:
        A tuple ``(results, duration, count_results)``: the list of dicts
        (stored field name -> string value) for the requested page, the
        time spent in the search call, and the total hit count reported by
        the search.
    """
    # One parser per searched field, combined into a single boolean query.
    query = BooleanQuery()
    for field_name in ("title", "description", "content"):
        parser = QueryParser(Version.LUCENE_CURRENT, field_name, self.analyzer)
        query.add(parser.parse(searchterm), BooleanClause.Occur.SHOULD)

    offset = results_per_page * page
    reader = DirectoryReader.open(self.store)
    try:
        searcher = IndexSearcher(reader)

        start = datetime.now()
        # Fetch everything up to the end of the requested page.
        hits = searcher.search(query, results_per_page + offset)
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        results = []
        for position, score_doc in enumerate(score_docs):
            # Hits before the offset belong to earlier pages.
            if position < offset:
                continue
            doc = searcher.doc(score_doc.doc)
            results.append(
                dict((field.name(), field.stringValue())
                     for field in doc.getFields()))
        return results, duration, count_results
    finally:
        # The results are plain dicts by now, so the reader can be released.
        reader.close()
示例13: SearchQuery
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def SearchQuery(queryString, fields, classification):
    """Search the articles index and attach one image URL to every hit.

    Args:
        queryString: Query string parsed against all of ``fields``.
        fields: Field names handed to the multi-field query parser.
        classification: Class filter forwarded to ``get_image_pmcid``.

    Returns:
        A dict mapping each hit's ``pmcid`` to ``{"title": ..., "imgURL":
        ...}``. ``imgURL`` is the first image found for that pmcid, or
        ``"images/NoImageAvailable.jpg"`` when it has none.
    """
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # The JVM is expected to be running already; only attach this thread.
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)

    MAX = 10000
    pmcids = []
    documentDict = {}
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    try:
        searcher = IndexSearcher(reader)
        # Multi-field query, see
        # http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
        parser = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
        query = MultiFieldQueryParser.parse(parser, queryString)
        hits = searcher.search(query, MAX)
        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))

        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            pmcid = doc.get("pmcid")
            pmcids.append(pmcid)
            # Further stored fields can be added to this dict as needed.
            documentDict[pmcid] = {"title": doc.get("title")}
    finally:
        # Everything needed has been copied out of the index by now.
        reader.close()

    # Without hits there is nothing to look up, and an empty id list would
    # build an unparseable figures query.
    images = get_image_pmcid(pmcids, classification) if pmcids else []

    # Group the image paths by the pmcid they belong to.
    imagesDict = {}
    for img in images:
        imagesDict.setdefault(img.get("pmcid"), []).append(
            img.get("filepath") + "/" + img.get("figureid"))

    # Decide per pmcid: the fallback image applies to each article that has
    # no image of its own, not only when no image was found at all.
    for pmcid in pmcids:
        urls = imagesDict.get(pmcid)
        if urls:
            documentDict[pmcid]["imgURL"] = urls[0]
        else:
            documentDict[pmcid]["imgURL"] = "images/NoImageAvailable.jpg"
    return documentDict
示例14: SearchIndex
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
class SearchIndex(object):
    """Paged, highlighted full-text search over the "contents" field."""

    def __init__(self):
        """Attach the current thread to the JVM and open the index.

        The index location is read from ``app.config['INDEX_PATH']``.
        """
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

    def search(self, q, page=1, duplicates=False):
        """Return one page of highlighted results for the query string ``q``.

        Args:
            q: Query string parsed against the "contents" field.
            page: One-based page number; each page holds 10 results.
            duplicates: When false, only documents whose ``duplicate``
                field is ``'false'`` are matched.

        Returns:
            A tuple ``(totalPages, docs)``. ``docs`` is a list of dicts
            with the keys ``title``, ``url``, ``duplicate`` and
            ``highlight``.
        """
        query = self.parser.parse(q)
        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        # At most 1000 hits are collected, whatever the total hit count is.
        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            contents = doc['contents']
            tokenStream = self.analyzer.tokenStream("contents", StringReader(contents))
            highlight = highlighter.getBestFragments(tokenStream, contents, 3, "...")
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight,
            })

        # The searcher is kept on the instance: deleting it here made every
        # later call to search() fail with AttributeError.
        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))
        return totalPages, docs

    def addDuplicatesQuery(self, query):
        """Return ``query`` combined with a required ``duplicate:false`` term."""
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
示例15: search
# 需要导入模块: from org.apache.lucene.search import IndexSearcher [as 别名]
# 或者: from org.apache.lucene.search.IndexSearcher import doc [as 别名]
def search(term, n_docs=10, index='index'):
    """Search the 'art_body' field and return scored article bodies.

    Args:
        term: Query string handed to the query parser.
        n_docs: Maximum number of hits to retrieve.
        index: Directory of the Lucene index.

    Returns:
        A list of ``(score, body)`` tuples, where ``body`` is the stored
        'art_body' value converted with ``unicode``.
    """
    directory = SimpleFSDirectory(File(index))
    index_searcher = IndexSearcher(DirectoryReader.open(directory))
    std_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    parser = QueryParser(Version.LUCENE_CURRENT, 'art_body', std_analyzer)
    parsed = parser.parse(term)

    pairs = []
    for hit in index_searcher.search(parsed, n_docs).scoreDocs:
        body = index_searcher.doc(hit.doc).get('art_body')
        pairs.append((hit.score, unicode(body)))
    return pairs