This article collects typical usage examples of the org.apache.lucene.index.IndexReader class in Python (via PyLucene). If you are wondering what IndexReader is for and how to use it in practice, the hand-picked examples below should help.
There are 15 code examples of the IndexReader class in total, ordered by popularity.
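Nearly every example below follows the same open-search-close pattern. As a quick orientation, here is a minimal sketch of that pattern; it assumes a PyLucene 4.10-era build, an existing index directory at the placeholder path /path/to/index containing a "text" field, and that lucene.initVM() is called exactly once per process:

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()  # start the embedded JVM; must happen before any Lucene call
reader = IndexReader.open(SimpleFSDirectory(File("/path/to/index")))
searcher = IndexSearcher(reader)
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse("hello world")
for hit in searcher.search(query, 10).scoreDocs:  # top-10 hits
    print("%s (score %.3f)" % (searcher.doc(hit.doc).get("text"), hit.score))
reader.close()  # release the index files when done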
Example 1: irsolver

def irsolver(data_file, index):
    from questions import get_input_data
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(index)))
    searcher = IndexSearcher(reader)
    pred = []
    mapp = {1: 'A', 2: 'B', 3: 'C', 4: 'D'}
    idx, ques, ans = get_input_data(data_file)
    for acm, (idq, q, a) in enumerate(zip(idx, ques, ans)):
        max_score = -1000000
        best_ans = 'A'
        for i, ai in enumerate(a):
            sc = query(q, ai, analyzer, searcher)
            print(acm, i, sc)
            if sc > max_score:
                max_score = sc
                best_ans = mapp[i + 1]
        pred.append(best_ans)
    return idx, pred
Example 2: retrieve

def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)
    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]  # Porter-stem every query token
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
            # print hit.doc+1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")
    f.close()
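The f.write line above appears to emit TREC-style run output: query id, the literal Q0, a document identifier, rank, score, and the run tag G17R3. Note that it uses the internal Lucene doc number (hit.doc + 1) as the document identifier, which is only stable for a fixed, unmodified index.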
Example 3: getRandomDoc2

def getRandomDoc2():
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # query = QueryParser(Version.LUCENE_4_10_1, "keywords", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 1000
    docNum = randrange(0, reader.maxDoc())  # pick a random document from the index
    doc = reader.document(docNum)
    files = []
    fileRoots = []
    paths = []
    paths.append(doc.get("articlepath"))
    pth = paths[0].replace("/home/kevin/Downloads/", "/home/kevin/git/YIF/imageFinder/web/static/web/")  # os.path.join(tools.__path__, "static/web/images")
    for root, directories, filenames in os.walk(pth):  # probably something wrong with the location
        for filename in filenames:
            if filename.endswith((".jpg", ".gif", ".png")):
                files.append(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)  # temp, will need to change
                fileRoots.append(root)
                print(root.replace("/home/kevin/git/YIF/imageFinder/web/static/web/", "") + "/" + filename)
    try:
        rng = randrange(0, len(files))  # raises ValueError when no image was found
    except ValueError:
        return -1
    else:
        return files[rng]
Example 4: evaluate_index

def evaluate_index(index_dir, context, analyzer):
    # evaluate the overall indexing time; we should also measure the
    # elapsed time of each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start
    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:  # size() may be unavailable; count the terms by iterating
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    # print str(vocab_size)                        # size of vocabulary
    # print str(vocabulary.getDocCount())          # docs that have at least one term in the title field
    # print str(vocabulary.getSumTotalTermFreq())  # number of tokens
    # print str(vocabulary.getSumDocFreq())        # number of postings
    reader.close()
    return duration, vocab_size
Example 5: get_image_pmcid

def get_image_pmcid(pmcid, classes=""):
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi-field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    # query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    # query = query.parse(query, ('4175339', '1'))
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 10000
    # hits = searcher.search(query, MAX)
    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")" + " AND class:" + classes
    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer)  # needed to build a custom query
    q = query.parse(queryStr)
    hits = searcher.search(q, MAX)
    for hit in hits.scoreDocs:  # should only be one
        # print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))
    return docs  # the image documents that belong to a pmcid (article)
Example 6: get_candidates

def get_candidates(qatp):
    if prm.create_index:
        create_index()
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1
        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))
        candidates.append(c)
    return candidates
Example 7: __init__

def __init__(self, path):
    print "Loading data.json..."
    with open(path, "r") as f:
        self.data = json.load(f)
    lucene.initVM()
    self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    self.reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    self.searcher = IndexSearcher(self.reader)
Example 8: __init__

def __init__(self, lucene_dir_path):
    if lucene_dir_path is not None and lucene_dir_path != '':
        lucene.initVM()
        directory = SimpleFSDirectory(File(lucene_dir_path))
        self.indexReader = IndexReader.open(directory)
        self.is_init = True
    else:
        self.is_init = False
Example 9: SearchQuery

def SearchQuery(queryString, fields, classification):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)
    # multi-field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"
    MAX = 10000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict
    # fetch the images for all the pmcids
    images = get_image_pmcid(pmcids, classification)  # takes pmcids and a class
    # build a dictionary of images keyed by pmcid
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict:
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [img.get("filepath") + "/" + img.get("figureid")]
    # for each pmcid, assign one image to show with its search result
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:  # guard per pmcid, not on the whole dict
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict
    return documentDict
Example 10: lucene_retrieval_multifield

def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string for each field,
    not the same words on every field
    :param q_string:
    :param q_class:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()
    # BooleanClause.Occur:
    # MUST means the clause must occur in matching documents,
    # SHOULD means the clause should (but need not) occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1=1.5, b=0.75; todo: tune BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair

Author: rarezhang, project: allen-ai-science-challenge, source: question_classification_subject_feature.py
Example 11: xmlrpc_getStatistics

def xmlrpc_getStatistics(self, instance):
    reader = IndexReader.open(self.indexPath)
    filter = RangeFilter('instance', instance, instance, 1, 1)
    num = filter.bits(reader).cardinality()  # number of docs in the range [instance, instance]
    stat = Vector()
    stat.add(num)
    stat.add(0)  # len(index.terms())
    reader.close()
    return stat
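RangeFilter and its bits() method date back to very old Lucene releases; on the 4.x line used by the other examples on this page, roughly the same count could be computed with a TermRangeQuery and a TotalHitCountCollector. A sketch, assuming 'instance' is indexed as a single string term:

from org.apache.lucene.search import IndexSearcher, TermRangeQuery, TotalHitCountCollector

query = TermRangeQuery.newStringRange('instance', instance, instance, True, True)
collector = TotalHitCountCollector()
IndexSearcher(reader).search(query, collector)
num = collector.getTotalHits()  # documents whose 'instance' term equals `instance`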
Example 12: __init__

def __init__(self):
    # self.segmentor.load('./cws.model')
    INDEXDIR = './Myindex'
    # lucene.initVM(vmargs='-Xcheck:jni,-verbose:jni,-verbose:gc')
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # vm_env = lucene.getVMEnv()
    # vm_env.attachCurrentThread()
    # print 'lucene', lucene.VERSION
    self.directory = SimpleFSDirectory(File(INDEXDIR))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    self.analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    self.reader = IndexReader.open(self.directory)
Example 13: group_tests

def group_tests():
    TP = 0.0
    FN = 0.0
    n = 0.0
    precision = 0
    recall = 0
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File("./articleTitleIndex/")))
    searcher = IndexSearcher(reader)
    with open('Labeled800Queries/labeler3.txt', 'r') as f:
        for line in f:
            n += 1
            line = line.split('\t')
            user_query = line[0]
            labels = line[1:]
            user_query = re.sub('[^0-9a-zA-Z]+', ' ', user_query)
            print user_query
            print labels
            res = predict(user_query, analyzer, reader, searcher, test="group")
            converted_res = []
            for label in res:
                # print label[0]
                converted_res.append(cvt.WikiToKDD[label[0].replace('_', ' ')])
            if not res:
                print "empty goal category set"
            print converted_res
            # compare labels and converted_res
            for label in labels:
                label = label.replace('\r', '').replace('\n', '')
                if label not in cvt.WikiToKDD.values():
                    continue
                # print label
                if label in converted_res:
                    TP += 1.0
                else:
                    FN += 1.0
            print "=========================================================="
    precision = TP / (SIZE * n)
    recall = TP / (TP + FN)
    print "precision:", precision
    print "recall:", recall
Example 14: get_wiki_docids

def get_wiki_docids(data_file, wikipedia_index):
    from questions import get_input_data
    data = get_input_data(data_file)
    lucene.initVM()
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    reader = IndexReader.open(SimpleFSDirectory(File(wikipedia_index)))
    searcher = IndexSearcher(reader)
    generate_docids(data, data_file, analyzer, searcher)
Example 15: lucene_retrieval

def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions; fall back to zeros when there are no hits
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via the escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1=1.5, b=0.75; todo: tune BM25 parameters
    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
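In the last two examples, feature_type is a list of aggregation functions mapped over the scores of the top hits, yielding one feature per function. A hypothetical call, assuming the module-level version, analyzer, hitsPerPage, and set_lucene_index globals are already configured as the snippets require:

# max/min/sum each collapse the list of hit scores into a single feature value
features = lucene_retrieval("Which gas do plants absorb from the air?",
                            [max, min, sum], use_BM25=True)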