

Python IndexSearcher.setSimilarity Method Code Examples

This article collects typical usage examples of the Python method org.apache.lucene.search.IndexSearcher.setSimilarity. If you are wondering what IndexSearcher.setSimilarity does in Python, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the enclosing class, org.apache.lucene.search.IndexSearcher.


Listed below are 14 code examples of IndexSearcher.setSimilarity, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python samples.
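
Before the examples, here is a minimal sketch of the pattern they all share: open an index, wrap it in an IndexSearcher, and swap in a non-default Similarity before searching. This is only a sketch; it assumes a PyLucene 4.x environment matching the examples below, and the index path "./index" is a placeholder for an existing Lucene index.

import lucene
from java.io import File
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

directory = SimpleFSDirectory(File("./index"))  # placeholder path to an existing index
searcher = IndexSearcher(DirectoryReader.open(directory))

# setSimilarity replaces the default scoring model (TF-IDF in Lucene 4.x) and must be
# called before searching; here BM25 with Lucene's defaults k1 = 1.2, b = 0.75
searcher.setSimilarity(BM25Similarity(1.2, 0.75))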

Example 1: getIndexSearcher

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
    def getIndexSearcher(self):

        indexSearcher = IndexSearcher(self.mIndexReader)
        if self.mSimilarity is not None:
            indexSearcher.setSimilarity(self.mSimilarity)

        return indexSearcher
Developer: haonguyen14, Project: CLIFinder, Lines: 9, Source: Administration.py

Example 2: IndexAndTaxonomy

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class IndexAndTaxonomy(object):

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
Developer: jerryba, Project: meresco-lucene, Lines: 62, Source: indexandtaxonomy.py

Example 3: config

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1 = 2.0, b = 0.75 (Lucene's defaults are k1 = 1.2, b = 0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher,analyzer
Developer: PhoenixZhao, Project: MovieSearchService, Lines: 10, Source: SearchMysql_v3.py

Example 4: lucene_retrieval_multifield

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query string for different field
    not same word on different field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        apply each feature function to the list of hit scores
        :param hists: scoreDocs returned by the searcher
        """
        def doc_score(hists):
            """
            yield the score of each hit
            :param hists: scoreDocs returned by the searcher
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur:
    # MUST means the clause must match a document;
    # SHOULD means the clause may match (and boosts the score when it does)
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1=1.5, b=0.75, passed positionally (PyLucene wrappers do not accept keyword arguments); todo: tune BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Developer: rarezhang, Project: allen-ai-science-challenge, Lines: 59, Source: question_classification_subject_feature.py

Example 5: lucene_retrieval

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        apply each feature function to the list of hit scores
        :param hists: scoreDocs returned by the searcher
        """
        def doc_score(hists):
            """
            yield the score of each hit
            :param hists: scoreDocs returned by the searcher
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1=1.5, b=0.75, passed positionally (PyLucene wrappers do not accept keyword arguments); todo: tune BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Developer: rarezhang, Project: allen-ai-science-challenge, Lines: 51, Source: corpus_index_and_retrieval_feature.py

Example 6: __init__

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
    def __init__(self, index_dir, index_file, rawQuery):
        self.indexFile = os.path.join(index_dir, index_file)

#         lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # uncomment when running Retrieve separately
        directory = SimpleFSDirectory(File(self.indexFile))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # set BM25 as the similarity metric, k1=1.2, b=0.75
        if 'Standard' in self.indexFile:
            print "Use the StandardAnalyzer"
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # build a standard analyzer with default stop words
        if 'Porter' in self.indexFile:
            print "Use the PorterStemmer analyzer"
            analyzer = PorterStemmerAnalyzer()
        self.run(searcher, analyzer, rawQuery)
        del searcher
Developer: w2wei, Project: XPRC, Lines: 17, Source: Index_TREC2005Genomics4584.py

Example 7: lucene_retrieval

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return the 'text' field of the last hit, or '_NONE_' if there are no hits
        :param hists: scoreDocs returned by the searcher
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # during answer pre-processing, `none of the above` -> '' would cause an error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(1.5, 0.75))  # k1=1.5, b=0.75, passed positionally (PyLucene wrappers do not accept keyword arguments); todo: tune BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Developer: rarezhang, Project: allen-ai-science-challenge, Lines: 48, Source: network_feature_index_retrieval_nodes.py

Example 8: get_sorted_results

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        for j, scoreDoc in enumerate(topDocs.scoreDocs):
            d = searcher.doc(scoreDoc.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(scoreDoc.score)
Developer: lenawtech, Project: FSS16, Lines: 19, Source: ir16_assignment-4_6.py

Example 9: LuceneRetrieval

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class LuceneRetrieval(BaseRetrieval):
    """
        Encapsulates the Lucene retrieval engine
    """
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser=QueryParser
        else:
            self.query_parser=FieldAgnosticQueryParser

        if use_default_similarity:
            similarity=DefaultSimilarity()
            self.useExplainQuery=False
        else:
            similarity=FieldAgnosticSimilarity()
            self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger

    def runQueryViaExplain(self,query, max_results):
        """
            Really crappy solution to make sure that explanations and searches are the same
            while I fix Lucene
        """
        results=[]

        # create the namedtuple class once and instantiate it per hit; assigning
        # attributes on the class itself would make every entry share one object
        Hit=namedtuple("Hit",["doc","score"])
        for index in range(self.reader.numDocs()):
            explanation=self.searcher.explain(query,index)
            score=explanation.getValue()
##            match=re.search(r"(.*?)\s=",explanation.toString(),re.IGNORECASE|re.DOTALL)
##            if match:
##                score=float(match.group(1))
            results.append(Hit(doc=index,score=score))

        results.sort(key=lambda x:x.score,reverse=True)

        if max_results < self.reader.numDocs():
            results=results[:max_results]

        return results

    def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
        """
            LOTS OF SWEET LUCENE
        """
        original_query=structured_query

        if not structured_query or len(structured_query) == 0:
            return []

        self.last_query=structured_query
        query_text=self.rewriteQuery(structured_query["structured_query"], ["text"])

        try:
            query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
        except Exception:
            print("Lucene exception:",sys.exc_info()[:2])
            return None

        structured_query["lucene_query"]=query_text

        if self.useExplainQuery:
            # this should only exist until I fix the lucene bulkScorer to give the same results
            hits=self.runQueryViaExplain(query,max_results)
        else:
            collector=TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

##        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
        res=[]


##        if len(hits.scoreDocs) ==0:
##            print "Original query:",original_query
##            print "Query:", query

        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata= json.loads(doc.get("metadata"))
            res.append((hit.score,metadata))
        return res


#......... part of the code omitted here .........
Developer: danieldmm, Project: minerva, Lines: 103, Source: lucene_retrieval.py

Example 10: __init__

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class SearchIndex:

  def __init__(self, indexPath):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION

    #initialize the index
    self.INDEX_DIR = indexPath  #"Clue_Index"
    self.results = None
    self.searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory(File(self.INDEX_DIR))))

    self.searcher.setSimilarity(BM25Similarity())

  def initializeAnalyzer(self):
    #self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT,JavaSet(stopSet))
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in stopSet:
      sSet.add(entry)
    self.stopSet = sSet
    #self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT,sSet)
    self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

  def getTopDocuments(self, query, limit, sfield, dfield):
    queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                           self.analyzer).parse(query)
    print queryObj
    scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
    print '%s total matching documents.' % len(scoreDocs)
    self.results = scoreDocs
    rresults = []
    i = 0
    #reader = self.searcher.getIndexReader();
    #print type(reader)
    for scoreDoc in scoreDocs:
      doc = self.searcher.doc(scoreDoc.doc)
      rresults.append((doc.get(dfield), scoreDoc.score))
      #rresults.append(doc.get(dfield));#,scoreDoc.score))
      i += 1
      if i == limit:
        break
    return rresults
    #print 'path:', doc.get("URL"), 'name:', doc.get("id"), 'title:', doc.get("title")

  def getTopDocumentsWithExpansion(self, query, expTerms, limit, sfield, dfield):
    print expTerms
    query = query + ' ' + ' '.join('{0}^{1}'.format(x[0], round(x[1], 2))
                                   for x in expTerms)
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in expTerms:
      sSet.add(entry[0])

    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, self.stopSet, sSet)

    queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                           analyzer).parse(query)
    scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
    print '%s total matching documents.' % len(scoreDocs), queryObj
    self.results = scoreDocs
    rresults = []
    i = 0

    for scoreDoc in scoreDocs:
      doc = self.searcher.doc(scoreDoc.doc)
      #rresults.append(doc.get(dfield));#,scoreDoc.score))
      rresults.append((doc.get(dfield), scoreDoc.score))

      i += 1
      if i == limit:
        break
    return rresults

  def getField(self, dfield, name, limit):
    toReturn = []
    i = 0
    for scoreDoc in self.results:
      doc = self.searcher.doc(scoreDoc.doc)
      toReturn.append((doc.get(dfield), doc.get(name)))
      i += 1
      if i == limit:
        break
    return toReturn

  def close(self):
    del self.searcher
Developer: vmanisha, Project: QueryExpansion, Lines: 88, Source: searchIndex.py

Example 11: Lucene

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]

#......... part of the code omitted here .........
            raise Exception("There is no open IndexReader to close")

    def open_searcher(self):
        """
        Open IndexSearcher. Automatically opens an IndexReader too,
        if it is not already open. There is no close method for the
        searcher.
        """
        if self.searcher is None:
            self.open_reader()
            self.searcher = IndexSearcher(self.reader)

    def get_searcher(self):
        """Returns index searcher (opens it if needed)."""
        self.open_searcher()
        return self.searcher

    def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1):
        """
        Set searcher to use LM similarity.

        :param method: LM similarity ("jm" or "dirichlet")
        :param smoothing_param: smoothing parameter (lambda or mu)
        """
        if method == "jm":
            similarity = LMJelinekMercerSimilarity(smoothing_param)
        elif method == "dirichlet":
            similarity = LMDirichletSimilarity(smoothing_param)
        else:
            raise Exception("Unknown method")

        if self.searcher is None:
            raise Exception("Searcher has not been created")
        self.searcher.setSimilarity(similarity)

    def open_writer(self):
        """Open IndexWriter."""
        if self.writer is None:
            config = IndexWriterConfig(Lucene.get_version(), self.get_analyzer())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            self.writer = IndexWriter(self.dir, config)
        else:
            raise Exception("IndexWriter is already open")

    def close_writer(self):
        """Close IndexWriter."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        else:
            raise Exception("There is no open IndexWriter to close")

    def add_document(self, contents):
        """
        Adds a Lucene document with the specified contents to the index.
        See LuceneDocument.create_document() for the explanation of contents.
        """
        if self.ldf is None:  # create a single LuceneDocument object that will be reused
            self.ldf = LuceneDocument()
        self.writer.addDocument(self.ldf.create_document(contents))

    def get_lucene_document_id(self, doc_id):
        """Loads a document from a Lucene index based on its id."""
        self.open_searcher()
        query = TermQuery(Term(self.FIELDNAME_ID, doc_id))
        tophit = self.searcher.search(query, 1).scoreDocs
Developer: hasibi, Project: ELR-EntityLinkingRetrieval, Lines: 70, Source: lucene_tools.py
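
As a usage sketch to go with the set_lm_similarity_jm method above: the two language-model similarities it wraps can be set directly on any IndexSearcher. This assumes the PyLucene 4.x setup from the sketch at the top of this page (i.e. searcher is an open IndexSearcher); the smoothing values shown are illustrative, not tuned.

from org.apache.lucene.search.similarities import LMDirichletSimilarity, LMJelinekMercerSimilarity

# Jelinek-Mercer smoothing: lambda in (0, 1); small values (~0.1) tend to suit short queries
searcher.setSimilarity(LMJelinekMercerSimilarity(0.1))

# Dirichlet smoothing: mu acts as a pseudo-count, often set near the average document length
searcher.setSimilarity(LMDirichletSimilarity(2000.0))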

Example 12: len

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
            total_recall += recall
            total_precision += precision
            total_FB += FB

            print '%3s Recall: %.6f  Precision: %.6f  FB: %.6f' % (qid, recall, precision, FB)

        query_data_length = len(query_data)
        avg_recall = total_recall/query_data_length
        avg_precision = total_precision/query_data_length
        avg_FB = total_FB/query_data_length

        print 'Avg Recall: %.6f  Avg Precision: %.6f Avg FB: %.6f' % (avg_recall, avg_precision, avg_FB)

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    
    searcher.setSimilarity(similarities.BM25Similarity())
    # available similarities: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
    # analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    analyzer = MyAnalyzer(Version.LUCENE_CURRENT)
    fs = FileSearcher(searcher, analyzer)
    if len(sys.argv) < 2:
        fs.perform_user_query(searcher, analyzer)
    else:
        fs.results_comparison(searcher, analyzer, sys.argv[1])
    del searcher
Developer: huqiang, Project: CS3246-Assignment1, Lines: 32, Source: SearchFiles.py

Example 13: LuceneCorpus

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
class LuceneCorpus(object):
    # to init a LuceneCorpus, we need the output dir, which is passed as index_dir;
    # we need filenames containing one or more corpus files we just created;
    # we need a parser: this parser should implement a 'parse' function which knows how to split and how to stem
    def __init__(self, index_dir, filenames, parser, similarity=None):
        """
        :param index_dir: where to store the Lucene index
        :param filenames: the corpus files created previously; note that the corpus format is consistent across files
        :param parser: SimpleWordParser in Parser.py, where we can apply functions such as stemming
        :param similarity: None (the default Vector Space Model with TF-IDF is then used), or e.g. a BM25 similarity for indexing
        :return:
        """
        self._index_dir = index_dir
        self._filenames = filenames
        self._parser = parser
        self._similarity = similarity
        lucene.initVM()
        # the WhitespaceAnalyzer split the text based on whitespace
        self._analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        self._store = SimpleFSDirectory(File(self._index_dir))
        self._searcher = None

    def prp_index(self):
        '''
        Prepare the index given our "corpus" file(s)
        '''
        print '=> Preparing Lucene index %s' % self._index_dir
        writer = self._get_writer(create=True)
        print '   Currently %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        num_pages, num_sections = 0, 0
        page_name, section_name = None, None
        num_lines = 0
        for ifname,fname in enumerate(self._filenames):
            print '   Adding lines to index from file #%d: %s' % (ifname, fname)
            with open(fname,'rt') as infile:
                for text in infile:
                    if len(text)==0:
                        print 'Reached EOF'
                        break # EOF
                    # CorpusReader.PAGE_NAME_PREFIX is <Page>
                    # all our corpus we manipulated them to have this tag as the start of a page
                    if text.startswith(CorpusReader.PAGE_NAME_PREFIX):
                        page_name = text[len(CorpusReader.PAGE_NAME_PREFIX):].strip()
                        section_name = None
                        num_pages += 1
                    elif text.startswith(CorpusReader.SECTION_NAME_PREFIX):
                        section_name = text[len(CorpusReader.SECTION_NAME_PREFIX):].strip()
                        num_sections += 1
                    else:
                        assert (page_name is not None) and (section_name is not None)
                        if self._parser is None:
                            luc_text = text
                        else:
                            # note that in our case we always have SimpleWordParser
                            section_words = self._parser.parse(text, calc_weights=False) #True)
                            luc_text = ' '.join(section_words)
                        # for each section, we add the whole section to the Lucene index; we store the text and make it searchable
                        # the page is not needed as a separate document here since we add documents section by section, not page by page
                        doc = Document()
                        # there is only one field for each document, which is the text field
                        # section_name is not used as a field
                        doc.add(Field("text", luc_text, Field.Store.YES, Field.Index.ANALYZED))
                        writer.addDocument(doc)
                    num_lines += 1
                    if num_lines % 100000 == 0:
                        print '    read %d lines so far: %d pages, %d sections' % (num_lines, num_pages, num_sections)

        print '   Finished - %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        writer.close()

    def search(self, words, max_docs, weight_func=lambda n: np.ones(n), score_func=lambda s: s):
        '''
        Search the index for the given words, return total score
        '''
        searcher = self._get_searcher()
        if type(words)==str:
            search_text = words
            search_text = AsciiConvertor.convert(search_text)
            for c in '/+-&|!(){}[]^"~*?:':
                search_text = search_text.replace('%s'%c, '\%s'%c)
        else:
            search_text = ' '.join(words)
        print 'search_text: %s' % search_text
        # note that whatever parser we pass as our argument, the Lucene QueryParser is what splits the query words at search time
        query = QueryParser(Version.LUCENE_CURRENT, "text", self._analyzer).parse(search_text)
        hits = searcher.search(query, max_docs)

        score_sum = 0.0
        weights = weight_func(len(hits.scoreDocs))
        for hit,weight in zip(hits.scoreDocs, weights):
            score_sum += weight * score_func(hit.score)
        return score_sum

    def _get_writer(self, analyzer=None, create=False):
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self._analyzer)
        if create:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        if self._similarity is not None:
            config.setSimilarity(self._similarity)
        writer = IndexWriter(self._store, config)
#......... part of the code omitted here .........
Developer: XihuanZeng, Project: kaggle, Lines: 103, Source: LuceneCorpus.py

Example 14: __init__

# Module to import: from org.apache.lucene.search import IndexSearcher [as alias]
# Or: from org.apache.lucene.search.IndexSearcher import setSimilarity [as alias]
    def __init__(self, tweets, storeDir, analyzer):

        # first, index the tweets
        if not path.exists(storeDir):
            mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.index_docs(tweets, writer)
        writer.commit()
        writer.close()

        # set up IndexSearcher
        reader = IndexReader.open(store)
        n_docs = reader.numDocs()
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(BM25Similarity())
        queryparser = QueryParser(Version.LUCENE_CURRENT, "contents", StandardAnalyzer(Version.LUCENE_CURRENT))

        # create document vectors
        doc_vectors = self.get_doc_vectors(reader, tweets, n_docs)

        cs_scorer = CosineSimilarityScorer(doc_vectors, reader, searcher, tweets)
        bm25_scorer = BM25Scorer(doc_vectors, searcher, queryparser)

        # find relevant tweets
        for fav_doc in (1, 26, 51):
            cs_scores = cs_scorer.get_scores(fav_doc)
            bm25_scores = bm25_scorer.get_scores(fav_doc)

            top_cs_scores = dict(sorted(cs_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])
            top_bm25_scores = dict(sorted(bm25_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])

            # print "top_cs_scores", top_cs_scores
            # print "top_bm25_scores", top_bm25_scores

            # calculate a composite score by multiplying cs scores by 100 and keeping bm25 scores as-is.
            # cs is bounded to 0.0-1.0, while a bm25 score is actually idf * bm25_similarity_score, so values
            # above 10.0 are not uncommon
            top_blended_scores = {}
            for key, value in top_cs_scores.iteritems():
                top_blended_scores[key] = value * 100.0

            for key, value in top_bm25_scores.iteritems():
                if key not in top_blended_scores:
                    top_blended_scores[key] = 0.0
                top_blended_scores[key] += value

            top_score = dict(sorted(top_blended_scores.iteritems(), key=itemgetter(1), reverse=True)[:1])

            # print "\n"
            # print "results for", fav_doc
            # print tweets[fav_doc]
            print searcher.doc(fav_doc).get("contents")
            print top_score

            # if the top score fails to reach 10.0, this result is probably not of high quality so onlyworthy
            # will decline to identify a relevant match
            if top_score.values()[0] < 10.0:
                print "skipping"
                continue

            # print tweets[top_score.keys()[0]]
            print searcher.doc(top_score.keys()[0]).get("contents")
            print "\n"
Developer: ryancutter, Project: OnlyWorthy, Lines: 71, Source: onlyworthy_dev.py


Note: The org.apache.lucene.search.IndexSearcher.setSimilarity method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors. For distribution and use, refer to the corresponding project's License; do not repost without permission.