当前位置: 首页>>代码示例>>Python>>正文


Python IndexWriter.optimize方法代码示例

本文整理汇总了Python中lucene.IndexWriter.optimize方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.optimize方法的具体用法?Python IndexWriter.optimize怎么用?Python IndexWriter.optimize使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在lucene.IndexWriter的用法示例。


在下文中一共展示了IndexWriter.optimize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
def index(string):
 lucene.initVM()
 indexDir = "REMOVEME.index-dir"
 dir = SimpleFSDirectory(File(indexDir))
 analyzer = StandardAnalyzer(Version.LUCENE_30)
 try:
  writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
 except lucene.JavaError:
  #print 'Inside Index Except'
  writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
#e = sys.exc_info()[0]
#print e
 #print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

 doc = Document()
 doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
 writer.addDocument(doc)
 #print 'In the index function'
 #print writer.numDocs()

#print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
#print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
 writer.optimize()
#print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
#print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
 #print 'ending Indexing'
 #print string 
 #print 'Total indexes'
 #print writer.numDocs() 
 writer.close()
开发者ID:kansal,项目名称:Sub-Event-Detection,代码行数:32,代码来源:subEventPylucene.py

示例2: update_index_withLineArray

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def update_index_withLineArray(self,array):
        """
        Parsed sentences (given in an array) are added to the index, with the corresponding two entities (x,y) and the DBpedia URI
        """
        print "start adding sentences"
        writer = IndexWriter(index_directory, analyzer, False, IndexWriter.MaxFieldLength(512))
        for item in array:
            line = item[0]
            x = item[1]
            y = item[2]
            uri = item[3]
            line=line.replace("\t"," ")
            line = line.replace("\n","  ")
            line = line.replace("   ","  ")
            try:
                
                doc = Document()
                doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
                
            except Exception:
                print "Unexpected error:", sys.exc_info()[0]
                raw_input("Error in updating the Sentences")
        try:
            writer.optimize()
        except:
            print "Unexpected error:", sys.exc_info()[0]
            print ("could not optimize index")

        writer.close()
        print "all sentences added"
开发者ID:swalter2,项目名称:knowledgeLexicalisation,代码行数:36,代码来源:LiveIndex.py

示例3: do_index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    # engine = data.engine_from_config()
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))

    # add field
    for doc in docs:
        # print repr(doc.description)
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        # descriptionValue ='中国 abc'
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        # lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
        writer.optimize()
    writer.close()
    print "index finished"
开发者ID:tongji1907,项目名称:woyaooo,代码行数:33,代码来源:docindexjob.py

示例4: configure_lucene

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
def configure_lucene():
    
    f = open('clique.txt','r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t','')
        line = line.replace('\r','')
        line = line.replace('\n','')
  	line = line.replace('^','')
    	line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
开发者ID:avinashkoulavkar,项目名称:GUI,代码行数:31,代码来源:app.py

示例5: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
	def index( self ):
		lucene.initVM()
		indexdir = SimpleFSDirectory( File( self.INDEX_DIR ) )
		analyzer = StandardAnalyzer( Version.LUCENE_30 )
		index_writer = IndexWriter( indexdir, analyzer, True, IndexWriter.MaxFieldLength( 512 ) )
		# read input files (.xml)
		for in_file in glob.glob( os.path.join( self.DOC_DIR, '*.xml' ) ):
			corpus = codecs.open( in_file, encoding='utf-8' ).read()
			d = pq( corpus, parser='html' )
			for text in d( 'Article' ).items():
				document = Document()
				# find ID
				art_id = str( text.attr( 'articleid' ).encode( 'utf-8' ) ).replace( '+', '-' )
				# find Title
				art_title = self.stem( str( text.attr( 'title' ).encode( 'utf-8' ) ) )
				# find Abstract
				art_abstract = self.stem( str( text.find( 'Abstract' ).html().encode('utf-8') ) )
				# find Keyword
				art_keyword = text.find( 'Keyword' ).html().encode('utf-8')
				# find Content
				art_content = self.stem( str( text.find( 'Content' ).html().encode('utf-8') ) )
				# find Authors
				art_authors = text.find( 'Authors' ).html().encode('utf-8')
				document.add( Field( 'id', art_id, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'title', art_title, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'content', art_content, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'authors', art_authors, Field.Store.YES, Field.Index.ANALYZED ) )
				document.add( Field( 'article', art_title + art_abstract + art_keyword + art_content,\
									 Field.Store.YES,\
									 Field.Index.ANALYZED ) )
				index_writer.addDocument( document )
			index_writer.optimize()
			index_writer.close()
开发者ID:farbod-s,项目名称:Noormags,代码行数:37,代码来源:indexing.py

示例6: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
 def index(self,path_to_index,path_files):
     'indexes anchor texts from a given folder'
     #lucene.initVM()
     indexDir = path_to_index
     directory_index = SimpleFSDirectory(File(indexDir))
     analyzer = StandardAnalyzer(Version.LUCENE_35)
     writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
     listOfPathes = []
     listOfPathes.extend(glob.glob(path_files+"*.txt"))
     counter = 0
     for path_to_file in listOfPathes:
         print path_to_file
         f = open(path_to_file,"r")
         for line in f:
             entry = line.split("\t")
             counter+=1
             """
             optimizes index after a certain amount of added documents
             """
             if counter%500000==0:
                 print counter
                 writer.optimize()
             doc = Document()
             doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
             doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
             doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
             doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED))
             writer.addDocument(doc)
         writer.optimize()
      
         f.close()
         
     writer.close()
     print counter
     print "done"
开发者ID:swalter2,项目名称:knowledgeLexicalisation,代码行数:37,代码来源:AnchorIndex.py

示例7: addDocuments

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def addDocuments(self, dir, isCompound):

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(isCompound)

        # change to adjust performance of indexing with FSDirectory
        # writer.mergeFactor = writer.mergeFactor
        # writer.maxMergeDocs = writer.maxMergeDocs
        # writer.minMergeDocs = writer.minMergeDocs

        for word in self.docs:
            doc = Document()
            doc.add(Field("keyword", word,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("unindexed", word,
                          Field.Store.YES, Field.Index.NO))
            doc.add(Field("unstored", word,
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("text", word,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
开发者ID:ustramooner,项目名称:python-lucenepp,代码行数:27,代码来源:CompoundVersusMultiFileIndexTest.py

示例8: index_files

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
def index_files (files, index_directory):
  lucene.initVM()
  d = SimpleFSDirectory(File(index_directory))
  analyzer = StandardAnalyzer(Version.LUCENE_30)
  writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
  for f in files:
    parse_file(f, writer)
  writer.optimize()
  writer.close()
开发者ID:CrawlingFingers,项目名称:ConcordiaCrawler,代码行数:11,代码来源:indexer.py

示例9: __init__

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def __init__(self, root, storeDir, analyzer):

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = FSDirectory.getDirectory(storeDir, True)
        writer = IndexWriter(store, analyzer, True)
        writer.setMaxFieldLength(1048576)
        self.indexDocs(root, writer)
        print 'optimizing index',
        writer.optimize()
        writer.close()
        print 'done'
开发者ID:christineyen,项目名称:location-extraction,代码行数:14,代码来源:IndexFiles.py

示例10: indexSingleFieldDocs

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def indexSingleFieldDocs(self, fields):

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for field in fields:
            doc = Document()
            doc.add(field)
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
开发者ID:bpgriner01,项目名称:pylucene,代码行数:14,代码来源:ScoreTest.py

示例11: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" % (dataDir)

        writer = IndexWriter(indexDir, StandardAnalyzer(), True, IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        numIndexed = cls.indexDirectory(writer, dataDir)
        writer.optimize()
        writer.close()

        return numIndexed
开发者ID:bpgriner01,项目名称:pylucene,代码行数:15,代码来源:FileIndexer.py

示例12: luceneIndexer

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
def luceneIndexer(docdir,indir):
	""" IndexDocuments from a directory.
	Args:
		docdir:文档所在文件夹
		indir:索引存放文件夹
	Returns:
		无返回值
	说明:
	FieldType().setStored=as-is value stored in the Lucene index
	FieldType().setTokenized=field is analyzed using the specified Analyzer - the tokens emitted are indexed
	FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
	FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion.
	"""
	
	"""#类型1属性:对于需要检索,需要返回显示setStored(True)
	type1 = FieldType()
	type1.setIndexed(True)
	type1.setStored(True)
	type1.setTokenized(False)
	type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
	#类型2属性:对于不用返回显示,但是需要进行检索的字段。这里我认为文本内容(content)是这一种的,通常例如文件的META信息。
	type2 = FieldType()
	type2.setIndexed(True)
	type2.setStored(False)
	type2.setTokenized(True)
	type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""
	
	lucene.initVM()
	DIRTOINDEX= docdir
	INDEXIDR= indir
	indexdir= SimpleFSDirectory(File(INDEXIDR))
	analyzer= StandardAnalyzer(Version.LUCENE_30)
	#用指定的语言分析器构造一个新的写索引器.
	index_writer= IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512))
	for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
	    #print "Indexing: "
		print "Indexing:", tfile;
		document = Document()
		content = open(tfile,'r').read()
		#类型使用方式
		#doc.add(Field("path", tfile, type1))
		
		#文档新增字段(Field){字段名:"text",存储:“YES”,索引:"YES"}
		document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED))
		document.add(Field("path",tfile,Field.Store.YES,Field.Index.ANALYZED))
		index_writer.addDocument(document)
		print "Done: ", tfile
	index_writer.optimize()
	print index_writer.numDocs()
	index_writer.close()
开发者ID:qiugen,项目名称:pylucene_demo,代码行数:52,代码来源:luceneInx.py

示例13: initDummyStore

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
 def initDummyStore(self, directory):
     """Open a dummy ramdirectory for testing."""
     writer = IndexWriter(directory, SimpleAnalyzer(), True)
     doc = Document()
     doc.add(Field("name", 'dummy.txt', Field.Store.YES,
                   Field.Index.UN_TOKENIZED))
     doc.add(Field("path", '/path/to/dummy.txt', Field.Store.YES,
                   Field.Index.UN_TOKENIZED))
     doc.add(Field("path", '/path/to/another/dummy.txt', Field.Store.YES,
                   Field.Index.UN_TOKENIZED))
     doc.add(Field("contents", "foo dummy bar", Field.Store.YES,
                   Field.Index.TOKENIZED))
     writer.addDocument(doc)
     writer.optimize()
     writer.close()
开发者ID:jjguy,项目名称:plush,代码行数:17,代码来源:plush.py

示例14: createIndex

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def createIndex(cls, dataDir, indexDir, useCompound):

        indexDir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(
            indexDir, StandardAnalyzer(Version.LUCENE_CURRENT), True, IndexWriter.MaxFieldLength.UNLIMITED
        )
        writer.setUseCompoundFile(useCompound)

        for dir, dirnames, filenames in os.walk(dataDir):
            for filename in filenames:
                if filename.endswith(".properties"):
                    cls.indexFile(writer, os.path.join(dir, filename), dataDir)

        writer.optimize()
        writer.close()
开发者ID:bpgriner01,项目名称:pylucene,代码行数:17,代码来源:TestDataDocumentHandler.py

示例15: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import optimize [as 别名]
    def index(self):

        dirPath = os.path.join(tempfile.gettempdir(),
                               "verbose-index")
        dir = FSDirectory.open(dirPath)
        writer = IndexWriter(dir, SimpleAnalyzer(), True)

        writer.setInfoStream(InfoStreamOut())

        for i in xrange(100):
            doc = Document()
            doc.add(Field("keyword", "goober",
                             Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)

        writer.optimize()
        writer.close()
开发者ID:pombredanne,项目名称:python-lucenepp,代码行数:19,代码来源:VerboseIndexing.py


注:本文中的lucene.IndexWriter.optimize方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。