本文整理汇总了Python中lucene.IndexWriter.setMergeFactor方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.setMergeFactor方法的具体用法?Python IndexWriter.setMergeFactor怎么用?Python IndexWriter.setMergeFactor使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lucene.IndexWriter
的用法示例。
在下文中一共展示了IndexWriter.setMergeFactor方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import setMergeFactor [as 别名]
def main(cls, argv):
if len(argv) < 5:
print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
return
docsInIndex = int(argv[1])
# create an index called 'index-dir' in a temp directory
indexDir = os.path.join(tempfile.gettempdir(),
'index-dir')
dir = FSDirectory.open(indexDir,)
analyzer = SimpleAnalyzer()
writer = IndexWriter(dir, analyzer, True)
# set variables that affect speed of indexing
writer.setMergeFactor(int(argv[2]))
writer.setMaxMergeDocs(int(argv[3]))
writer.setMaxBufferedDocs(int(argv[4]))
# writer.infoStream = tempfile.out
print "Merge factor: ", writer.getMergeFactor()
print "Max merge docs:", writer.getMaxMergeDocs()
print "Max buffered docs:", writer.getMaxBufferedDocs()
start = time()
for i in xrange(docsInIndex):
doc = Document()
doc.add(Field("fieldname", "Bibamus",
Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()
print "Time: ", timedelta(seconds=time() - start)
示例2: lucene_index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import setMergeFactor [as 别名]
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.

    Doesn't support incremental generation of index as of now.
    Currently crashes on neo by running out of heap space.

    Arguments: input_folder  -- folder containing the text files to index.
               output_folder -- folder where the index (and log) is written.
    Returns: void. The index is stored if generated.
    '''
    # The log file lives inside output_folder, so the folder must exist
    # BEFORE logging is configured (the original created it afterwards).
    creating_output = not os.path.isdir(output_folder)
    if creating_output:
        os.makedirs(output_folder)

    # Setting up log file.
    # 'filename' is the correct basicConfig kwarg; the original passed
    # 'file=', which basicConfig does not recognise, so no log file was made.
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger = logging.getLogger(__name__)  # 'logger' was used below but never defined
    logger.info("Input directory for logging: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    if creating_output:
        logger.debug("Making output directory for index: " + output_folder)

    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    # Optimization to reduce heap space usage for generation of index.
    # Merges buffer with current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        # 'with' closes the handle promptly (the original leaked it).
        with open(input_file, 'r') as fh:
            content = fh.read()
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))  # Do not store text. Only index.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))    # Store path to assist in retreiving the file
        writer.addDocument(doc)  # Index

    logger.info("Indexed lines from " + input_folder + " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")