当前位置: 首页>>代码示例>>Python>>正文


Python IndexWriter.numDocs方法代码示例

本文整理汇总了Python中lucene.IndexWriter.numDocs方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.numDocs方法的具体用法?Python IndexWriter.numDocs怎么用?Python IndexWriter.numDocs使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在lucene.IndexWriter的用法示例。


在下文中一共展示了IndexWriter.numDocs方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: configure_lucene

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def configure_lucene():
    
    f = open('clique.txt','r')
    lucene.initVM()
    print 'Inside Function'
    #indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    print >> sys.stderr, "Reading lines from sys.stdin..."
    for line in f:
        line = line.replace('\t','')
        line = line.replace('\r','')
        line = line.replace('\n','')
  	line = line.replace('^','')
    	line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
开发者ID:avinashkoulavkar,项目名称:GUI,代码行数:31,代码来源:app.py

示例2: luceneIndexer

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def luceneIndexer(docdir,indir):
	""" IndexDocuments from a directory.
	Args:
		docdir:文档所在文件夹
		indir:索引存放文件夹
	Returns:
		无返回值
	说明:
	FieldType().setStored=as-is value stored in the Lucene index
	FieldType().setTokenized=field is analyzed using the specified Analyzer - the tokens emitted are indexed
	FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
	FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion.
	"""
	
	"""#类型1属性:对于需要检索,需要返回显示setStored(True)
	type1 = FieldType()
	type1.setIndexed(True)
	type1.setStored(True)
	type1.setTokenized(False)
	type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
	#类型2属性:对于不用返回显示,但是需要进行检索的字段。这里我认为文本内容(content)是这一种的,通常例如文件的META信息。
	type2 = FieldType()
	type2.setIndexed(True)
	type2.setStored(False)
	type2.setTokenized(True)
	type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""
	
	lucene.initVM()
	DIRTOINDEX= docdir
	INDEXIDR= indir
	indexdir= SimpleFSDirectory(File(INDEXIDR))
	analyzer= StandardAnalyzer(Version.LUCENE_30)
	#用指定的语言分析器构造一个新的写索引器.
	index_writer= IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512))
	for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
	    #print "Indexing: "
		print "Indexing:", tfile;
		document = Document()
		content = open(tfile,'r').read()
		#类型使用方式
		#doc.add(Field("path", tfile, type1))
		
		#文档新增字段(Field){字段名:"text",存储:“YES”,索引:"YES"}
		document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED))
		document.add(Field("path",tfile,Field.Store.YES,Field.Index.ANALYZED))
		index_writer.addDocument(document)
		print "Done: ", tfile
	index_writer.optimize()
	print index_writer.numDocs()
	index_writer.close()
开发者ID:qiugen,项目名称:pylucene_demo,代码行数:52,代码来源:luceneInx.py

示例3: lucene_index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of index as of now.
    Arguments: Input folder for text files. output folder for index location
    Returns: void. The index is stored if generated.
    '''

    # Bug fix: ``logger`` was used below but never defined (NameError).
    logger = logging.getLogger(__name__)
    # Bug fix: basicConfig takes ``filename=``, not ``file=`` — the
    # original silently ignored the unknown kwarg on old Pythons.
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger.info("Input directory for logging: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    if not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # 1 GB initial / 2 GB max JVM heap for index generation.
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    # Optimization to reduce heap usage: merge the RAM buffer into the
    # on-disk index after every 15 docs, capping the buffer at 32 MB.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index.
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        # Bug fix: close the file handle instead of leaking it.
        fh = open(input_file, 'r')
        try:
            content = fh.read()
        finally:
            fh.close()
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))  # index only, do not store text
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))    # stored path to retrieve the file
        writer.addDocument(doc)

    logger.info("Indexed lines from " + input_folder + " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
开发者ID:clintpgeorge,项目名称:ediscovery,代码行数:48,代码来源:lucene_index.py

示例4: Indexer

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def Indexer(docdir,indir):
	lucene.initVM()
	DIRTOINDEX   = docdir
	INDEXDIR     = indir
	indexdir     = FSDirectory(File(INDEXDIR))
	analyzer     = StandardAnalyzer(VERSION.LUCENE_30)
	index_writer = IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512))
	for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
		print "Indexing ",tfile
		document=Document()
		content = open(tfile,'r').read()
		document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED))
		index_writer.addDocument(document)
		print "Done"
	index_writer.optimize()
	print index_writer.numDocs()
	index_writer.close()
开发者ID:liuyang1,项目名称:test,代码行数:19,代码来源:indexer.py

示例5: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def index(string, index_dir="REMOVEME.index-dir"):
    """Add *string* as a single document to a Lucene index.

    Args:
        string: text to index (stored + analyzed in the "text" field).
        index_dir: index location.  Defaults to the historical
            hard-coded path so existing callers are unaffected.
    """
    lucene.initVM()
    indexDir = index_dir
    store = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
开发者ID:kansal,项目名称:Sub-Event-Detection,代码行数:21,代码来源:indexL.py

示例6: luceneIndexer

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def luceneIndexer(docdir,indir):

	"""

	Index Documents from a dirrcory

	"""

	lucene.initVM()

	DIRTOINDEX = docdir

	INDEXIDR = indir

	indexdir = SimpleFSDirectory(File(INDEXIDR))

	analyzer = StandardAnalyzer(Version.LUCENE_30)

	index_writer = IndexWriter(indexdir,analyzer,True,\

	IndexWriter.MaxFieldLength(512))

	for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):

		print "Indexing: ", tfile

		document = Document()

		content = open(tfile,'r').read()

		document.add(Field("text",content,Field.Store.YES,\

		Field.Index.ANALYZED))

		index_writer.addDocument(document)

		print "Done: ", tfile

	index_writer.optimize()

	print index_writer.numDocs()

	index_writer.close()
开发者ID:ri0day,项目名称:gangster,代码行数:45,代码来源:pylucene_build_idx.py

示例7: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
    def index(cls, indexDir, dataDir):
        """Index every file under *dataDir* into a new index at *indexDir*.

        Delegates the per-file work to ``cls.indexDirectory``.

        Returns:
            The number of documents in the index after indexing.
        Raises:
            IOError: if *dataDir* does not exist or is not a directory.
        """
        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            # Call-form raise: identical in Python 2, also valid Python 3
            # (the original `raise IOError, msg` is Python-2-only syntax).
            raise IOError("%s does not exist or is not a directory" % (dataDir,))

        store = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(store, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        # One file per segment rather than compound-file format.
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.commit()
        writer.close()
        store.close()

        return numIndexed
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:20,代码来源:Indexer.py

示例8: SimpleFSDirectory

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
INDEXDIR = "texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse;")
for verse in cur.fetchall():
	print "Adding %s (version=%s)" % (verse[0],verse[1])
	doc = Document()
	doc.add(Field("reference", verse[0], Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("version", str(verse[1]), Field.Store.YES, Field.Index.ANALYZED))
	doc.add(Field("text", verse[2], Field.Store.YES, Field.Index.ANALYZED))
	writer.addDocument(doc)	
print 'Optimizing the index of %d documents...' % writer.numDocs()
writer.optimize()
print 'Closing the index'
writer.close()
'''
INDEXDIR = "greek.texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
el_analyzer = GreekAnalyzer(Version.LUCENE_35)
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, el_analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse where version_id in (2,3);")
for verse in cur.fetchall():
	print "Adding %s (version=%s)" % (verse[0],verse[1])
	doc = Document()
开发者ID:swasheck,项目名称:bible,代码行数:32,代码来源:create_index.py

示例9: import

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
from lucene import (SimpleFSDirectory, System, File,
    Document, Field, StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)

if __name__ == "__main__":
    lucene.initVM()
    fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"

    print "lucene version is:", lucene.VERSION

    fullIndex = SimpleFSDirectory(File(fullIndexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(fullIndex, analyzer, True, IndexWriter.MaxFieldLength(20000000))
##    writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

    print  "Currently there are %d documents in the index..." % writer.numDocs()

##    print  "Reading lines from sys.stdin..."
    lines=["bla bla bla bla bla","Erase una vez que se era", "En un lugar de La Mancha de cuyo nombre no quiero acordarme, no ha mucho que vivia un hidalgo de los de lanza en ristre", "Manchame mancha mancha que te mancha la mancha"]

    for l in lines:
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        metadata={"asdfa":"asdfa"}
        json_metadata=json.dumps(metadata)
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)

    print "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
开发者ID:danieldmm,项目名称:minerva,代码行数:32,代码来源:pylucene_test.py

示例10: testIndexWriter

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
    def testIndexWriter(self):
        """A writer opened over the existing index (create=False) must
        report one document per indexed keyword."""
        index_writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                                   IndexWriter.MaxFieldLength.UNLIMITED)
        self.assertEqual(len(self.keywords), index_writer.numDocs())
        index_writer.close()
开发者ID:pombredanne,项目名称:python-lucenepp,代码行数:8,代码来源:BaseIndexingTestCase.py

示例11: SimpleFSDirectory

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
    indexDir = "/Tmp/REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))

#    # set variables that affect speed of indexing
#    writer.setMergeFactor(int(argv[2]))
#    writer.setMaxMergeDocs(int(argv[3]))
#    writer.setMaxBufferedDocs(int(argv[4]))
#    # writer.infoStream = System.out
#
#    print "Merge factor:  ", writer.getMergeFactor()
#    print "Max merge docs:", writer.getMaxMergeDocs()
#    print "Max buffered docs:", writer.getMaxBufferedDocs()

    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()

    i = 0
    print >> sys.stderr, "Reading lines from sys.stdin..."
    for l in sys.stdin:
        i += 1

        if string.strip(l) == "": continue

        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)

        if i % 10000 == 0:
            print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (i, writer.numDocs())
            print >> sys.stderr, stats()
开发者ID:f00barin,项目名称:biased-text-sample,代码行数:33,代码来源:index-sentences.py


注:本文中的lucene.IndexWriter.numDocs方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。