本文整理汇总了Python中lucene.IndexWriter.numDocs方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.numDocs方法的具体用法?Python IndexWriter.numDocs怎么用?Python IndexWriter.numDocs使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lucene.IndexWriter
的用法示例。
在下文中一共展示了IndexWriter.numDocs方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: configure_lucene
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def configure_lucene():
f = open('clique.txt','r')
lucene.initVM()
print 'Inside Function'
#indexDir = "/tmp/luceneindex"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
print >> sys.stderr, "Reading lines from sys.stdin..."
for line in f:
line = line.replace('\t','')
line = line.replace('\r','')
line = line.replace('\n','')
line = line.replace('^','')
line = line.strip()
doc = Document()
doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
writer.optimize()
print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
writer.close()
示例2: luceneIndexer
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def luceneIndexer(docdir,indir):
""" IndexDocuments from a directory.
Args:
docdir:文档所在文件夹
indir:索引存放文件夹
Returns:
无返回值
说明:
FieldType().setStored=as-is value stored in the Lucene index
FieldType().setTokenized=field is analyzed using the specified Analyzer - the tokens emitted are indexed
FieldType().Indexed = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
FieldType().Vectored = term frequency per document is stored in the index in an easily retrievable fashion.
"""
"""#类型1属性:对于需要检索,需要返回显示setStored(True)
type1 = FieldType()
type1.setIndexed(True)
type1.setStored(True)
type1.setTokenized(False)
type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
#类型2属性:对于不用返回显示,但是需要进行检索的字段。这里我认为文本内容(content)是这一种的,通常例如文件的META信息。
type2 = FieldType()
type2.setIndexed(True)
type2.setStored(False)
type2.setTokenized(True)
type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)"""
lucene.initVM()
DIRTOINDEX= docdir
INDEXIDR= indir
indexdir= SimpleFSDirectory(File(INDEXIDR))
analyzer= StandardAnalyzer(Version.LUCENE_30)
#用指定的语言分析器构造一个新的写索引器.
index_writer= IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512))
for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
#print "Indexing: "
print "Indexing:", tfile;
document = Document()
content = open(tfile,'r').read()
#类型使用方式
#doc.add(Field("path", tfile, type1))
#文档新增字段(Field){字段名:"text",存储:“YES”,索引:"YES"}
document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED))
document.add(Field("path",tfile,Field.Store.YES,Field.Index.ANALYZED))
index_writer.addDocument(document)
print "Done: ", tfile
index_writer.optimize()
print index_writer.numDocs()
index_writer.close()
示例3: lucene_index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6.
    Doesn't support incremental generation of index as of now.
    Currently crashes on neo by running out of heap space.
    Arguments: Input folder for text files. output folder for index location
    Returns: void. The index is stored if generated.
    '''
    # Bug fix: ``logger`` was used below but never defined (the code mixed
    # module-level ``logging.*`` calls with an undefined ``logger``).
    logger = logging.getLogger(__name__)
    # Create the output directory *before* configuring file logging, so the
    # log file inside it can actually be opened.
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    # Bug fix: basicConfig takes ``filename=``, not ``file=``.  The bogus
    # keyword was silently ignored on Python 2.7, so nothing was logged
    # to the intended file.
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logger.info("Input directory for logging: " + input_folder)
    logger.info("Output directory of index: " + output_folder)
    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage for generation of index:
    # merge the RAM buffer with the on-disk index after 15 docs / 32 MB.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)
    # Search to find the files to index.
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        # Bug fix: close the file handle instead of leaking one per document.
        fh = open(input_file, 'r')
        try:
            content = fh.read()
        finally:
            fh.close()
        # Do not store text. Only index it.
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))
        # Store path to assist in retrieving the file.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)  # Index
    logger.info("Indexed lines from " + input_folder +
                " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
示例4: Indexer
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def Indexer(docdir,indir):
lucene.initVM()
DIRTOINDEX = docdir
INDEXDIR = indir
indexdir = FSDirectory(File(INDEXDIR))
analyzer = StandardAnalyzer(VERSION.LUCENE_30)
index_writer = IndexWriter(indexdir,analyzer,True,IndexWriter.MaxFieldLength(512))
for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
print "Indexing ",tfile
document=Document()
content = open(tfile,'r').read()
document.add(Field("text",content,Field.Store.YES,Field.Index.ANALYZED))
index_writer.addDocument(document)
print "Done"
index_writer.optimize()
print index_writer.numDocs()
index_writer.close()
示例5: index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def index(string):
    """Add ``string`` as one analyzed, stored document to a fresh on-disk index.

    The index is (re)created under ``REMOVEME.index-dir``; progress messages
    are written to stderr.
    """
    lucene.initVM()
    store = SimpleFSDirectory(File("REMOVEME.index-dir"))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_30),
                         True,  # recreate the index from scratch
                         IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    document = Document()
    document.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
示例6: luceneIndexer
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def luceneIndexer(docdir,indir):
"""
Index Documents from a dirrcory
"""
lucene.initVM()
DIRTOINDEX = docdir
INDEXIDR = indir
indexdir = SimpleFSDirectory(File(INDEXIDR))
analyzer = StandardAnalyzer(Version.LUCENE_30)
index_writer = IndexWriter(indexdir,analyzer,True,\
IndexWriter.MaxFieldLength(512))
for tfile in glob.glob(os.path.join(DIRTOINDEX,'*.txt')):
print "Indexing: ", tfile
document = Document()
content = open(tfile,'r').read()
document.add(Field("text",content,Field.Store.YES,\
Field.Index.ANALYZED))
index_writer.addDocument(document)
print "Done: ", tfile
index_writer.optimize()
print index_writer.numDocs()
index_writer.close()
示例7: index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def index(cls, indexDir, dataDir):
    """Build an index of ``dataDir`` under ``indexDir``.

    Returns:
        The number of documents indexed.
    Raises:
        IOError: if ``dataDir`` does not exist or is not a directory.
    """
    # os.path.isdir() is already False for missing paths, so a single
    # check covers both "does not exist" and "not a directory".
    if not os.path.isdir(dataDir):
        raise IOError("%s does not exist or is not a directory" % (dataDir))
    store = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(store,
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True,  # recreate the index from scratch
                         IndexWriter.MaxFieldLength.LIMITED)
    # Keep individual index files rather than a compound file.
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.commit()
    writer.close()
    store.close()
    return numIndexed
示例8: SimpleFSDirectory
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
INDEXDIR = "texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse;")
for verse in cur.fetchall():
print "Adding %s (version=%s)" % (verse[0],verse[1])
doc = Document()
doc.add(Field("reference", verse[0], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("version", str(verse[1]), Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("text", verse[2], Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print 'Optimizing the index of %d documents...' % writer.numDocs()
writer.optimize()
print 'Closing the index'
writer.close()
# NOTE(review): the ``'''`` below opens a commented-out variant of the script
# above that uses GreekAnalyzer over ``greek.texts.index``; its closing
# triple-quote lies beyond this excerpt, so the string is left untouched.
'''
INDEXDIR = "greek.texts.index"
dir = SimpleFSDirectory(File(INDEXDIR))
el_analyzer = GreekAnalyzer(Version.LUCENE_35)
analyzer = SimpleAnalyzer(Version.LUCENE_35)
writer = IndexWriter(dir, el_analyzer, True, IndexWriter.MaxFieldLength(512))
conn = psycopg2.connect("dbname=texts user=swasheck")
cur = conn.cursor()
cur.execute("select reference, version_id, analysis_text from verse where version_id in (2,3);")
for verse in cur.fetchall():
print "Adding %s (version=%s)" % (verse[0],verse[1])
doc = Document()
示例9: import
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
from lucene import (SimpleFSDirectory, System, File,
Document, Field, StandardAnalyzer, IndexWriter, IndexSearcher, QueryParser)
if __name__ == "__main__":
lucene.initVM()
fullIndexDir = r"c:\NLP\PhD\bob\fileDB\LuceneFullIndex"
print "lucene version is:", lucene.VERSION
fullIndex = SimpleFSDirectory(File(fullIndexDir))
analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
writer = IndexWriter(fullIndex, analyzer, True, IndexWriter.MaxFieldLength(20000000))
## writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))
print "Currently there are %d documents in the index..." % writer.numDocs()
## print "Reading lines from sys.stdin..."
lines=["bla bla bla bla bla","Erase una vez que se era", "En un lugar de La Mancha de cuyo nombre no quiero acordarme, no ha mucho que vivia un hidalgo de los de lanza en ristre", "Manchame mancha mancha que te mancha la mancha"]
for l in lines:
doc = Document()
doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
metadata={"asdfa":"asdfa"}
json_metadata=json.dumps(metadata)
doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
writer.addDocument(doc)
print "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
print "About to optimize index of %d documents..." % writer.numDocs()
writer.optimize()
示例10: testIndexWriter
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
def testIndexWriter(self):
    """A writer opened over the fixture index reports one doc per keyword."""
    # False -> open the existing index rather than recreating it.
    w = IndexWriter(self.dir, self.getAnalyzer(), False,
                    IndexWriter.MaxFieldLength.UNLIMITED)
    self.assertEqual(len(self.keywords), w.numDocs())
    w.close()
示例11: SimpleFSDirectory
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import numDocs [as 别名]
# Script fragment: index every non-blank line read from stdin into a fresh
# index at /Tmp/REMOVEME.index-dir, reporting progress every 10000 lines.
# NOTE(review): extraction flattened the original indentation -- the lines
# from ``i += 1`` through the progress print belong inside the ``for`` loop.
indexDir = "/Tmp/REMOVEME.index-dir"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(Version.LUCENE_30)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
# # set variables that affect speed of indexing
# writer.setMergeFactor(int(argv[2]))
# writer.setMaxMergeDocs(int(argv[3]))
# writer.setMaxBufferedDocs(int(argv[4]))
# # writer.infoStream = System.out
#
# print "Merge factor: ", writer.getMergeFactor()
# print "Max merge docs:", writer.getMaxMergeDocs()
# print "Max buffered docs:", writer.getMaxBufferedDocs()
print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
i = 0
print >> sys.stderr, "Reading lines from sys.stdin..."
for l in sys.stdin:
i += 1
# Skip blank lines (``string.strip`` is the deprecated module-function
# spelling of ``l.strip()``).
if string.strip(l) == "": continue
doc = Document()
doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
# Progress report every 10000 input lines.
if i % 10000 == 0:
print >> sys.stderr, "Read %d lines from stdin (%d documents in index)..." % (i, writer.numDocs())
# ``stats()`` is defined elsewhere in the original script (not shown here).
print >> sys.stderr, stats()