This page collects typical usage examples of the Python method lucene.IndexWriter.optimize. If you have been wondering what IndexWriter.optimize does, how to call it, or what real code that uses it looks like, the curated samples below should help. You can also read more about the containing class, lucene.IndexWriter.
The following 15 code examples of IndexWriter.optimize are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
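All fifteen examples share the same skeleton, so here is a minimal sketch of that shared pattern before diving in. It assumes PyLucene 3.x, whose flat lucene namespace and IndexWriter(directory, analyzer, create, MaxFieldLength) constructor the examples below use (optimize() itself was deprecated in Lucene 3.5 and later replaced by forceMerge()); the index path is an arbitrary placeholder:

import lucene
from lucene import (initVM, SimpleFSDirectory, File, Document, Field,
                    StandardAnalyzer, IndexWriter, Version)

initVM()  # start the JVM before touching any Lucene class
store = SimpleFSDirectory(File("/tmp/demo-index"))  # placeholder path
writer = IndexWriter(store, StandardAnalyzer(Version.LUCENE_30), True,
                     IndexWriter.MaxFieldLength(512))
doc = Document()
doc.add(Field("text", "hello lucene", Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.optimize()  # merge all index segments into one
writer.close()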
Example 1: index
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # append to an existing index (create=False)
        writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # no index exists yet, so create one (create=True)
        writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
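The try/except in Example 1 hinges on the third IndexWriter argument, the create flag: False opens an existing index for appending and raises a JavaError when the index directory is missing or empty, at which point the handler recreates the index with True.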
Example 2: update_index_withLineArray
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def update_index_withLineArray(self, array):
    """
    Parsed sentences (given in an array) are added to the index, together
    with the corresponding two entities (x, y) and the DBpedia URI.
    """
    print "start adding sentences"
    writer = IndexWriter(index_directory, analyzer, False, IndexWriter.MaxFieldLength(512))
    for item in array:
        line = item[0]
        x = item[1]
        y = item[2]
        uri = item[3]
        # normalize whitespace before indexing
        line = line.replace("\t", " ")
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        try:
            doc = Document()
            doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception:
            print "Unexpected error:", sys.exc_info()[0]
            raw_input("Error in updating the Sentences")
    try:
        writer.optimize()
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "could not optimize index"
    writer.close()
    print "all sentences added"
Example 3: do_index
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))
    # map the fields of each database row onto a Lucene document
    for doc in docs:
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        print repr(descriptionValue)
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
Example 4: configure_lucene
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"  # was commented out in the original, but is required below
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    # strip separator and control characters from each line before indexing
    for line in f:
        line = line.replace('\t', '')
        line = line.replace('\r', '')
        line = line.replace('\n', '')
        line = line.replace('^', '')
        line = line.strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example 5: index
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def index(self):
    lucene.initVM()
    indexdir = SimpleFSDirectory(File(self.INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    # read input files (.xml)
    for in_file in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        corpus = codecs.open(in_file, encoding='utf-8').read()
        d = pq(corpus, parser='html')
        for text in d('Article').items():
            document = Document()
            # find ID
            art_id = str(text.attr('articleid').encode('utf-8')).replace('+', '-')
            # find Title
            art_title = self.stem(str(text.attr('title').encode('utf-8')))
            # find Abstract
            art_abstract = self.stem(str(text.find('Abstract').html().encode('utf-8')))
            # find Keyword
            art_keyword = text.find('Keyword').html().encode('utf-8')
            # find Content
            art_content = self.stem(str(text.find('Content').html().encode('utf-8')))
            # find Authors
            art_authors = text.find('Authors').html().encode('utf-8')
            document.add(Field('id', art_id, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('title', art_title, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('abstract', art_abstract, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('keyword', art_keyword, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('content', art_content, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('authors', art_authors, Field.Store.YES, Field.Index.ANALYZED))
            document.add(Field('article', art_title + art_abstract + art_keyword + art_content,
                               Field.Store.YES,
                               Field.Index.ANALYZED))
            index_writer.addDocument(document)
    index_writer.optimize()
    index_writer.close()
Example 6: index
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def index(self, path_to_index, path_files):
    """Index anchor texts from a given folder."""
    # lucene.initVM()  (the JVM is assumed to be initialized by the caller)
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after every 500,000 added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
Example 7: addDocuments
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example 8: index_files
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def index_files(files, index_directory):
    lucene.initVM()
    d = SimpleFSDirectory(File(index_directory))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(d, analyzer, True, IndexWriter.MaxFieldLength(512))
    for f in files:
        parse_file(f, writer)
    writer.optimize()
    writer.close()
Example 9: __init__
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def __init__(self, root, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = FSDirectory.getDirectory(storeDir, True)
    writer = IndexWriter(store, analyzer, True)
    writer.setMaxFieldLength(1048576)
    self.indexDocs(root, writer)
    print 'optimizing index',
    writer.optimize()
    writer.close()
    print 'done'
Example 10: indexSingleFieldDocs
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example 11: index
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % dataDir
    writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)
    numIndexed = cls.indexDirectory(writer, dataDir)
    writer.optimize()
    writer.close()
    return numIndexed
Example 12: luceneIndexer
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def luceneIndexer(docdir, indir):
    """Index documents from a directory.

    Args:
        docdir: folder containing the documents
        indir:  folder in which the index is stored
    Returns:
        nothing

    Notes:
        FieldType().setStored    = the as-is value is stored in the Lucene index
        FieldType().setTokenized = the field is analyzed using the specified Analyzer; the tokens emitted are indexed
        FieldType().Indexed      = the text (either as-is with keyword fields, or the tokens from tokenized fields) is made searchable (aka inverted)
        FieldType().Vectored     = the term frequency per document is stored in the index in an easily retrievable fashion
    """
    # Field type 1: fields that must be searchable and also returned for display, hence setStored(True).
    # type1 = FieldType()
    # type1.setIndexed(True)
    # type1.setStored(True)
    # type1.setTokenized(False)
    # type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # Field type 2: fields that need to be searchable but not returned for display; the body text
    # (content) falls in this category, as does, typically, a file's META information.
    # type2 = FieldType()
    # type2.setIndexed(True)
    # type2.setStored(False)
    # type2.setTokenized(True)
    # type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    lucene.initVM()
    DIRTOINDEX = docdir
    INDEXIDR = indir
    indexdir = SimpleFSDirectory(File(INDEXIDR))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    # construct a new IndexWriter with the chosen analyzer
    index_writer = IndexWriter(indexdir, analyzer, True, IndexWriter.MaxFieldLength(512))
    for tfile in glob.glob(os.path.join(DIRTOINDEX, '*.txt')):
        print "Indexing:", tfile
        document = Document()
        content = open(tfile, 'r').read()
        # with the field types above this would read:
        # doc.add(Field("path", tfile, type1))
        # add fields {name: "text"/"path", store: YES, index: ANALYZED}
        document.add(Field("text", content, Field.Store.YES, Field.Index.ANALYZED))
        document.add(Field("path", tfile, Field.Store.YES, Field.Index.ANALYZED))
        index_writer.addDocument(document)
        print "Done: ", tfile
    index_writer.optimize()
    print index_writer.numDocs()
    index_writer.close()
Example 13: initDummyStore
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def initDummyStore(self, directory):
    """Open a dummy RAMDirectory for testing."""
    writer = IndexWriter(directory, SimpleAnalyzer(), True)
    doc = Document()
    doc.add(Field("name", 'dummy.txt', Field.Store.YES,
                  Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/dummy.txt', Field.Store.YES,
                  Field.Index.UN_TOKENIZED))
    doc.add(Field("path", '/path/to/another/dummy.txt', Field.Store.YES,
                  Field.Index.UN_TOKENIZED))
    doc.add(Field("contents", "foo dummy bar", Field.Store.YES,
                  Field.Index.TOKENIZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example 14: createIndex
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(
        indexDir, StandardAnalyzer(Version.LUCENE_CURRENT), True,
        IndexWriter.MaxFieldLength.UNLIMITED
    )
    writer.setUseCompoundFile(useCompound)
    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith(".properties"):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)
    writer.optimize()
    writer.close()
Example 15: index
# Required import: from lucene import IndexWriter [as alias]
# or: from lucene.IndexWriter import optimize [as alias]
def index(self):
    dirPath = os.path.join(tempfile.gettempdir(),
                           "verbose-index")
    dir = FSDirectory.open(dirPath)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(InfoStreamOut())
    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()