This article collects typical usage examples of the IndexWriter.commit method from the Python lucene module. If you are unsure what IndexWriter.commit does in Python, how to call it, or what real usage looks like, the curated code examples below may help. You can also read further about the containing class, lucene.IndexWriter.
The following 15 code examples of IndexWriter.commit are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
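Before the individual examples, here is a minimal, self-contained sketch of the usual commit pattern. It assumes the old flat-namespace PyLucene API (roughly the 2.x/3.x era) that the examples below use; the in-memory directory, the "contents" field name, and the sample text are illustrative only, not taken from any particular example.

# Minimal sketch of the typical IndexWriter.commit() pattern (assumed
# PyLucene 2.x/3.x flat-namespace API; field name and text are made up).
from lucene import (initVM, RAMDirectory, StandardAnalyzer, Version,
                    IndexWriter, Document, Field)

if __name__ == '__main__':
    initVM()                    # start the embedded JVM before any Lucene call
    directory = RAMDirectory()  # in-memory index; use SimpleFSDirectory for disk
    writer = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)

    doc = Document()
    doc.add(Field("contents", "hello lucene",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    # commit() flushes buffered documents and makes them visible to readers
    # opened afterwards; close() also commits, but an explicit commit lets
    # you keep the writer open and continue adding documents.
    writer.commit()
    writer.close()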
Example 1: addDocuments
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def addDocuments(self, dir):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    #
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    #
    for word in self.docs:
        doc = Document()
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
Example 2: indexSingleFieldDocs
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.commit()
    writer.close()
Example 3: index
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)
    writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(False)
    numIndexed = cls.indexDirectory(writer, dataDir)
    writer.commit()
    writer.close()
    return numIndexed
Example 4: index
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def index(self):
    dirPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                           "verbose-index")
    dir = FSDirectory.getDirectory(dirPath, True)
    writer = IndexWriter(dir, SimpleAnalyzer(), True)
    writer.setInfoStream(System.out)
    for i in xrange(100):
        doc = Document()
        doc.add(Field("keyword", "goober",
                      Field.Store.YES, Field.Index.UN_TOKENIZED))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
Example 5: index
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def index(cls, indexDir, dataDir):
    if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
        raise IOError, "%s does not exist or is not a directory" % (dataDir)
    dir = SimpleFSDirectory(File(indexDir))
    writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(False)
    cls.indexDirectory(writer, dataDir)
    numIndexed = writer.numDocs()
    writer.commit()
    writer.close()
    dir.close()
    return numIndexed
Example 6: main
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def main(cls, argv):
    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return
    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c
    indexDir = argv[1]
    t9dir = argv[2]
    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"
    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)
    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue
        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id
    writer.commit()
    writer.close()
    reader.close()
Example 7: __init__
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def __init__(self, storeDir, analyzer):
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    self.indexDocs(writer)
    ticker = Ticker()
    print('optimizing index'),
    threading.Thread(target=ticker.run).start()
    writer.commit()
    writer.close()
    ticker.tick = False
    print('done')
Example 8: addDocuments
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def addDocuments(self, dir):
    writer = IndexWriter(dir, self.getAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.setUseCompoundFile(self.isCompound())
    for i in xrange(len(self.keywords)):
        doc = Document()
        doc.add(Field("id", self.keywords[i],
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", self.unindexed[i],
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", self.unstored[i],
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", self.text[i],
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
Example 9: createIndex
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def createIndex(cls, dataDir, indexDir, useCompound):
    indexDir = SimpleFSDirectory(File(indexDir))
    config = IndexWriterConfig(Version.LUCENE_CURRENT,
                               StandardAnalyzer(Version.LUCENE_CURRENT))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)
    config = writer.getConfig()
    mp = config.getMergePolicy()
    if LogMergePolicy.instance_(mp):
        mp.setUseCompoundFile(useCompound)
    for dir, dirnames, filenames in os.walk(dataDir):
        for filename in filenames:
            if filename.endswith('.properties'):
                cls.indexFile(writer, os.path.join(dir, filename), dataDir)
    writer.commit()
    writer.close()
Example 10: testDeleteAfterIndexMerge
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def testDeleteAfterIndexMerge(self):
    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())
    reader.deleteDocument(1)
    reader.close()
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # merge segments so the deleted document is expunged; without this
    # the maxDoc() == 1 assertion below would not hold
    writer.optimize()
    writer.commit()
    writer.close()
    reader = IndexReader.open(self.dir, True)
    deletedDocs = MultiFields.getDeletedDocs(reader)
    self.assert_(deletedDocs is None or not deletedDocs.get(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())
    reader.close()
Example 11: indexTaxonomy
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def indexTaxonomy(taxonomy, index_path):
    lucene.initVM()
    index_location = index_path
    dir = SimpleFSDirectory(lucene.File(index_location))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(1024))
    for i in taxonomy:
        v = taxonomy[i]
        doc = lucene.Document()
        doc.add(lucene.Field('name', v['name'], lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('id', v['id'], lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('alias', json.dumps(v['alias']), lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('type', v['type'], lucene.Field.Store.YES, lucene.Field.Index.NO))
        doc.add(lucene.Field('contained_by', json.dumps(v['contained_by']), lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('images', json.dumps(v['images']), lucene.Field.Store.YES, lucene.Field.Index.NO))
        writer.addDocument(doc)
    writer.commit()
    writer.close()
Example 12: testUpdate
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def testUpdate(self):
    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))
    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()
    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia",
                  Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))
Example 13: main
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def main(cls, argv):
    if len(argv) < 2:
        print "Usage: BerkeleyDbIndexer <index dir> -create"
        return
    dbHome = argv[1]
    create = len(argv) > 2 and argv[2] == "-create"
    if not os.path.exists(dbHome):
        os.makedirs(dbHome)
    elif create:
        for name in os.listdir(dbHome):
            if name.startswith('__'):
                os.remove(os.path.join(dbHome, name))
    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1)
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)
    env.open(dbHome, (DB_CREATE | DB_THREAD |
                      DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)
    index = DB(env)
    blocks = DB(env)
    txn = None
    try:
        txn = env.txn_begin(None)
        index.open(filename='__index__', dbtype=DB_BTREE,
                   flags=DB_CREATE | DB_THREAD, txn=txn)
        blocks.open(filename='__blocks__', dbtype=DB_BTREE,
                    flags=DB_CREATE | DB_THREAD, txn=txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None
    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        writer = IndexWriter(directory, StandardAnalyzer(), create,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)
        doc = Document()
        doc.add(Field("contents", "The quick brown fox...",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.commit()
        writer.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
    index.close()
    blocks.close()
    env.close()
    print "Indexing Complete"
Example 14: initVM
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
if __name__ == '__main__':
    initVM()
    directory = RAMDirectory()
    iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                          True, IndexWriter.MaxFieldLength.LIMITED)
    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be index"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()
    ireader = IndexReader.open(directory, True)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))
    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(), xrange(100000)):
        print 'term %s' % t
        print '  freq: %i' % f
        try:
            print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
        except:
            print '  no pos'
        try:
            print '  off: ' + \
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tpv.getOffsets(i)])
        except:
            print '  no offsets'
Example 15: index_plain_text_emails
# Required import: from lucene import IndexWriter [as alias]
# Or: from lucene.IndexWriter import commit [as alias]
def index_plain_text_emails(data_folder,
                            path_index_file, store_dir,
                            lemmatize=False, stem=False,
                            nonascii=True):
    '''
    Indexes all the plain text emails in the input directory
    and stores the index in the store_dir

    Arguments:
        data_folder - input directory absolute path
        path_index_file - file paths index file
        store_dir - index store directory absolute path

    Returns:
        None
    '''
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)
    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d'
                 % (stem, lemmatize, len(file_tuples)))
    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED)
    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii
    for ft in file_tuples:
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)
        # parses the emails in plain text format
        receiver, sender, cc, subject, message_text, bcc, date, email_text = \
            parse_plain_text_email(file_path,
                                   tokenize=True,
                                   lemmatize=lemmatize,
                                   stem=stem,
                                   nonascii=nonascii)
        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - Added BCC field in indexing.
        doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - Added Email-Date field in indexing
        doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)
        # Adds all document fields as a separate index so that we can search through them
        doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)
    writer.commit()
    writer.close()
    logging.info('Lucene: All files are indexed.')