当前位置: 首页>>代码示例>>Python>>正文


Python IndexWriter.commit方法代码示例

本文整理汇总了Python中lucene.IndexWriter.commit方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.commit方法的具体用法?Python IndexWriter.commit怎么用?Python IndexWriter.commit使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在lucene.IndexWriter的用法示例。


在下文中一共展示了IndexWriter.commit方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: addDocuments

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def addDocuments(self, dir):
        """Create a fresh index in *dir* with one document per word in
        self.docs, each carrying the same word under four differently
        stored/indexed fields, then commit and close the writer."""

        writer = IndexWriter(dir, SimpleAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        # NOTE: writer.mergeFactor / maxMergeDocs / minMergeDocs could be
        # tuned here to adjust FSDirectory indexing performance.

        # Field name plus its storage/indexing policy, in the exact order
        # the fields are added to each document.
        field_specs = (
            ("keyword",   Field.Store.YES, Field.Index.NOT_ANALYZED),
            ("unindexed", Field.Store.YES, Field.Index.NO),
            ("unstored",  Field.Store.NO,  Field.Index.ANALYZED),
            ("text",      Field.Store.YES, Field.Index.ANALYZED),
        )

        for word in self.docs:
            doc = Document()
            for field_name, store, index in field_specs:
                doc.add(Field(field_name, word, store, index))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:28,代码来源:FSversusRAMDirectoryTest.py

示例2: indexSingleFieldDocs

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def indexSingleFieldDocs(self, fields):
        """Write one single-field document per entry of *fields* into a
        freshly created index on self.directory, then commit and close."""

        writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)

        for single_field in fields:
            document = Document()
            document.add(single_field)
            writer.addDocument(document)

        writer.commit()
        writer.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:14,代码来源:ScoreTest.py

示例3: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" %(dataDir)

        writer = IndexWriter(indexDir, StandardAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(False)

        numIndexed = cls.indexDirectory(writer, dataDir)
        writer.commit()
        writer.close()

        return numIndexed
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:16,代码来源:FileIndexer.py

示例4: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def index(self):
        """Build a 100-document throwaway index under the JVM temp dir,
        routing Lucene's internal progress messages to stdout."""

        indexPath = os.path.join(System.getProperty("java.io.tmpdir", "tmp"),
                                 "verbose-index")
        directory = FSDirectory.getDirectory(indexPath, True)

        writer = IndexWriter(directory, SimpleAnalyzer(), True)
        # Verbose mode: Lucene prints merge/flush activity to System.out.
        writer.setInfoStream(System.out)

        added = 0
        while added < 100:
            doc = Document()
            doc.add(Field("keyword", "goober",
                          Field.Store.YES, Field.Index.UN_TOKENIZED))
            writer.addDocument(doc)
            added += 1

        writer.commit()
        writer.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:19,代码来源:VerboseIndexing.py

示例5: index

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def index(cls, indexDir, dataDir):

        if not (os.path.exists(dataDir) and os.path.isdir(dataDir)):
            raise IOError, "%s does not exist or is not a directory" %(dataDir)

        dir = SimpleFSDirectory(File(indexDir))
        writer = IndexWriter(dir, StandardAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        writer.setUseCompoundFile(False)

        cls.indexDirectory(writer, dataDir)

        numIndexed = writer.numDocs()
        writer.commit()
        writer.close()
        dir.close()

        return numIndexed
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:20,代码来源:Indexer.py

示例6: main

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def main(cls, argv):
        
        if len(argv) != 3:
            print "Usage: T9er <WordNet index dir> <t9 index>"
            return
        
        for key in cls.keys:
            c = key[0]
            k = key[1:]
            for kc in k:
                cls.keyMap[kc] = c
                print kc, "=", c

        indexDir = argv[1]
        t9dir = argv[2]

        reader = IndexReader.open(indexDir)

        numDocs = reader.maxDoc()
        print "Processing", numDocs, "words"

        writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

        for id in xrange(reader.maxDoc()):
            origDoc = reader.document(id)
            word = origDoc.get("word")
            if word is None or len(word) == 0:
                continue

            newDoc = Document()
            newDoc.add(Field("word", word,
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(Field("t9", cls.t9(word),
                             Field.Store.YES, Field.Index.UN_TOKENIZED))
            newDoc.add(Field("length", str(len(word)),
                             Field.Store.NO, Field.Index.UN_TOKENIZED))
            writer.addDocument(newDoc)
            if id % 100 == 0:
                print "Document", id

        writer.commit()
        writer.close()

        reader.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:46,代码来源:T9er.py

示例7: __init__

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def __init__(self, storeDir, analyzer):
        """Build (or overwrite) a Lucene index under *storeDir*.

        storeDir -- index directory path; created if missing.
        analyzer -- base analyzer; wrapped so no field exceeds 1048576
                    tokens, guarding against pathological documents.
        """

        if not os.path.exists(storeDir):
            os.mkdir(storeDir)
        store = SimpleFSDirectory(File(storeDir))

        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        # CREATE mode: always rebuild the index from scratch.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(store, config)

        self.indexDocs(writer)

        # Show a progress ticker on stdout while the (potentially slow)
        # commit runs in this thread.
        ticker = Ticker()
        print(  'optimizing index' ),
        threading.Thread(target=ticker.run).start()
        try:
            writer.commit()
            writer.close()
        finally:
            # Always stop the ticker thread, even when commit()/close()
            # raises; otherwise the background thread would spin forever.
            ticker.tick = False
        print(  'done' )

示例8: addDocuments

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def addDocuments(self, dir):
        """Create a fresh index in *dir* from the parallel test-data lists
        self.keywords / self.unindexed / self.unstored / self.text, one
        document per position, then commit and close the writer."""

        writer = IndexWriter(dir, self.getAnalyzer(), True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.setUseCompoundFile(self.isCompound())

        # The four lists are parallel: row i describes one document.
        rows = zip(self.keywords, self.unindexed, self.unstored, self.text)
        for keyword, country, contents, city in rows:
            doc = Document()
            doc.add(Field("id", keyword,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("country", country,
                          Field.Store.YES, Field.Index.NO))
            doc.add(Field("contents", contents,
                          Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("city", city,
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

        writer.commit()
        writer.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:22,代码来源:BaseIndexingTestCase.py

示例9: createIndex

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def createIndex(cls, dataDir, indexDir, useCompound):
        """Index every *.properties file found under *dataDir* into a new
        index at *indexDir*; *useCompound* toggles the compound file
        format on the merge policy when a LogMergePolicy is in effect."""

        store = SimpleFSDirectory(File(indexDir))
        config = IndexWriterConfig(Version.LUCENE_CURRENT,
                             StandardAnalyzer(Version.LUCENE_CURRENT))
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        writer = IndexWriter(store, config)

        mergePolicy = writer.getConfig().getMergePolicy()
        # Only LogMergePolicy exposes the compound-file switch.
        if LogMergePolicy.instance_(mergePolicy):
            mergePolicy.setUseCompoundFile(useCompound)

        for root, _dirnames, filenames in os.walk(dataDir):
            for filename in filenames:
                if not filename.endswith('.properties'):
                    continue
                cls.indexFile(writer, os.path.join(root, filename), dataDir)

        writer.commit()
        writer.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:23,代码来源:TestDataDocumentHandler.py

示例10: testDeleteAfterIndexMerge

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def testDeleteAfterIndexMerge(self):
        """Deletions made through an IndexReader are expunged once an
        IndexWriter is opened and committed on the same directory:
        maxDoc drops from 2 to 1 and hasDeletions() becomes false."""

        # Open a writable reader (readOnly=False) and delete doc #1.
        reader = IndexReader.open(self.dir, False)
        self.assertEqual(2, reader.maxDoc())
        self.assertEqual(2, reader.numDocs())
        reader.deleteDocument(1)
        reader.close()

        # Re-open a writer on the existing index (create=False) and commit;
        # this is expected to merge the pending deletion out of the index.
        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        writer.commit()
        writer.close()

        # A fresh read-only reader must no longer see the deleted document.
        reader = IndexReader.open(self.dir, True)
        deletedDocs = MultiFields.getDeletedDocs(reader)

        # Either no deleted-docs bitset remains at all, or bit 1 is clear.
        self.assert_(deletedDocs is None or not deletedDocs.get(1))
        self.assert_(not reader.hasDeletions())
        self.assertEqual(1, reader.maxDoc())
        self.assertEqual(1, reader.numDocs())

        reader.close()
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:24,代码来源:DocumentDeleteTest.py

示例11: indexTaxonomy

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
def indexTaxonomy(taxonomy, index_path):
    """Build a new Lucene index at *index_path* from a taxonomy mapping.

    taxonomy   -- mapping of key -> entry dict; each entry is expected to
                  provide 'name', 'id', 'alias', 'type', 'contained_by'
                  and 'images' (list-valued fields are JSON-encoded).
    index_path -- directory where the index is (re)created.
    Returns None.
    """
    lucene.initVM()

    store = SimpleFSDirectory(lucene.File(index_path))
    analyzer = StandardAnalyzer(Version.LUCENE_30)

    writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(1024))

    for key in taxonomy:
        entry = taxonomy[key]
        doc = lucene.Document()
        doc.add(lucene.Field('name', entry['name'] , lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('id', entry['id'] , lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('alias', json.dumps(entry['alias']) , lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('type', entry['type'] , lucene.Field.Store.YES, lucene.Field.Index.NO))
        doc.add(lucene.Field('contained_by', json.dumps(entry['contained_by']) , lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
        doc.add(lucene.Field('images', json.dumps(entry['images']) , lucene.Field.Store.YES, lucene.Field.Index.NO))
        writer.addDocument(doc)

    # Commit once after all documents are added. The original committed
    # inside the loop, forcing a full flush for every single entry.
    writer.commit()
    writer.close()
开发者ID:linares,项目名称:fwiki,代码行数:24,代码来源:FreebaseIndex.py

示例12: testUpdate

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def testUpdate(self):
        """An 'update' in Lucene is delete-then-add: remove the Amsterdam
        document through a reader, add a St. Petersburg document through a
        writer, then verify the hit counts reflect the swap."""

        self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

        # Delete the old document via a writable reader (readOnly=False).
        reader = IndexReader.open(self.dir, False)
        reader.deleteDocuments(Term("city", "Amsterdam"))
        reader.close()

        # Append the replacement document; create=False keeps the index.
        writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                             IndexWriter.MaxFieldLength.UNLIMITED)
        doc = Document()
        doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("country", "Russia",
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("contents", "St. Petersburg has lots of bridges",
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("city", "St. Petersburg",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        writer.commit()
        writer.close()

        self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
        self.assertEqual(1, self.getHitCount("city", "Petersburg"))

示例13: main

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
    def main(cls, argv):
        """Index one sample document into a Berkeley DB-backed Lucene store.

        argv[1] is the DB environment home directory; an optional
        argv[2] == "-create" wipes any existing '__'-prefixed DB files
        and recreates the index.
        """

        if len(argv) < 2:
            print "Usage: BerkeleyDbIndexer <index dir> -create"
            return

        dbHome = argv[1]
        create = len(argv) > 2 and argv[2] == "-create"

        if not os.path.exists(dbHome):
            os.makedirs(dbHome)
        elif create:
            # Remove previous index/blocks DB files (named '__index__',
            # '__blocks__', ...) before recreating.
            for name in os.listdir(dbHome):
                if name.startswith('__'):
                    os.remove(os.path.join(dbHome, name))

        env = DBEnv()
        # Keep the transaction log in memory rather than on disk.
        env.set_flags(DB_LOG_INMEMORY, 1);
        # 64 MB cache on Windows and Linux; other platforms keep defaults.
        if os.name == 'nt':
            env.set_cachesize(0, 0x4000000, 1)
        elif os.name == 'posix':
            from commands import getstatusoutput
            if getstatusoutput('uname') == (0, 'Linux'):
                env.set_cachesize(0, 0x4000000, 1)

        env.open(dbHome, (DB_CREATE | DB_THREAD |
                          DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN), 0)

        index = DB(env)
        blocks = DB(env)
        txn = None
        
        # First transaction: open (and create if needed) the two B-tree
        # databases. On any failure the transaction is aborted and the
        # exception re-raised; on success it is committed.
        try:
            txn = env.txn_begin(None)
            index.open(filename = '__index__', dbtype = DB_BTREE,
                       flags = DB_CREATE | DB_THREAD, txn = txn)
            blocks.open(filename = '__blocks__', dbtype = DB_BTREE,
                        flags = DB_CREATE | DB_THREAD, txn = txn)
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            txn = None

        # Second transaction: write one document through a DbDirectory.
        # Same abort-on-failure / commit-on-success discipline; note the
        # writer itself is committed and closed before the DB transaction
        # commits, then the databases and environment are closed.
        try:
            txn = env.txn_begin(None)
            directory = DbDirectory(txn, index, blocks, 0)
            writer = IndexWriter(directory, StandardAnalyzer(), create,
                                 IndexWriter.MaxFieldLength.UNLIMITED)
            writer.setUseCompoundFile(False)

            doc = Document()
            doc.add(Field("contents", "The quick brown fox...",
                          Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)

            writer.commit()
            writer.close()
        except:
            if txn is not None:
                txn.abort()
                txn = None
            raise
        else:
            txn.commit()
            index.close()
            blocks.close()
            env.close()

        print "Indexing Complete"
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:75,代码来源:BerkeleyDbIndexer.py

示例14: initVM

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
# Demo script: index two sentences into an in-memory directory with
# position/offset term vectors, then read the vectors back.
if __name__ == '__main__':
    initVM()

# NOTE(review): everything below runs at import time as well, not only
# under __main__ — presumably intentional for this standalone demo.
directory = RAMDirectory()
iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                      True, IndexWriter.MaxFieldLength.LIMITED)
ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be index"]
for t in ts:
    doc = Document()
    # Store the text and record term positions + character offsets so the
    # term vector can be inspected after indexing.
    doc.add(Field("fieldname", t,
                  Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))
    iwriter.addDocument(doc)
iwriter.commit()
iwriter.close()

# Read back the term vector of document 0 as a TermPositionVector.
ireader = IndexReader.open(directory, True)
tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

for (t,f,i) in zip(tpv.getTerms(),tpv.getTermFrequencies(),xrange(100000)):
    print 'term %s' % t
    print '  freq: %i' % f
    try:
        print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
    except:
        print '  no pos'
    try:
        print '  off: ' + \
              str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
开发者ID:qiugen,项目名称:pylucene-trunk,代码行数:32,代码来源:TermPositionVector.py

示例15: index_plain_text_emails

# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import commit [as 别名]
def index_plain_text_emails(data_folder, 
                            path_index_file, store_dir, 
                            lemmatize = False, stem = False, 
                            nonascii = True):
    '''
    Indexes all the plain text emails in the input directory
    and stores the Lucene index in store_dir.

    Arguments:
        data_folder - input directory absolute path
        path_index_file - file paths index file; loaded if it exists,
                          otherwise built from data_folder and saved
        store_dir - index store directory absolute path; created if missing
        lemmatize - lemmatize email tokens before indexing (default False)
        stem - stem email tokens before indexing (default False)
        nonascii - allow non-ASCII characters in tokens (default True)
    Returns:
        None
    '''
    
    if not os.path.exists(store_dir): 
        os.mkdir(store_dir)
    
    # Reuse a previously saved file-paths index when available so repeated
    # runs see the same file ordering/ids; otherwise scan and persist it.
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
    
        logging.info('%d email documents found.' % len(file_tuples))
    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)
    
    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d' % (stem, lemmatize, len(file_tuples)))
        
    # Fresh index (create=True overwrites any existing one in store_dir).
    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED)
    
    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii  
    
    for ft in file_tuples: 
        # Each tuple is (numeric id, root directory, file name).
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)
        # parses the emails in plain text format 
        receiver, sender, cc, subject, message_text, bcc, date, email_text = parse_plain_text_email(file_path, 
                                                                                                    tokenize = True, 
                                                                                                    lemmatize = lemmatize, 
                                                                                                    stem = stem, 
                                                                                                    nonascii = nonascii)

        # One Lucene document per email, with each header in its own field.
        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        #Subodh-Rahul - Added BCC field in indexing.
        doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        #Subodh-Rahul - Added Email-Date field in indexing
        doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        
        # The body field is only added when non-empty; empty bodies are
        # logged as errors but the document is still indexed.
        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)
            
        # Adds all documents fields as a separate index so that we can search through them 
        doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)


    # Single commit after all documents are added.
    writer.commit()
    writer.close()

    logging.info('Lucene: All files are indexed.')


注:本文中的lucene.IndexWriter.commit方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。