本文整理汇总了Python中org.apache.lucene.index.IndexWriter.close方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.close方法的具体用法?Python IndexWriter.close怎么用?Python IndexWriter.close使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.index.IndexWriter
的用法示例。
在下文中一共展示了IndexWriter.close方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: wikipedia_indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def wikipedia_indexer(storage, wikipedia_file) :
lucene.initVM()
indexDir = SimpleFSDirectory(File(storage))
stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
for s in stopwords :
stops.add(s)
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
writer = IndexWriter(indexDir, writerConfig)
print "%d docs in index" % writer.numDocs()
print "Reading Documents"
f = open(wikipedia_file)
for i, line in enumerate(f) :
text = line.strip().decode('utf-8').split('\t')
title = text[0]
if 'disambigu' in text[0] or len(text) < 2:
continue
text = text[1]
doc = Document()
doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
writer.addDocument(doc)
if writer.numDocs() % 1000 == 0 :
print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
示例2: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def create_index(storage, paths) :
lucene.initVM()
indexDir = SimpleFSDirectory(File(storage))
stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
for s in stopwords :
stops.add(s)
analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
writer = IndexWriter(indexDir, writerConfig)
print "%d docs in index" % writer.numDocs()
print "Reading Documents"
import os
for path in paths :
for filen in os.listdir(path) :
text = sent_tokenize(get_data_from_file(path + filen))
total_sent = len(text)
for i in range(0, total_sent, 3) :
doc = Document()
a = i-5 if i-5 > 0 else 0
sentence = ' '.join(text[a:i+5])
doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print("Done %s" % (path+filen))
print "Indexed (%d docs in index)" % (writer.numDocs())
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
示例3: index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def index(indexdir):
lucene.initVM()
indexDir = SimpleFSDirectory(File(indexdir))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
f = open('data/docid.documento-xml.txt')
st = PorterStemmer()
for i, line in enumerate(f.readlines()):
id, xmltext = line.split('\t')
xmltext = xmltext.rstrip('\n')
xmldoc = minidom.parseString(xmltext)
title = xmldoc.getElementsByTagName("TITLE")
title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
authors = xmldoc.getElementsByTagName("AUTHORS")
authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
abstract = xmldoc.getElementsByTagName("ABSTRACT")
abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue
doc = Document()
doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
writer.addDocument(doc)
print "indexed %s docs" % (i+1)
writer.close()
示例4: build_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def build_index():
lucene.initVM()
# post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
post_dir = '/Users/w3/data/github/codeif_backup'
index_store_dir = current_app.config['INDEX_STORE_DIR']
print post_dir
print index_store_dir
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
store = SimpleFSDirectory(File(index_store_dir))
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
indexDocs(post_dir, writer)
ticker = Ticker()
print 'commit index',
threading.Thread(target=ticker.run).start()
writer.commit()
writer.close()
ticker.tick = False
print 'done'
示例5: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def create_index():
lucene.initVM()
if os.path.exists(prm.index_folder):
shutil.rmtree(prm.index_folder)
indexDir = SimpleFSDirectory(File(prm.index_folder))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
wk = wiki.Wiki(prm.pages_path)
print "%d docs in index" % writer.numDocs()
print "Reading files from wikipedia..."
n = 0
for l in wk.get_text_iter():
doc = Document()
doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
n += 1
if n % 100000 == 0:
print 'indexing article', n
print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
示例6: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def __init__(self,root,storeDir,analyzer):
# Create the index dir if it does not exist
if not os.path.exists(storeDir):
os.mkdir(storeDir)
# the SimpleFSDirectory which the index will be written in
store = SimpleFSDirectory(File(storeDir))
analyzer = LimitTokenCountAnalyzer(analyzer,1048576)
config = IndexWriterConfig(Version.LUCENE_CURRENT,analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
# create a index writer
# atach the index dir and config info to it
writer = IndexWriter(store,config)
# call the indexing procedure
# indexing all the files in the directory specified by root
# write the index with writer
self.indexDocs(root,writer)
# start a ticker
ticker = Ticker()
print 'commit index'
threading.Thread(target=ticker.run).start()
writer.commit()
writer.close()
# stop the ticker when the indexing procedure completes
ticker.tick = False
print 'Done'
示例7: reindex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def reindex(self):
    """Re-analyze every document in the corpus with its current analyzer."""
    corpus = self.corpus
    directory = SimpleFSDirectory(File(corpus.path))
    # Open for append (create=False) with the legacy field-length limit.
    writer = IndexWriter(directory, corpus.analyzer, False, IndexWriter.MaxFieldLength.LIMITED)
    indexutils.reindex_all(self.reader, writer, corpus.analyzer)
    writer.optimize()
    writer.close()
    # Report success back to the parent channel.
    self.parent.write({'message': "Reindex successful. Corpus analyzer is now set to %s." % (corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})
示例8: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
class LuceneIndexer:
def __init__(self, path_to_save):
self.path_to_save = path_to_save
self.num_docs = 0
lucene.initVM()
self.indexDir = SimpleFSDirectory(File(self.path_to_save))
self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
self.writer = IndexWriter(self.indexDir, self.writerConfig)
def add_document(self, fields, header, id_):
doc = Document()
if len(fields) > len(header):
sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
for field in fields:
sys.stderr.write('%s\n' % field)
return
for idx, field in enumerate(fields):
fname, fieldtype = header[idx]
if fieldtype is IntField:
field = int(field)
doc.add(fieldtype(fname, field, Field.Store.YES))
self.writer.addDocument(doc)
self.num_docs += 1
def close(self):
print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
self.writer.close()
示例9: removeindex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def removeindex(self, data):
    """Delete the document whose _id matches data['record'] from the index."""
    record_id = data['record']['_id']
    writer = IndexWriter(self.d, self.conf)
    writer.deleteDocuments(lucene.Term("_id", record_id))
    writer.optimize()
    writer.close()
示例10: updateindex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def updateindex(self, data):
    """Replace the indexed document identified by data['record']['_id']."""
    record = data['record']
    # Build the replacement document first, then swap it in by _id.
    document = self.buildDocument(data['fields'], record)
    writer = IndexWriter(self.d, self.conf)
    writer.updateDocument(lucene.Term("_id", record['_id']), document)
    writer.optimize()
    writer.close()
示例11: deleteRec
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def deleteRec(self, pid):
    """Remove the document with uid == *pid*, then refresh the searcher."""
    cfg = IndexWriterConfig(self.analyzer)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    writer = IndexWriter(self.indexDir, cfg)
    writer.deleteDocuments(Term('uid', pid))
    writer.commit()
    writer.close()
    # Reopen the searcher so the deletion becomes visible to queries.
    self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
    return
示例12: index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def index(self, data):
    """Add one record to the index and commit immediately."""
    # Build the document before touching the writer.
    document = self.buildDocument(data['fields'], data['record'])
    writer = IndexWriter(self.d, self.conf)
    writer.addDocument(document)
    writer.commit()
    writer.close()
示例13: rebuildIndex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def rebuildIndex(self, data):
    """Recreate the whole index from data['records'] (existing index is wiped)."""
    # setOpenMode returns the config, so mutate then pass it on.
    self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(self.d, self.conf)
    field_spec = data['fields']
    for record in data['records']:
        writer.addDocument(self.buildDocument(field_spec, record))
    writer.commit()
    writer.close()
示例14: indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
def indexer(docNumber, docText):
lucene.initVM()
indexDir = SimpleFSDirectory(File("index/"))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
doc = Document()
doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
示例15: Indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import close [as 别名]
class Indexer(object):
def __init__(self, **kwargs):
    """Initialize a new instance of the Indexer.

    :param root: output directory of the underlying index (default "index")
    :param analyzer: the analyzer to tokenize with (default StandardAnalyzer)
    """
    self.output = kwargs.get("root", "index")
    if not os.path.exists(self.output):
        os.mkdir(self.output)
    # Wrap the chosen analyzer with a token-count cap of 1M per field.
    base_analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
    self.analyzer = LimitTokenCountAnalyzer(base_analyzer, 1048576)
    self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.store = SimpleFSDirectory(File(self.output))
    self.writer = IndexWriter(self.store, self.config)
    self.create_field_types()
def index(self, document):
    """Add *document* to the index; failures are logged, not raised.

    :param document: the Lucene Document to add to the index
    """
    try:
        self.writer.addDocument(document)
    except Exception:
        # Swallow and log so one bad document does not abort the batch.
        logger.exception("Failed to index the supplied document")
def shutdown(self):
    """Close the underlying writer; failures are logged, not raised."""
    try:
        # self.writer.optimize()  # left disabled, as in the original
        self.writer.close()
    except Exception:
        logger.exception("Failed to shutdown the indexer correctly")
def create_field_types(self):
    """ Create the field types that will be used to specify
    what actions lucene should take on the various fields
    supplied to index.
    """
    # field_clean: stored and indexed verbatim (not tokenized), with
    # docs+freqs only — suited to exact-match / identifier fields.
    self.field_clean = FieldType()
    self.field_clean.setIndexed(True)
    self.field_clean.setStored(True)
    self.field_clean.setTokenized(False)
    self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # field_dirty: tokenized full-text, not stored; positions are kept
    # so phrase queries can work against these fields.
    self.field_dirty = FieldType()
    self.field_dirty.setIndexed(True)
    self.field_dirty.setStored(False)
    self.field_dirty.setTokenized(True)
    self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)