本文整理汇总了Python中org.apache.lucene.index.IndexWriter类的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter类的具体用法?Python IndexWriter怎么用?Python IndexWriter使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了IndexWriter类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: build_index
def build_index():
    """Build (or overwrite) the Lucene index over all posts in the local repo.

    Reads the index location from the Flask app config and indexes every
    document under post_dir via indexDocs(). OpenMode.CREATE means any
    existing index is replaced.
    """
    lucene.initVM()
    # NOTE(review): the config-driven path is commented out and a developer's
    # local path is hard-coded instead -- restore the config lookup for deploys.
    # post_dir = current_app.config['LOCAL_REPO_PATH'] + '/_posts/'
    post_dir = '/Users/w3/data/github/codeif_backup'
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    print(post_dir)
    print(index_store_dir)

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    store = SimpleFSDirectory(File(index_store_dir))
    # Cap tokens per field so one huge post cannot exhaust memory.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    ticker = Ticker()
    try:
        indexDocs(post_dir, writer)
        import sys
        sys.stdout.write('commit index')  # no newline: ticker dots follow
        # Progress ticker prints while the (potentially slow) commit runs.
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
    finally:
        # Always stop the ticker thread, even if commit/close raised;
        # otherwise the daemon keeps printing forever.
        ticker.tick = False
    print('done')
示例2: index
def index(indexdir):
    """Index a TSV file of "<docid>\\t<xml>" records into a Lucene index.

    Each XML record contributes analyzed TITLE, AUTHORS and ABSTRACT fields
    plus the stored, non-analyzed document id.

    :param indexdir: filesystem path of the directory to write the index to
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    def _first_text(dom, tag):
        # Text of the first <tag> element, or "" when the tag is absent.
        nodes = dom.getElementsByTagName(tag)
        return "" if len(nodes) == 0 else nodes[0].childNodes[0].nodeValue

    count = 0
    # `with` closes the data file even on error; iterating the handle
    # streams lines instead of loading the whole file via readlines().
    with open('data/docid.documento-xml.txt') as f:
        for count, line in enumerate(f, start=1):
            doc_id, xmltext = line.split('\t')  # renamed: `id` shadows builtin
            xmldoc = minidom.parseString(xmltext.rstrip('\n'))
            doc = Document()
            doc.add(Field("title", _first_text(xmldoc, "TITLE"),
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("authors", _first_text(xmldoc, "AUTHORS"),
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("abstract", _first_text(xmldoc, "ABSTRACT"),
                          Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("id", doc_id, Field.Store.YES, Field.Index.NOT_ANALYZED))
            writer.addDocument(doc)
    print("indexed %s docs" % count)
    writer.close()
示例3: reindex
def reindex(self):
    """Rebuild the whole index with the corpus' current analyzer.

    Opens an IndexWriter in append mode (create=False), re-adds every
    document via indexutils.reindex_all, then optimizes and closes the
    index before reporting status back to the parent.
    """
    writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                         self.corpus.analyzer, False,
                         IndexWriter.MaxFieldLength.LIMITED)
    try:
        indexutils.reindex_all(self.reader, writer, self.corpus.analyzer)
        writer.optimize()
    finally:
        # Always release the index write lock, even when reindexing failed.
        writer.close()
    self.parent.write({'message': "Reindex successful. Corpus analyzer is now set to %s." % (self.corpus.analyzer_str,)})
    self.parent.write({'status': "Ready!"})
示例4: __init__
def __init__(self, root, storeDir, analyzer):
    """Create (overwriting) a Lucene index of every file under *root*.

    :param root: directory tree whose files are indexed by self.indexDocs()
    :param storeDir: directory holding the index (created if missing)
    :param analyzer: base analyzer, wrapped in a token-count limiter
    """
    # Create the index dir if it does not exist.
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    store = SimpleFSDirectory(File(storeDir))
    # Cap tokens per field so one huge document cannot exhaust memory.
    analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    ticker = Ticker()
    try:
        # Index every file in the directory specified by root.
        self.indexDocs(root, writer)
        print('commit index')
        # Ticker thread prints progress while the slow commit runs.
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
    finally:
        # Stop the ticker even when indexing/commit raised, so the thread
        # does not keep printing forever.
        ticker.tick = False
    print('Done')
示例5: __init__
class LuceneIndexer:
    """Thin wrapper around a Lucene IndexWriter for row-oriented input."""

    def __init__(self, path_to_save):
        """Open a whitespace-analyzed index at *path_to_save*.

        :param path_to_save: directory the index is written to
        """
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        # The whitespace analyzer is the one actually used for writing.
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        """Add one row as a document; skip (with a stderr note) malformed rows.

        :param fields: list of raw field values for this row
        :param header: list of (field_name, field_type) pairs, one per column
        :param id_: row identifier, used only in the skip diagnostic
        """
        doc = Document()
        if len(fields) > len(header):
            # More values than declared columns: refuse the row rather than
            # guess at the alignment.
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                # Integer columns must be converted before Lucene stores them.
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        """Report the document count and close the underlying writer."""
        print('Indexed %d lines from stdin (%d docs in index)'
              % (self.num_docs, self.writer.numDocs()))
        self.writer.close()
示例6: getLucene
def getLucene(path):
    """Open the index at *path* and return (writer, reader, searcher).

    The writer is configured to keep documents sorted by the numeric stamp
    field, so scans come back in stamp order.
    """
    writer_config = IndexWriterConfig(WhitespaceAnalyzer())
    stamp_sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG))
    writer_config.setIndexSort(stamp_sort)
    writer = IndexWriter(FSDirectory.open(Paths.get(path)), writer_config)
    nrt_reader = writer.getReader()
    return writer, nrt_reader, IndexSearcher(nrt_reader)
示例7: _getLucene
def _getLucene(self, path):
    """Return (writer, near-real-time reader, searcher) for the index at *path*.

    Tuned for indexing speed: a large RAM buffer and no compound files.
    """
    index_dir = FSDirectory.open(Paths.get(path))
    config = IndexWriterConfig(None)
    config.setRAMBufferSizeMB(256.0)   # bigger buffer => fewer flushes (faster)
    config.setUseCompoundFile(False)   # faster, for Lucene 4.4 and later
    writer = IndexWriter(index_dir, config)
    nrt_reader = writer.getReader()
    return writer, nrt_reader, IndexSearcher(nrt_reader)
示例8: wikipedia_indexer
def wikipedia_indexer(storage, wikipedia_file):
    """Index a one-article-per-line Wikipedia dump.

    Each input line is "<title>\\t<text>"; disambiguation pages and lines
    without a text column are skipped. Fields: num (stored only), title
    (stored + analyzed), text (analyzed only, not stored).

    :param storage: directory to write the index to
    :param wikipedia_file: path of the tab-separated dump
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Reading Documents")

    # `with` guarantees the dump file is closed even if indexing raises.
    with open(wikipedia_file) as f:
        for i, line in enumerate(f):
            parts = line.strip().decode('utf-8').split('\t')
            title = parts[0]
            # Skip disambiguation pages and malformed (text-less) lines.
            if 'disambigu' in parts[0] or len(parts) < 2:
                continue
            doc = Document()
            doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
            doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("text", parts[1], Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)
            if writer.numDocs() % 1000 == 0:
                print("Indexed (%d docs in index) Last %d" % (writer.numDocs(), i))

    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
示例9: Indexer
class Indexer(object):
    """Facade around a Lucene IndexWriter with two pre-built field types."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer.

        :param root: The output directory of the underlying index
            (default "index"; created if missing)
        :param analyzer: The overloaded analyzer to work with (default
            StandardAnalyzer; always wrapped in a token-count limiter)
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)
        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        # Cap tokens per field so one huge document cannot exhaust memory.
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        # CREATE: replace any existing index at self.output.
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        # "clean" fields: stored verbatim, not tokenized (exact-match lookups).
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        # "dirty" fields: tokenized for full-text search, not stored.
        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例10: make_index
def make_index(indexed_data, index_destination, source='directory'):
    """Index wiki articles (based on CK-12 topics) into *index_destination*.

    :param indexed_data: a directory of files or a dictionary of documents
    :param index_destination: filesystem path for the new index
    :param source: 'directory' to walk indexed_data as a directory tree;
        any other value treats it as a dictionary
    """
    # English Snowball stemming with the standard stop-word set.
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English", StandardAnalyzer.STOP_WORDS_SET)
    indexWriterConfig = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_destination)), indexWriterConfig)
    try:
        if source == 'directory':
            indexDirectory(indexed_data, writer)
        else:
            indexDictionary(indexed_data, writer)
    finally:
        # Release the index write lock even if indexing failed part-way.
        writer.close()
示例11: __init__
def __init__(self, root, storeDir, analyzer):
    """Build a fresh index of everything under *root* into *storeDir*."""
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    index_dir = SimpleFSDirectory(File(storeDir))
    writer_config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # overwrite any old index
    writer = IndexWriter(index_dir, writer_config)
    self.indexDocs(root, writer)
    writer.commit()
    writer.close()
示例12: import_csv_with_content
def import_csv_with_content(self, csv_file, content_field):
    """Import metadata plus a content column from *csv_file* into the index.

    Rows are merged into existing documents by
    addmetadata.add_metadata_and_content_from_csv; on success the number of
    changed rows is reported to the parent, and a friendly error is reported
    instead when the file is not valid UTF-8.

    :param csv_file: path of the CSV file to import
    :param content_field: name of the CSV column holding document content
    """
    writer = None  # so the except-branch can tell whether it was ever opened
    try:
        writer = IndexWriter(SimpleFSDirectory(File(self.corpus.path)),
                             self.analyzer, False,
                             IndexWriter.MaxFieldLength.LIMITED)
        changed_rows = addmetadata.add_metadata_and_content_from_csv(
            self.searcher, self.reader, writer, csv_file, content_field, self.args_dir)
        writer.close()
    except UnicodeDecodeError:
        # Best-effort cleanup: the writer may be mid-write; narrow the
        # former bare `except:` so real bugs are not silently swallowed.
        if writer is not None:
            try:
                writer.close()
            except Exception:
                pass
        self.parent.write({'error': 'CSV import failed: file contained non-unicode characters. Please save the file with UTF-8 encoding and try again!'})
        return
    self.parent.write({'message': "CSV import complete: %s rows added." % (changed_rows,)})
示例13: __init__
def __init__(self, indexPath):
    """Instantiate the handler object."""
    self.indexPath = indexPath
    self.analyzer = StopAnalyzer()
    # The index directory must exist before Lucene can open it.
    if not os.path.exists(self.indexPath):
        os.mkdir(self.indexPath)
    # A missing segments file means no index has ever been written here,
    # so bootstrap an empty one.
    segments_path = os.path.join(self.indexPath, 'segments.gen')
    if not os.path.exists(segments_path):
        log('Creating new index.')
        writer = IndexWriter(self.indexPath, self.analyzer, 1)
        writer.close()
示例14: create_index
def create_index(storage, paths, window=5, stride=3):
    """Index overlapping sentence windows from every file under *paths*.

    Every *stride* sentences an anchor is placed, and the passage
    text[max(i - window, 0) : i + window] is stored as one analyzed,
    stored "text" field.

    :param storage: directory the index is written to
    :param paths: iterable of directory paths (with trailing separator,
        since file names are appended by plain concatenation)
    :param window: sentences of context taken on each side of the anchor
    :param stride: step, in sentences, between successive passages
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Reading Documents")

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            for i in range(0, total_sent, stride):
                doc = Document()
                start = max(i - window, 0)
                sentence = ' '.join(text[start:i + window])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))
            print("Indexed (%d docs in index)" % (writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
示例15: create_index
def create_index():
    """(Re)build a Lucene index over the whole Wikipedia text dump.

    Deletes any existing index at prm.index_folder first, then stores each
    article as an analyzed "text" field plus a sequential "id" field.
    """
    lucene.initVM()
    # Start from scratch: drop any previous index directory.
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    wk = wiki.Wiki(prm.pages_path)

    print("%d docs in index" % writer.numDocs())
    print("Reading files from wikipedia...")

    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print('indexing article', n)
    print("Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()