本文整理汇总了Python中org.apache.lucene.index.IndexWriter.addDocument方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.addDocument方法的具体用法?Python IndexWriter.addDocument怎么用?Python IndexWriter.addDocument使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.index.IndexWriter的用法示例。
在下文中一共展示了IndexWriter.addDocument方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def index(indexdir):
    """Build a Lucene index from ``data/docid.documento-xml.txt``.

    Each input line is ``<doc id><TAB><xml fragment>``.  The text of the
    first TITLE, AUTHORS and ABSTRACT element of the fragment is stored and
    analyzed (missing or empty elements become ``""``); the id is stored
    without analysis.

    :param indexdir: path of the directory that receives the index.
    """
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(indexdir))
    writer_config = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(index_dir, writer_config)

    def first_text(xmldoc, tag):
        # Value of the first child node of the first <tag> element, or ""
        # when the element is absent or has no children (e.g. <TITLE/>).
        nodes = xmldoc.getElementsByTagName(tag)
        if len(nodes) == 0 or not nodes[0].childNodes:
            return ""
        return nodes[0].childNodes[0].nodeValue

    analyzed_fields = (
        ("title", "TITLE"),
        ("authors", "AUTHORS"),
        ("abstract", "ABSTRACT"),
    )
    indexed = 0
    try:
        # Stream the file instead of readlines() and make sure it is closed.
        with open('data/docid.documento-xml.txt') as f:
            for line in f:
                # Split on the first tab only: the XML payload may itself
                # contain tab characters.
                doc_id, xmltext = line.split('\t', 1)
                xmldoc = minidom.parseString(xmltext.rstrip('\n'))
                doc = Document()
                for field_name, tag in analyzed_fields:
                    doc.add(Field(field_name, first_text(xmldoc, tag),
                                  Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("id", doc_id,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                writer.addDocument(doc)
                indexed += 1
        # A separate counter keeps this valid for an empty input file.
        print("indexed %s docs" % indexed)
    finally:
        # Always release the index write lock, even if a line fails.
        writer.close()
示例2: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def create_index(storage, paths):
    """Index overlapping sentence windows of every file under ``paths``.

    Each file is split into sentences with ``sent_tokenize``; for every
    third sentence index ``i`` the sentences ``[max(i - 5, 0), i + 5)`` are
    joined with spaces and indexed as one document in the ``"text"`` field.

    :param storage: path of the directory that receives the index.
    :param paths: iterable of directory paths whose files are indexed.
    """
    import os

    lucene.initVM()
    index_dir = SimpleFSDirectory(File(storage))
    # ``stopwords`` is a module-level collection defined outside this block.
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writer_config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(index_dir, writer_config)
    try:
        print("%d docs in index" % writer.numDocs())
        print("Reading Documents")
        for path in paths:
            for filen in os.listdir(path):
                # os.path.join works with or without a trailing separator
                # on ``path``; plain concatenation only worked with one.
                full_path = os.path.join(path, filen)
                text = sent_tokenize(get_data_from_file(full_path))
                for i in range(0, len(text), 3):
                    start = max(i - 5, 0)
                    sentence = ' '.join(text[start:i + 5])
                    doc = Document()
                    doc.add(Field("text", sentence,
                                  Field.Store.YES, Field.Index.ANALYZED))
                    writer.addDocument(doc)
                print("Done %s" % full_path)
                print("Indexed (%d docs in index)" % (writer.numDocs()))
        print("Closing index of %d docs..." % writer.numDocs())
    finally:
        # Release the index write lock even when a file fails to process.
        writer.close()
示例3: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
class LuceneIndexer:
    """Write rows of typed fields into a Lucene index on disk."""

    def __init__(self, path_to_save):
        """Start the JVM and open an ``IndexWriter`` on ``path_to_save``.

        The writer is configured with the whitespace analyzer
        (``self.analyzer2``); ``self.analyzer`` is created but not passed
        to the writer.
        """
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(
            Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        """Add one document built from ``fields``.

        :param fields: sequence of raw field values.
        :param header: sequence of ``(field name, field class)`` pairs,
            matched to ``fields`` by position.
        :param id_: identifier used only in the skip message.

        Rows with more values than header entries are reported on stderr
        and skipped.
        """
        document = Document()
        if len(fields) > len(header):
            sys.stderr.write(
                'SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for raw in fields:
                sys.stderr.write('%s\n' % raw)
            return
        # The guard above ensures every value has a header entry.
        for value, (name, field_cls) in zip(fields, header):
            if field_cls is IntField:
                value = int(value)
            document.add(field_cls(name, value, Field.Store.YES))
        self.writer.addDocument(document)
        self.num_docs += 1

    def close(self):
        """Print the final counts and close the underlying writer."""
        summary = 'Indexed %d lines from stdin (%d docs in index)' % (
            self.num_docs, self.writer.numDocs())
        print(summary)
        self.writer.close()
示例4: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def create_index():
    """Rebuild the index in ``prm.index_folder`` from the wiki text dump.

    Any existing index folder is deleted first.  Every item yielded by
    ``wiki.Wiki(prm.pages_path).get_text_iter()`` becomes one document with
    a ``"text"`` field and a zero-based sequential ``"id"`` field.
    """
    lucene.initVM()

    # Start from an empty folder.
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    writer = IndexWriter(
        SimpleFSDirectory(File(prm.index_folder)),
        IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer()),
    )
    source = wiki.Wiki(prm.pages_path)

    print("%d docs in index" % writer.numDocs())
    print("Reading files from wikipedia...")

    total = 0
    for total, article in enumerate(source.get_text_iter(), 1):
        entry = Document()
        entry.add(Field("text", article,
                        Field.Store.YES, Field.Index.ANALYZED))
        # ``total`` counts from 1; stored ids start at 0.
        entry.add(Field("id", str(total - 1),
                        Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(entry)
        if total % 100000 == 0:
            print('%s %s' % ('indexing article', total))

    print("Indexed %d docs from wikipedia (%d docs in index)"
          % (total, writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
示例5: wikipedia_indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def wikipedia_indexer(storage, wikipedia_file):
    """Index a tab-separated ``title<TAB>text`` Wikipedia dump.

    Lines whose title contains ``'disambigu'`` or that have no text column
    are skipped.  Each remaining line becomes a document with a stored,
    unindexed ``"num"`` (the line index), a stored and analyzed ``"title"``
    and an analyzed, unstored ``"text"``.

    :param storage: path of the directory that receives the index.
    :param wikipedia_file: path of the UTF-8 encoded dump file.
    """
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(storage))
    # ``stopwords`` is a module-level collection defined outside this block.
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writer_config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(index_dir, writer_config)
    try:
        print("%d docs in index" % writer.numDocs())
        print("Reading Documents")
        # The context manager closes the dump file; it used to be leaked.
        with open(wikipedia_file) as f:
            for i, line in enumerate(f):
                columns = line.strip().decode('utf-8').split('\t')
                title = columns[0]
                if 'disambigu' in title or len(columns) < 2:
                    continue
                body = columns[1]
                doc = Document()
                doc.add(Field("num", str(i),
                              Field.Store.YES, Field.Index.NO))
                doc.add(Field("title", title,
                              Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("text", body,
                              Field.Store.NO, Field.Index.ANALYZED))
                writer.addDocument(doc)
                if writer.numDocs() % 1000 == 0:
                    print("Indexed (%d docs in index) Last %d"
                          % (writer.numDocs(), i))
        print("Closing index of %d docs..." % writer.numDocs())
    finally:
        # Release the index write lock even when a line fails to process.
        writer.close()
示例6: index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def index(self, data):
    """Add a single record to the index and commit it.

    :param data: mapping with a ``'fields'`` and a ``'record'`` entry; both
        are handed to ``self.buildDocument`` to create the Lucene document.
    """
    writer = IndexWriter(self.d, self.conf)
    try:
        doc = self.buildDocument(data['fields'], data['record'])
        writer.addDocument(doc)
        writer.commit()
    finally:
        # Close on every path: a writer left open keeps the directory's
        # write lock, so later calls could not open a new writer.
        writer.close()
示例7: rebuildIndex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def rebuildIndex(self, data):
    """Recreate the index from scratch with all records in ``data``.

    :param data: mapping with a ``'fields'`` entry and a ``'records'``
        iterable; each record is turned into a document by
        ``self.buildDocument(data['fields'], record)``.
    """
    # NOTE(review): setOpenMode is invoked on the shared ``self.conf``; if
    # it changes that object in place, writers created later from
    # ``self.conf`` would also open in CREATE mode -- confirm.
    writer = IndexWriter(
        self.d, self.conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE))
    try:
        for record in data['records']:
            writer.addDocument(self.buildDocument(data['fields'], record))
        writer.commit()
    finally:
        # Close on every path so a failing record does not leave the
        # directory's write lock held.
        writer.close()
示例8: Indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
class Indexer(object):
    """Wrapper around a Lucene ``IndexWriter`` opened in CREATE mode."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
            (defaults to ``"index"``)
        :param analyzer: The overloaded analyzer to work with
            (defaults to a ``StandardAnalyzer``)
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            # makedirs also handles nested output paths such as "a/b/index".
            os.makedirs(self.output)

        # Only build the default analyzer when none was supplied.
        analyzer = kwargs.get("analyzer")
        if analyzer is None:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        Failures are logged and not propagated.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.

        Closes the writer; failures are logged and not propagated.
        """
        try:
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.

        ``field_clean`` is indexed, stored and not tokenized;
        ``field_dirty`` is indexed, tokenized and not stored.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例9: indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def indexer(docNumber, docText):
    """Add one document to the index stored under ``index/``.

    :param docNumber: value for the stored, analyzed ``"docNumber"`` field.
    :param docText: value for the stored, analyzed ``"docText"`` field.
    """
    lucene.initVM()
    index_writer = IndexWriter(
        SimpleFSDirectory(File("index/")),
        IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer()),
    )

    entry = Document()
    for field_name, value in (("docNumber", docNumber),
                              ("docText", docText)):
        entry.add(Field(field_name, value,
                        Field.Store.YES, Field.Index.ANALYZED))
    index_writer.addDocument(entry)

    print("Closing index of %d docs..." % index_writer.numDocs())
    index_writer.close()
示例10: WikiPageIndex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
class WikiPageIndex():
    """Lucene index of wiki pages with ``"Title"`` and ``"Text"`` fields.

    The writer configuration uses ``OpenMode.CREATE``; a searcher over the
    same directory is kept in ``self.searcher``.
    """

    def __init__(self, index_dir):
        """Bind the index to ``index_dir`` and open a searcher if possible.

        :param index_dir: filesystem path of the Lucene index directory.
        """
        # NOTE(review): lucene.initVM() is not called here (the original
        # call was commented out) -- presumably the caller starts the JVM.
        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        # Opening a reader on a directory without an index raises, which
        # made it impossible to construct this object before the first
        # createIndex(); only open the searcher when an index is present.
        self.searcher = None
        if DirectoryReader.indexExists(self.directory):
            self._open_searcher()

    def _open_searcher(self):
        """(Re)open ``self.searcher`` on the current state of the index."""
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        """Open the index writer, creating the index directory if needed."""
        # Ensure the directory exists before the writer is created.
        if not os.path.exists(self.index_dir):
            os.makedirs(self.index_dir)
        self.writer = IndexWriter(self.directory, self.config)

    def addDocumentToIndex(self, title, text):
        """Add one page; both fields are stored and analyzed.

        :param title: page title, stored in the ``"Title"`` field.
        :param text: page body, stored in the ``"Text"`` field.
        """
        doc = Document()
        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))
        self.writer.addDocument(doc)

    def closeIndex(self):
        """Commit and close the writer, then refresh the searcher."""
        try:
            self.writer.commit()
        finally:
            self.writer.close()
        # A reader opened earlier does not see the documents just written.
        self._open_searcher()

    def searchIndex(self, queryString, field="Text", max_results=100):
        """Run ``queryString`` against ``field`` and return matching docs.

        :param queryString: query in Lucene query-parser syntax.
        :param field: default field for the query parser.
        :param max_results: maximum number of hits to retrieve.
        :return: list of the matching Lucene documents.
        """
        if self.searcher is None:
            self._open_searcher()
        query = QueryParser(
            Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(
            len(scoreDocs), queryString))
        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))
            docs.append(doc)
        return docs

    @staticmethod
    def cleanWikiText(text):
        """Strip wiki markup and punctuation from ``text``.

        Non-ASCII characters are dropped; ``[[...]]``, ``{{...}}`` and
        ``{|...|}`` spans are removed; every run of characters other than
        letters, digits, ``_``, ``-`` and newlines becomes one space;
        newline runs with surrounding blanks collapse to a single newline.

        :param text: unicode text to clean.
        :return: the cleaned, stripped text.
        """
        # Decode back to text so the str regex patterns below also work on
        # Python 3, where encode() returns bytes.
        text = text.encode('ascii', 'ignore').decode('ascii')
        text = re.sub(r'(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub(r'[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub(r'([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
示例11: dummyIndex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def dummyIndex(self):
    """
    Write a placeholder index holding a single dummy document, so that
    later updates have an existing index to work on.

    The writer is opened in ``OpenMode.CREATE`` on ``self.indexDir``.
    """
    cfg = IndexWriterConfig(self.analyzer)
    cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(self.indexDir, cfg)

    placeholder = Document()
    placeholder.add(Field('uid', 'dummy', StringField.TYPE_STORED))

    index_writer.addDocument(placeholder)
    index_writer.commit()
    index_writer.close()
示例12: buildIndex
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def buildIndex(self, inputFile):
    """Build the concept index in ``self.luceneDir`` from ``inputFile``.

    Every input line is expected to hold a quoted ``"cui","concept"`` pair.
    For each line the concept string is stored together with its
    normalized, word-sorted, stemmed and stemmed-sorted variants (computed
    by the ``normalizeCasePunct``/``sortWords``/``stemWords`` methods).
    Lines that fail are reported and skipped.  The writer commits every
    1000 documents and once more at the end.

    :param inputFile: path of the text file to read.
    """
    analyzer = self.getAnalyzer()
    iwconf = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    iwconf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(SimpleFSDirectory(File(self.luceneDir)), iwconf)

    def stored_keyword(name, value):
        # Stored field indexed as a single, un-analyzed term.
        return Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED)

    counter = 0
    try:
        # read through input file and write out to lucene
        with open(inputFile, 'r') as lines:
            for linesRead, line in enumerate(lines, 1):
                try:
                    if linesRead % 1000 == 0:
                        print("%d lines read" % linesRead)
                    # Turn `"cui","concept"` into `cui<TAB>concept`.
                    cui, concept = (line.replace('","', '\t')
                                        .replace('"', '')
                                        .split('\t'))
                    concept = concept.strip()
                    cui = cui.strip()
                    strNorm = self.normalizeCasePunct(concept)
                    strSorted = self.sortWords(strNorm)
                    strStemmed = self.stemWords(strNorm)
                    strStemmedSorted = self.stemWords(strSorted)
                    fdoc = Document()
                    counter += 1
                    fid = counter
                    fdoc.add(stored_keyword("id", unicode(fid)))
                    fdoc.add(stored_keyword("cui", cui))
                    fdoc.add(stored_keyword("str", concept))
                    fdoc.add(stored_keyword("str_norm", strNorm))
                    fdoc.add(stored_keyword("str_sorted", strSorted))
                    fdoc.add(stored_keyword("str_stemmed", strStemmed))
                    fdoc.add(stored_keyword("str_stemmedSorted",
                                            strStemmedSorted))
                    writer.addDocument(fdoc)
                    if fid % 1000 == 0:
                        writer.commit()
                except Exception:
                    # Best effort: report the bad line and continue.  The
                    # original built this message but never printed it, and
                    # its bare ``except`` also swallowed KeyboardInterrupt.
                    print("Skipping line: %s" % line)
        writer.commit()
    finally:
        # Release the index write lock even if reading the file fails.
        writer.close()
示例13: index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def index(analyzer, index_dest_dir, documents):
    """ Builds Lucene index from provided documents using given analyzer

    :param analyzer: Lucene analyzer passed to the ``IndexWriterConfig``
    :param index_dest_dir: path of the directory that receives the index
    :param list[Document] documents: documents to add; any iterable works
    :return: None
    :raises TypeError: if any item is not a ``Document``
    """
    # Materialize once so generators can be both validated and indexed.
    documents = list(documents)
    for candidate in documents:
        if not isinstance(candidate, Document):
            # Report the offending item, not blindly the first one.
            raise TypeError(
                "documents should be iterable of type Document! Given: %s"
                % type(candidate))

    writer_config = IndexWriterConfig(Version.LUCENE_30, analyzer)
    writer = IndexWriter(SimpleFSDirectory(File(index_dest_dir)),
                         writer_config)
    try:
        for doc in documents:
            writer.addDocument(doc)
    finally:
        # Release the index write lock even if adding a document fails.
        writer.close()
示例14: xmlrpc_indexDocument
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def xmlrpc_indexDocument(self, instance, id, text):
    """Index a new document.

    Any document already indexed under ``(instance, id)`` is removed first
    via ``xmlrpc_unindexDocument``.

    :param instance: instance name, stored as an untokenized field.
    :param id: document id, stored as an untokenized field.
    :param text: document text, stored as a tokenized field.
    :return: always ``1``.
    """
    self.xmlrpc_unindexDocument(instance, id)

    # Create a document and add the fields to it.
    doc = Document()
    doc.add(Field('id', id, Field.Store.YES, Field.Index.UN_TOKENIZED))
    doc.add(Field('text', text, Field.Store.YES, Field.Index.TOKENIZED))
    doc.add(Field('instance', instance,
                  Field.Store.YES, Field.Index.UN_TOKENIZED))

    # Write the document into the index.
    # NOTE(review): the third argument (0) is presumably the legacy
    # ``create`` flag, i.e. append to the existing index -- confirm.
    writer = IndexWriter(self.indexPath, self.analyzer, 0)
    try:
        writer.addDocument(doc)
        writer.optimize()
    finally:
        # Close on every path: a failed add/optimize must not leave the
        # write lock held, or later inserts could not open a writer.
        writer.close()

    log('Insert: Instance: %s Document: %s' % (instance, id))
    return 1
示例15: index_wiki
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import addDocument [as 别名]
def index_wiki(wiki_xmlfile, index_directory_name):
    """Index every entry yielded by ``wikicorpusxml(wiki_xmlfile)``.

    For each entry, the text between the first ``>`` and the following
    ``<`` is indexed as ``"contents"``; the values of the ``title="..."``
    and ``url="..."`` attributes go to ``"title"`` and ``"url"``.  All
    three fields are indexed and stored; only the url is not tokenized.

    :param wiki_xmlfile: input handed to ``wikicorpusxml``.
    :param index_directory_name: path of the directory for the index.
    """
    lucene.initVM()

    # Index directory, analyzer and writer (CREATE mode).
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    def make_field_type(tokenized):
        # Indexed + stored type with positions; only tokenization varies.
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return field_type

    content_fieldtype = make_field_type(True)
    title_fieldtype = make_field_type(True)
    url_fieldtype = make_field_type(False)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        after_open_tag = xmldoc.partition('>')[2]
        content = after_open_tag.partition('<')[0].strip()
        after_title_attr = xmldoc.partition(' title="')[2]
        title = after_title_attr.partition('"')[0].strip()
        after_url_attr = xmldoc.partition(' url="')[2]
        url = after_url_attr.partition('"')[0].strip()

        entry = Document()
        entry.add(Field("contents", content, content_fieldtype))
        entry.add(Field("title", title, title_fieldtype))
        entry.add(Field("url", url, url_fieldtype))
        writer.addDocument(entry)

    writer.commit()
    writer.close()