本文整理汇总了Python中lucene.IndexWriter.addDocument方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.addDocument方法的具体用法?Python IndexWriter.addDocument怎么用?Python IndexWriter.addDocument使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lucene.IndexWriter的用法示例。
在下文中一共展示了IndexWriter.addDocument方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: setUp
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def setUp(self):
    """Build a two-document in-memory index and one SpanTermQuery per test word."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # Two sample sentences, both indexed under field "f".
    for sentence in ("the quick brown fox jumps over the lazy dog",
                     "the quick red fox jumps over the sleepy cat"):
        doc = Document()
        doc.add(Field("f", sentence, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    # Expose one span query per word as an attribute (self.quick, self.brown, ...).
    for word in ("quick", "brown", "red", "fox", "lazy", "sleepy", "dog", "cat"):
        setattr(self, word, SpanTermQuery(Term("f", word)))
示例2: index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def index(self,path_to_index,path_files):
'indexes anchor texts from a given folder'
#lucene.initVM()
indexDir = path_to_index
directory_index = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(Version.LUCENE_35)
writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
listOfPathes = []
listOfPathes.extend(glob.glob(path_files+"*.txt"))
counter = 0
for path_to_file in listOfPathes:
print path_to_file
f = open(path_to_file,"r")
for line in f:
entry = line.split("\t")
counter+=1
"""
optimizes index after a certain amount of added documents
"""
if counter%500000==0:
print counter
writer.optimize()
doc = Document()
doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.optimize()
f.close()
writer.close()
print counter
print "done"
示例3: do_index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def do_index():
initVM()
indexDir = "/home/william/woyaoo/luceneindex"
version = Version.LUCENE_CURRENT
standardAnalyzer = StandardAnalyzer(version)
# chineseAnalyzer = CJKAnalyzer(version)
engine = data.engine_from_config("indexdb.config")
# engine = data.engine_from_config()
db = data.init_datafactory(engine)
docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
print len(docs)
idxDir = SimpleFSDirectory(File(indexDir))
perIndexCount = 5000
writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))
# add field
for doc in docs:
# print repr(doc.description)
lucenedoc = Document()
descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
# descriptionValue ='中国 abc'
print repr(descriptionValue)
lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
# lucenedoc.add(Field('description', doc.description, Field.Store.YES, Field.Index.ANALYZED))
lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(lucenedoc)
writer.optimize()
writer.close()
print "index finished"
示例4: index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def index(source, indexName):
if(not os.path.exists(indexName)):
os.mkdir(indexName)
indexDir = File(indexName)
writer = IndexWriter(SimpleFSDirectory(File(indexName)),StandardAnalyzer(Version.LUCENE_CURRENT), True,IndexWriter.MaxFieldLength.LIMITED)
p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
res = p.findall(source)
i = 0
for pair in res:
i += 1
doc = Document()
doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
for t in pair[1].split():
doc.add(Field("content", t.replace("-","_"), Field.Store.NO, Field.Index.NOT_ANALYZED));
#doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc)
writer.close()
print str(i)+ " docs indexed"
示例5: configure_lucene
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def configure_lucene():
f = open('clique.txt','r')
lucene.initVM()
print 'Inside Function'
#indexDir = "/tmp/luceneindex"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
print >> sys.stderr, "Reading lines from sys.stdin..."
for line in f:
line = line.replace('\t','')
line = line.replace('\r','')
line = line.replace('\n','')
line = line.replace('^','')
line = line.strip()
doc = Document()
doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
writer.optimize()
print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
writer.close()
示例6: index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def index(self):
    """Parse every .xml file in DOC_DIR and index its <Article> elements.

    Stemmed title/abstract/content plus raw keyword/authors are stored,
    together with a combined catch-all "article" field.
    """
    lucene.initVM()
    store = SimpleFSDirectory(File(self.INDEX_DIR))
    writer = IndexWriter(store, StandardAnalyzer(Version.LUCENE_30),
                         True, IndexWriter.MaxFieldLength(512))
    for xml_path in glob.glob(os.path.join(self.DOC_DIR, '*.xml')):
        raw = codecs.open(xml_path, encoding='utf-8').read()
        parsed = pq(raw, parser='html')
        for article in parsed('Article').items():
            doc = Document()
            # '+' in the article id is replaced so the id survives indexing
            art_id = str(article.attr('articleid').encode('utf-8')).replace('+', '-')
            art_title = self.stem(str(article.attr('title').encode('utf-8')))
            art_abstract = self.stem(str(article.find('Abstract').html().encode('utf-8')))
            art_keyword = article.find('Keyword').html().encode('utf-8')
            art_content = self.stem(str(article.find('Content').html().encode('utf-8')))
            art_authors = article.find('Authors').html().encode('utf-8')
            # every field is stored and analyzed; "article" concatenates the
            # searchable text parts for single-field queries
            for field_name, value in (('id', art_id),
                                      ('title', art_title),
                                      ('abstract', art_abstract),
                                      ('keyword', art_keyword),
                                      ('content', art_content),
                                      ('authors', art_authors),
                                      ('article', art_title + art_abstract + art_keyword + art_content)):
                doc.add(Field(field_name, value, Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
    writer.optimize()
    writer.close()
示例7: main
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def main(cls, argv):
if len(argv) < 5:
print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
return
docsInIndex = int(argv[1])
# create an index called 'index-dir' in a temp directory
indexDir = os.path.join(tempfile.gettempdir(),
'index-dir')
dir = FSDirectory.open(indexDir,)
analyzer = SimpleAnalyzer()
writer = IndexWriter(dir, analyzer, True)
# set variables that affect speed of indexing
writer.setMergeFactor(int(argv[2]))
writer.setMaxMergeDocs(int(argv[3]))
writer.setMaxBufferedDocs(int(argv[4]))
# writer.infoStream = tempfile.out
print "Merge factor: ", writer.getMergeFactor()
print "Max merge docs:", writer.getMaxMergeDocs()
print "Max buffered docs:", writer.getMaxBufferedDocs()
start = time()
for i in xrange(docsInIndex):
doc = Document()
doc.add(Field("fieldname", "Bibamus",
Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()
print "Time: ", timedelta(seconds=time() - start)
示例8: update_index_withLineArray
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def update_index_withLineArray(self,array):
"""
Parsed sentences (given in an array) are added to the index, with the corresponding two entities (x,y) and the DBpedia URI
"""
print "start adding sentences"
writer = IndexWriter(index_directory, analyzer, False, IndexWriter.MaxFieldLength(512))
for item in array:
line = item[0]
x = item[1]
y = item[2]
uri = item[3]
line=line.replace("\t"," ")
line = line.replace("\n"," ")
line = line.replace(" "," ")
try:
doc = Document()
doc.add(Field("Sentence", line, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("X", x, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("Y", y, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("URI", uri, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
except Exception:
print "Unexpected error:", sys.exc_info()[0]
raw_input("Error in updating the Sentences")
try:
writer.optimize()
except:
print "Unexpected error:", sys.exc_info()[0]
print ("could not optimize index")
writer.close()
print "all sentences added"
示例9: setUp
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def setUp(self):
    """Split an alphabet of animal names across two RAM indexes (a-m and n-z)
    and open one searcher over each."""
    animals = ["aardvark", "beaver", "coati",
               "dog", "elephant", "frog", "gila monster",
               "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca",
               "python", "quokka", "rat", "scorpion",
               "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]
    analyzer = WhitespaceAnalyzer()
    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()
    first_half = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    second_half = IndexWriter(nTOzDirectory, analyzer, True,
                              IndexWriter.MaxFieldLength.UNLIMITED)
    for name in animals:
        doc = Document()
        doc.add(Field("animal", name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # route by first letter: before 'n' goes to the a-m index
        target = first_half if name[0].lower() < "n" else second_half
        target.addDocument(doc)
    first_half.close()
    second_half.close()
    self.searchers = [IndexSearcher(aTOmDirectory),
                      IndexSearcher(nTOzDirectory)]
示例10: addDocuments
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def addDocuments(self, dir, isCompound):
    """Write one four-field document per word in self.docs into *dir*.

    *isCompound* toggles Lucene's compound-file format for the segment files.
    """
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # (name, store, index) spec for each field every document carries
    field_specs = [
        ("keyword", Field.Store.YES, Field.Index.NOT_ANALYZED),
        ("unindexed", Field.Store.YES, Field.Index.NO),
        ("unstored", Field.Store.NO, Field.Index.ANALYZED),
        ("text", Field.Store.YES, Field.Index.ANALYZED),
    ]
    for word in self.docs:
        doc = Document()
        for name, store, index_mode in field_specs:
            doc.add(Field(name, word, store, index_mode))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
示例11: index
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def index(string):
    """Append *string* as one analyzed document to the on-disk index,
    creating the index first when opening in append mode fails."""
    lucene.initVM()
    store = SimpleFSDirectory(File("REMOVEME.index-dir"))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # append to an existing index when one is already present
        writer = IndexWriter(store, analyzer, False, IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # the index does not exist yet: create it
        writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
示例12: addContents
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def addContents(self,contents):
try:
#iwconfig = IndexWriterConfig(SimpleAnalyzer(),IndexWriter.MaxFieldLength.LIMITED)
writer = IndexWriter(self.ramIndex,SimpleAnalyzer(Version.LUCENE_CURRENT),True,IndexWriter.MaxFieldLength.LIMITED)
for content in contents:
doc = Document()
doc.add(Field("contents",content[1],Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES))
writer.addDocument(doc)
writer.close()
except Exception,e:
print 'Unable to add content to RAM index'
示例13: indexSingleFieldDocs
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def indexSingleFieldDocs(self, fields):
    """Add one single-field document per entry in *fields*, then commit."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single_field in fields:
        document = Document()
        document.add(single_field)
        writer.addDocument(document)
    writer.commit()
    writer.close()
示例14: setUp
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def setUp(self):
    """Index one sample sentence with the Porter analyzer for the stem tests."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("contents",
                     "The quick brown fox jumps over the lazy dogs",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
示例15: setUp
# 需要导入模块: from lucene import IndexWriter [as 别名]
# 或者: from lucene.IndexWriter import addDocument [as 别名]
def setUp(self):
    """Create a one-document RAM index and open a searcher over it."""
    store = RAMDirectory()
    writer = IndexWriter(store, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("field", "the quick brown fox jumped over the lazy dog",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
    self.searcher = IndexSearcher(store)