本文整理汇总了Python中lucene.Document.add方法的典型用法代码示例。如果您正苦于以下问题:Python Document.add方法的具体用法?Python Document.add怎么用?Python Document.add使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类lucene.Document的用法示例。
在下文中一共展示了Document.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: indexfeeds
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def indexfeeds(self,writer):
"""
"""
feedlist=['http://today.reuters.com/rss/topNews',
'http://today.reuters.com/rss/domesticNews',
'http://today.reuters.com/rss/wordNews',
'http://rss.cnn.com/rss/edition.rss',
'http://rss.cnn.com/rss/edition_word.rss',
'http://rss.cnn.com/rss/edition_us.rss']
articletitles=[]
for feed in feedlist:
f=feedparser.parse(feed)
for e in f.entries:
if e.title in articletitles: continue
contents = e.title.encode('utf8') + self.strphtml(e.description.encode('utf8'))
try:
doc = Document()
doc.add(Field("name", e.title,
Field.Store.YES,
Field.Index.NOT_ANALYZED))
if len(contents) > 0:
doc.add(Field("contents", contents,
Field.Store.YES,
Field.Index.ANALYZED,
Field.TermVector.YES))
writer.addDocument(doc)
except Exception, e:
print 'Unable to index'
示例2: setUp
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def setUp(self):
    """Build two RAM indexes -- animals a-m in one, n-z in the other --
    and keep an IndexSearcher over each for multi-searcher tests."""
    animals = ["aardvark", "beaver", "coati",
               "dog", "elephant", "frog", "gila monster",
               "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca",
               "python", "quokka", "rat", "scorpion",
               "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]
    analyzer = WhitespaceAnalyzer()
    dir_a_to_m = RAMDirectory()
    dir_n_to_z = RAMDirectory()
    writer_a_to_m = IndexWriter(dir_a_to_m, analyzer, True,
                                IndexWriter.MaxFieldLength.UNLIMITED)
    writer_n_to_z = IndexWriter(dir_n_to_z, analyzer, True,
                                IndexWriter.MaxFieldLength.UNLIMITED)
    for name in animals:
        doc = Document()
        doc.add(Field("animal", name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        # Route by first letter: everything before 'n' goes to the
        # first index, the rest to the second.
        if name[0].lower() < "n":
            writer_a_to_m.addDocument(doc)
        else:
            writer_n_to_z.addDocument(doc)
    writer_a_to_m.close()
    writer_n_to_z.close()
    self.searchers = [IndexSearcher(dir_a_to_m),
                      IndexSearcher(dir_n_to_z)]
示例3: index
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def index(source, indexName):
if(not os.path.exists(indexName)):
os.mkdir(indexName)
indexDir = File(indexName)
writer = IndexWriter(SimpleFSDirectory(File(indexName)),StandardAnalyzer(Version.LUCENE_CURRENT), True,IndexWriter.MaxFieldLength.LIMITED)
p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
res = p.findall(source)
i = 0
for pair in res:
i += 1
doc = Document()
doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
for t in pair[1].split():
doc.add(Field("content", t.replace("-","_"), Field.Store.NO, Field.Index.NOT_ANALYZED));
#doc.add(Field("content", pair[1], Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc)
writer.close()
print str(i)+ " docs indexed"
示例4: main
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def main(cls, argv):
if len(argv) < 5:
print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
return
docsInIndex = int(argv[1])
# create an index called 'index-dir' in a temp directory
indexDir = os.path.join(tempfile.gettempdir(),
'index-dir')
dir = FSDirectory.open(indexDir,)
analyzer = SimpleAnalyzer()
writer = IndexWriter(dir, analyzer, True)
# set variables that affect speed of indexing
writer.setMergeFactor(int(argv[2]))
writer.setMaxMergeDocs(int(argv[3]))
writer.setMaxBufferedDocs(int(argv[4]))
# writer.infoStream = tempfile.out
print "Merge factor: ", writer.getMergeFactor()
print "Max merge docs:", writer.getMaxMergeDocs()
print "Max buffered docs:", writer.getMaxBufferedDocs()
start = time()
for i in xrange(docsInIndex):
doc = Document()
doc.add(Field("fieldname", "Bibamus",
Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()
print "Time: ", timedelta(seconds=time() - start)
示例5: configure_lucene
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def configure_lucene():
f = open('clique.txt','r')
lucene.initVM()
print 'Inside Function'
#indexDir = "/tmp/luceneindex"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
print >> sys.stderr, "Reading lines from sys.stdin..."
for line in f:
line = line.replace('\t','')
line = line.replace('\r','')
line = line.replace('\n','')
line = line.replace('^','')
line = line.strip()
doc = Document()
doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
writer.optimize()
print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
writer.close()
示例6: run
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def run(self):
env.attachCurrentThread()
stream = tweetstream.SampleStream("username", "password")
for tweet in stream:
try:
contents = unicode(tweet['text'])
user_name = tweet['user']['screen_name']
#print contents
#print user_name
doc = Document()
doc.add(Field("user_name", user_name, Field.Store.YES, Field.Index.NOT_ANALYZED))
if len(contents) > 0:
doc.add(Field("contents", contents, Field.Store.YES, Field.Index.ANALYZED))
else:
pass
self.writer.addDocument(doc)
# optimize for fast search and commit the changes
self.writer.optimize()
self.writer.commit()
except Exception as e: pass
示例7: add_new_document_with_metadata
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def add_new_document_with_metadata(writer,filepath,fieldnames,values):
file = open(filepath)
contents = unicode(file.read(), 'UTF-8')
file.close()
doc = Document()
# add name, path, and contents fields
doc.add(Field("name", os.path.basename(filepath),
Field.Store.YES,
Field.Index.NOT_ANALYZED))
doc.add(Field("path", os.path.realpath(filepath),
Field.Store.YES,
Field.Index.NOT_ANALYZED))
doc.add(Field("txtorg_id", str(uuid.uuid1()),
Field.Store.YES,
Field.Index.NOT_ANALYZED))
if len(contents) > 0:
doc.add(Field("contents", contents,
Field.Store.NO,
Field.Index.ANALYZED,
Field.TermVector.YES))
else:
print "warning: no content in %s" % filename
for idx in range(len(fieldnames)):
doc.add(Field(fieldnames[idx].lower(),values[idx].lower(),Field.Store.YES,Field.Index.NOT_ANALYZED))
writer.addDocument(doc)
示例8: reindex_all
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def reindex_all(reader, writer, analyzer):
for i in xrange(reader.maxDoc()):
if reader.isDeleted(i): continue
doc = reader.document(i)
p = doc.get("path")
pkid = doc.get('txtorg_id')
if p is None:
# No filepath specified, just use original document
writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer)
else:
# if a path field is found, try to read the file it points to and add a contents field
edited_doc = Document()
for f in doc.getFields():
edited_doc.add(Field.cast_(f))
try:
inf = open(p)
contents = unicode(inf.read(), 'UTF-8')
inf.close()
if len(contents) > 0:
edited_doc.add(Field("contents", contents,
Field.Store.NO,
Field.Index.ANALYZED,
Field.TermVector.YES))
else:
print "warning: no content in %s" % filename
except:
print "Could not read file; skipping"
writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
示例9: setUp
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def setUp(self):
    """Index two fox sentences in a RAM directory and prepare a
    searcher, a reader, and one SpanTermQuery per word of interest."""
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()
    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sentences = ["the quick brown fox jumps over the lazy dog",
                 "the quick red fox jumps over the sleepy cat"]
    for text in sentences:
        doc = Document()
        doc.add(Field("f", text, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)
    # One span query attribute per term the tests combine.
    for word in ("quick", "brown", "red", "fox",
                 "lazy", "sleepy", "dog", "cat"):
        setattr(self, word, SpanTermQuery(Term("f", word)))
示例10: index
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def index(string):
    """Append *string* as one stored, analyzed "text" document to the
    on-disk index REMOVEME.index-dir, creating the index from scratch if
    opening in append mode fails; optimizes before closing."""
    lucene.initVM()
    store = SimpleFSDirectory(File("REMOVEME.index-dir"))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # Prefer appending to an existing index ...
        writer = IndexWriter(store, analyzer, False,
                             IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # ... falling back to creating a fresh one.
        writer = IndexWriter(store, analyzer, True,
                             IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
示例11: addContents
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def addContents(self,contents):
try:
#iwconfig = IndexWriterConfig(SimpleAnalyzer(),IndexWriter.MaxFieldLength.LIMITED)
writer = IndexWriter(self.ramIndex,SimpleAnalyzer(Version.LUCENE_CURRENT),True,IndexWriter.MaxFieldLength.LIMITED)
for content in contents:
doc = Document()
doc.add(Field("contents",content[1],Field.Store.NO,Field.Index.ANALYZED,Field.TermVector.YES))
writer.addDocument(doc)
writer.close()
except Exception,e:
print 'Unable to add content to RAM index'
示例12: indexSingleFieldDocs
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def indexSingleFieldDocs(self, fields):
    """Wrap each Field in *fields* in its own document and commit the
    batch to self.directory using a whitespace analyzer."""
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for single_field in fields:
        one_doc = Document()
        one_doc.add(single_field)
        writer.addDocument(one_doc)
    writer.commit()
    writer.close()
示例13: setUp
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def setUp(self):
    """Index a single stemmable sentence with the porter analyzer into a
    fresh RAM directory for the stemming tests."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    document = Document()
    document.add(Field("contents",
                       "The quick brown fox jumps over the lazy dogs",
                       Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(document)
    writer.close()
示例14: setUp
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def setUp(self):
    """Build a one-document, whitespace-analyzed RAM index and open a
    searcher over it."""
    # set up sample document
    directory = RAMDirectory()
    writer = IndexWriter(directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    sample = Document()
    sample.add(Field("field",
                     "the quick brown fox jumped over the lazy dog",
                     Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(sample)
    writer.close()
    self.searcher = IndexSearcher(directory)
示例15: index
# 需要导入模块: from lucene import Document [as 别名]
# 或者: from lucene.Document import add [as 别名]
def index(self,path_to_index,path_files):
'indexes anchor texts from a given folder'
#lucene.initVM()
indexDir = path_to_index
directory_index = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(Version.LUCENE_35)
writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
listOfPathes = []
listOfPathes.extend(glob.glob(path_files+"*.txt"))
counter = 0
for path_to_file in listOfPathes:
print path_to_file
f = open(path_to_file,"r")
for line in f:
entry = line.split("\t")
counter+=1
"""
optimizes index after a certain amount of added documents
"""
if counter%500000==0:
print counter
writer.optimize()
doc = Document()
doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("number", entry[3].replace("\n",""), Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.optimize()
f.close()
writer.close()
print counter
print "done"