This article collects typical usage examples of the lucene.Document class in Python (PyLucene). If you are wondering what the Document class is for or how to use it, the curated examples below may help.
Fifteen code examples of the Document class are shown below, ordered by popularity by default.
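All fifteen examples share one core pattern: create a Document, attach one or more Field objects, and hand the document to an IndexWriter. Before the examples, here is a minimal sketch of that pattern in isolation; the index path and field name are illustrative, and the constructor style is the pre-4.0 PyLucene API used throughout the examples below.

import lucene
from lucene import (SimpleFSDirectory, File, StandardAnalyzer, Version,
                    IndexWriter, Document, Field)

lucene.initVM()
# hypothetical index location; any writable path works
store = SimpleFSDirectory(File("/tmp/example.index"))
analyzer = StandardAnalyzer(Version.LUCENE_30)
writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

doc = Document()
# stored and analyzed: the field is both searchable and retrievable from hits
doc.add(Field("text", "hello lucene", Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()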
Example 1: index
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # open the existing index for appending
        writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # no index exists yet at indexDir, so create one
        writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example 2: index
def index(source, indexName):
    if not os.path.exists(indexName):
        os.mkdir(indexName)
    indexDir = File(indexName)
    writer = IndexWriter(SimpleFSDirectory(indexDir),
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    # each record is an id of the form "GH<digits>-<digits>" followed by its text
    p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
    res = p.findall(source)
    i = 0
    for pair in res:
        i += 1
        doc = Document()
        doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
        for t in pair[1].split():
            # index each whitespace token verbatim, normalizing "-" to "_"
            doc.add(Field("content", t.replace("-", "_"),
                          Field.Store.NO, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print str(i) + " docs indexed"
Example 3: configure_lucene
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        # drop tabs, carriage returns, newlines and carets before indexing
        line = line.replace('\t', '').replace('\r', '').replace('\n', '').replace('^', '').strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example 4: setUp
def setUp(self):
    animals = ["aardvark", "beaver", "coati",
               "dog", "elephant", "frog", "gila monster",
               "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca",
               "python", "quokka", "rat", "scorpion",
               "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]
    analyzer = WhitespaceAnalyzer()
    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()
    aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    # split the animals across two indexes: a-m and n-z
    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        if animal[0].lower() < "n":
            aTOmWriter.addDocument(doc)
        else:
            nTOzWriter.addDocument(doc)
    aTOmWriter.close()
    nTOzWriter.close()
    self.searchers = [IndexSearcher(aTOmDirectory),
                      IndexSearcher(nTOzDirectory)]
Example 5: main
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return
    docsInIndex = int(argv[1])
    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(tempfile.gettempdir(), 'index-dir')
    dir = FSDirectory.open(File(indexDir))
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # set variables that affect the speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    print "Merge factor: ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()
    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
Example 6: run
def run(self):
    env.attachCurrentThread()
    stream = tweetstream.SampleStream("username", "password")
    for tweet in stream:
        try:
            contents = unicode(tweet['text'])
            user_name = tweet['user']['screen_name']
            doc = Document()
            doc.add(Field("user_name", user_name,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            # tweets with empty text are indexed with the user_name field only
            if len(contents) > 0:
                doc.add(Field("contents", contents,
                              Field.Store.YES, Field.Index.ANALYZED))
            self.writer.addDocument(doc)
            # optimize for fast search and commit the changes
            self.writer.optimize()
            self.writer.commit()
        except Exception:
            # skip tweets that cannot be decoded or indexed
            pass
Example 7: indexfeeds
def indexfeeds(self, writer):
    """Index the title and summary of every entry in a list of RSS feeds."""
    feedlist = ['http://today.reuters.com/rss/topNews',
                'http://today.reuters.com/rss/domesticNews',
                'http://today.reuters.com/rss/wordNews',
                'http://rss.cnn.com/rss/edition.rss',
                'http://rss.cnn.com/rss/edition_word.rss',
                'http://rss.cnn.com/rss/edition_us.rss']
    articletitles = []
    for feed in feedlist:
        f = feedparser.parse(feed)
        for e in f.entries:
            # skip articles we have already indexed
            if e.title in articletitles:
                continue
            articletitles.append(e.title)
            contents = e.title.encode('utf8') + self.strphtml(e.description.encode('utf8'))
            try:
                doc = Document()
                doc.add(Field("name", e.title,
                              Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(Field("contents", contents,
                                  Field.Store.YES,
                                  Field.Index.ANALYZED,
                                  Field.TermVector.YES))
                writer.addDocument(doc)
            except Exception:
                print 'Unable to index'
def reindex_all(reader, writer, analyzer):
for i in xrange(reader.maxDoc()):
if reader.isDeleted(i): continue
doc = reader.document(i)
p = doc.get("path")
pkid = doc.get('txtorg_id')
if p is None:
# No filepath specified, just use original document
writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer)
else:
# if a path field is found, try to read the file it points to and add a contents field
edited_doc = Document()
for f in doc.getFields():
edited_doc.add(Field.cast_(f))
try:
inf = open(p)
contents = unicode(inf.read(), 'UTF-8')
inf.close()
if len(contents) > 0:
edited_doc.add(Field("contents", contents,
Field.Store.NO,
Field.Index.ANALYZED,
Field.TermVector.YES))
else:
print "warning: no content in %s" % filename
except:
print "Could not read file; skipping"
writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
Example 9: indexDocs
def indexDocs(self):
    # walk the directory of user_location_map files
    for root, _, filenames in os.walk(self.directory):
        for filename in filenames:
            try:
                path = os.path.join(root, filename)
                print path
                f = open(path, 'r')
                # every line in the file is a user_location_map entry
                for line in f:
                    data = cjson.decode(line)
                    doc = Document()
                    tw_texts = []
                    num_tweets = {}
                    total_num_tweets = 0.0
                    for i in data['locations']:
                        tweets = [x['tx'] for x in i['tweets']]
                        num_tweets[i['name']] = len(i['tweets'])
                        tw_texts.extend(tweets)
                        total_num_tweets += len(tweets)
                    # tweets are indexed as part of the UserMapIndexer
                    doc.add(Field("text", " ".join(tw_texts), Field.Store.NO,
                                  Field.Index.ANALYZED))
                    doc.add(Field("num_tweets", cjson.encode(num_tweets),
                                  Field.Store.YES,
                                  Field.Index.NO))
                    doc.add(Field("user", data['user'], Field.Store.YES,
                                  Field.Index.NO))
                    # boost each document by its total tweet count
                    doc.setBoost(total_num_tweets)
                    self.writer.addDocument(doc)
                f.close()
            except Exception, e:
                print "Failed in indexLocations:", e
                print sys.exc_info()[0]
Example 10: addDocuments
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    for word in self.docs:
        doc = Document()
        # the same value indexed four ways: keyword, unindexed, unstored, text
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
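A hypothetical driver for the helper above, comparing the compound and multifile index formats in memory and on disk (the directory locations are illustrative; self.docs is the word list the method already assumes):

ramDir = RAMDirectory()
fsDir = FSDirectory.open(File(tempfile.mkdtemp()))
self.addDocuments(ramDir, False)  # multifile format in memory
self.addDocuments(fsDir, True)    # compound (.cfs) format on disk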
Example 11: do_index
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))
    # build one Lucene document per database row
    for doc in docs:
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
Example 12: index
def index(self, path_to_index, path_files):
    """Index anchor texts from a given folder."""
    # lucene.initVM()
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        # each line holds tab-separated fields: anchor, anchor_uri, dbpedia_uri, number
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after every 500000 added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
Example 13: addContents
def addContents(self, contents):
    try:
        writer = IndexWriter(self.ramIndex,
                             SimpleAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        for content in contents:
            doc = Document()
            # content[1] holds the text to index; term vectors are kept for it
            doc.add(Field("contents", content[1],
                          Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)
        writer.close()
    except Exception:
        print 'Unable to add content to RAM index'
Example 14: setUp
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("contents",
                  "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
Example 15: indexSingleFieldDocs
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # one document per field
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.commit()
    writer.close()
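A hypothetical call site for the helper above, building one single-field document per Field object (the field names and values are illustrative):

fields = [Field("partnum", "Q36",
                Field.Store.YES, Field.Index.NOT_ANALYZED),
          Field("description", "Illidium Space Modulator",
                Field.Store.YES, Field.Index.ANALYZED)]
self.indexSingleFieldDocs(fields)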