This article compiles typical usage examples of the Python method org.apache.lucene.document.Document.get: what the method does, how to call it, and what a working example looks like. The curated code example below may help, and you can also explore the containing class, org.apache.lucene.document.Document, further.
The following shows 1 code example of the Document.get method.
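Before that example, here is a minimal, hedged sketch of what Document.get returns. The field names and values are illustrative only, and it assumes a PyLucene build that still exposes the legacy Field(name, value, Field.Store, Field.Index) constructor used in Example 1:

import lucene
from org.apache.lucene.document import Document, Field

lucene.initVM()
doc = Document()
doc.add(Field("title", "Hello Lucene", Field.Store.YES, Field.Index.ANALYZED))
print doc.get("title")    # prints the field's string value: Hello Lucene
print doc.get("missing")  # prints None: get returns None for fields the document lacks

Document.get returns the string value of the first field stored under the given name, which is why Example 1 marks every field it later reads back with Field.Store.YES.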
Example 1: main
# Required import: from org.apache.lucene.document import Document
# Alternatively: from org.apache.lucene.document.Document import get
import csv
import json
import logging

import lucene
# NOTE: the sub-packages below follow the Lucene 4.x-era layout exposed by PyLucene;
# other PyLucene releases may place these classes in different packages.
from java.io import File
from org.apache.lucene.analysis.core import StopAnalyzer
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from os import listdir
from os.path import isfile, join

logger = logging.getLogger(__name__)


def main(indexDir, inputDir):
    """Create a Lucene index in indexDir and index every .json file found in inputDir.
    Stop words are read from stopwords.txt and filtered out during analysis."""
    lucene.initVM()
    logger.info("Loading stop words from stopwords.txt")
    stopwords = set()
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.add(line.strip())
    logger.debug('Stop words: %s' % str(stopwords))
    # Copy the Python set into the CharArraySet that the analyzer expects
    temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)  # initial capacity; grows as needed
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create the index
    logger.info("Creating Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(dir, writerConfig)
    logger.info("Currently there are %d documents in the index..." % writer.numDocs())

    # Index every .json file in inputDir; each file holds a list of entries with
    # 'url', 'date' and 'title' keys, and the journal code is taken from the file name.
    onlyfiles = [f for f in listdir(inputDir)
                 if isfile(join(inputDir, f)) and f.endswith('.json')]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(inputDir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                # Stored fields can be read back later with Document.get
                doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v.args
            except ValueError:
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed %d JSON files (%d documents in index)" % (len(onlyfiles), writer.numDocs()))

    # Wrap it up
    #logger.info("About to optimize index of %d documents..." % writer.numDocs())
    #writer.optimize()
    #logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    # Re-open the index and dump every stored field to all.csv via Document.get
    reader = IndexReader.open(dir)
    with open('all.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for i in xrange(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([doc.get('journal'), doc.get('date'),
                                doc.get('url').encode('utf8'),
                                doc.get('title').strip().replace(',', '\\,').encode('utf8')])
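A hedged usage sketch for running the example; the directory paths are placeholders, not part of the original code, and should point at a writable index location and a folder of .json files:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # Placeholder paths, for illustration only
    main('/tmp/lucene-index', '/tmp/json-input')

Each row written to all.csv comes from Document.get on documents fetched from the re-opened index, so only fields added with Field.Store.YES are available at that point.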