本文整理汇总了Python中org.apache.lucene.index.IndexWriter.numDocs方法的典型用法代码示例。如果您正苦于以下问题:Python IndexWriter.numDocs方法的具体用法?Python IndexWriter.numDocs怎么用?Python IndexWriter.numDocs使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.index.IndexWriter
的用法示例。
在下文中一共展示了IndexWriter.numDocs方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def create_index():
    """Build a fresh Lucene index over Wikipedia articles.

    Reads article text via wiki.Wiki(prm.pages_path) and writes the index
    to prm.index_folder, deleting any previous index first.  Relies on
    module-level globals `prm` (config) and `wiki`, plus the PyLucene
    classes imported elsewhere in the file.
    """
    # Start the JVM for PyLucene; must happen before any Lucene call.
    lucene.initVM()
    # Rebuild from scratch: drop any previous index folder.
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)
    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)
    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        # Both fields are stored and analyzed so they can be searched
        # and retrieved later; the id is the sequential article number.
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        # Progress report every 100k articles.
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
示例2: wikipedia_indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def wikipedia_indexer(storage, wikipedia_file) :
    """Index a tab-separated Wikipedia dump (title\\ttext per line) into a
    Lucene index at `storage`, filtering with the module-level `stopwords`
    iterable and skipping disambiguation pages.
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    # Build a Lucene stop-word set from the module-level `stopwords`.
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords :
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    f = open(wikipedia_file)
    for i, line in enumerate(f) :
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        # Skip disambiguation pages and malformed lines with no body text.
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        # "num" is the source line number; stored but not indexed.
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        # Body is searchable but not stored, to keep the index small.
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        # Progress report roughly every 1000 documents.
        if writer.numDocs() % 1000 == 0 :
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
示例3: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def create_index(storage, paths) :
    """Index overlapping sentence windows from every file under `paths`.

    Each document is a window of up to 10 sentences (text[i-5:i+5]),
    sampled every 3 sentences.  Uses module-level `stopwords`,
    `sent_tokenize` and `get_data_from_file` helpers.
    """
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    # Build a Lucene stop-word set from the module-level `stopwords`.
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords :
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    import os
    for path in paths :
        for filen in os.listdir(path) :
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # Slide a sentence window, stepping 3 sentences at a time.
            for i in range(0, total_sent, 3) :
                doc = Document()
                # Window starts up to 5 sentences before i (clamped at 0).
                a = i-5 if i-5 > 0 else 0
                sentence = ' '.join(text[a:i+5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path+filen))
    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
示例4: __init__
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
class LuceneIndexer:
    """Thin wrapper around a PyLucene IndexWriter that indexes rows of
    fields described by a (name, field_type) header, e.g. rows parsed
    from tab-separated stdin."""

    def __init__(self, path_to_save):
        """Open an IndexWriter over the directory `path_to_save`."""
        self.path_to_save = path_to_save
        # Count of documents added through this wrapper (not the index total).
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        # NOTE(review): the whitespace analyzer (analyzer2) is the one the
        # writer actually uses; the standard analyzer appears unused here.
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        """Add one document built from `fields` typed per `header`.

        Rows with more fields than the header declares are skipped and
        reported on stderr, tagged with `id_`.
        """
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            # IntField values must be passed as ints, not strings.
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        """Report counts and close the underlying writer."""
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
示例5: indexer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def indexer(docNumber, docText):
lucene.initVM()
indexDir = SimpleFSDirectory(File("index/"))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
doc = Document()
doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
示例6: lucene_indexing
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def lucene_indexing():
lucene.initVM()
index_dir = os.getcwd()
dir = SimpleFSDirectory(File(index_dir))
analyzer = StandardAnalyzer(Version.LUCENE_48)
index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer);
index_writer = IndexWriter(dir, index_writer_config)
for tfile in glob.glob(os.path.join(index_dir, '*.txt')):
print "Indexing: ", tfile
document = Document()
with open(tfile, 'r') as f:
content = f.read()
document.add(Field("text", content, Field.Store.YES,
Field.Index.ANALYZED))
document.add(Field("title", tfile, Field.Store.YES,
Field.Index.ANALYZED))
index_writer.addDocument(document)
print index_writer.numDocs()
index_writer.close()
示例7: create_index
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def create_index(index) :
    """Index question lines from a fixed combine.txt file into `index`.

    Uses module-level `stopwords` and `get_data_from_text`.
    NOTE(review): no lucene.initVM() here — assumes the JVM was already
    started by the caller; confirm before using standalone.
    """
    indexDir = SimpleFSDirectory(File(index))
    # Build a Lucene stop-word set from the module-level `stopwords`.
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords :
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)
    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"
    f = open('f:/nlp/data/questions/combine.txt')
    for line in f :
        line = get_data_from_text(line.decode('utf-8'))
        doc = Document()
        field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
        # Boost this field so matches here score higher at query time.
        field.setBoost(2.0)
        doc.add(field)
        writer.addDocument(doc)
    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
示例8: SimpleFSDirectory
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
LUCENE_TYPES={'i':IntField,'s':StringField,'t':TextField}
if __name__ == "__main__":
lucene.initVM()
indexDir = SimpleFSDirectory(File("lucene/"))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
print "%d docs in index" % writer.numDocs()
print "Reading lines from sys.stdin..."
header=[]
for n, l in enumerate(sys.stdin):
doc = Document()
fields = l.rstrip().split("\t")
for (idx,field) in enumerate(fields):
if n == 0:
typechar = field[-1]
if typechar not in set(['t','s','i']):
sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field))
exit(-1)
header.append([field,LUCENE_TYPES[typechar]])
else:
(fname,fieldtype) = header[idx]
if fieldtype is IntField:
示例9: IndexWriterConfig
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
# Top-level snippet: index (title, id) pairs pulled from the MySQL table
# article_page.  NOTE(review): assumes `indexDir`, `mdb` (MySQLdb) and the
# Lucene imports are already in scope from earlier in the original file.
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(indexDir, writerConfig)
try:
    con = mdb.connect('localhost', 'root', '', 'cs246')
    cur = con.cursor()
    cur.execute("SELECT * FROM article_page;")
    rows = cur.fetchall()
    n = 0
    for row in rows:
        n = n+1
        page_id = str(row[0])
        # Titles are stored with underscores in the DB; restore spaces.
        page_title = str(row[1]).replace('_', ' ')
        doc = Document()
        # Title is searchable (no norms); id is stored only, not indexed.
        doc.add(Field("title", page_title, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("id", page_id, Field.Store.YES, Field.Index.NO))
        writer.addDocument(doc)
    print "total number of tuples", n
except mdb.Error, e:
    print "Error %d: %s" % (e.args[0],e.args[1])
    sys.exit(1)
finally:
    # Always release the DB connection, even on error.
    if con:
        con.close()
print "Created (%d docs in index)" % (writer.numDocs())
print "Closing index of %d docs..." % writer.numDocs()
writer.close()
示例10: SimpleFSDirectory
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, FieldType
if __name__ == "__main__":
    # Headless flag: avoid AWT trying to open a display inside the JVM.
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    indexDir = "../pyFreya/freya/index/actual"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    # CREATE mode: overwrite any existing index at this location.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(dir, config)
    # "data" holds documents separated by the literal token "newDocSep";
    # each document is a list of fields separated by "csvSep", and each
    # field is "||"-delimited with the name at [1] and value at [2].
    with open("data",'r') as f:
        for doc in f.read().split("newDocSep"):
            docr = Document()
            for field in doc.split("csvSep"):
                fieldData = field.split("||")
                # Malformed fields (too few "||" parts) are skipped noisily.
                try:docr.add(Field(fieldData[1], fieldData[2], Field.Store.YES, Field.Index.ANALYZED))
                except:print "ups"
                print "\n"
            writer.addDocument(docr)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.commit()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    print >> sys.stderr, "...done closing index of %d documents" % writer.numDocs()
    writer.close()
示例11: main
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
def main(indexDir, inputDir):
    """Creates a Lucene Index, and indexes every .json file it finds.
    It utilizes a stopwords.txt to filter out stop words"""
    lucene.initVM()
    logger.info("Loading stop words from stopwords.txt")
    # One stop word per line in stopwords.txt.
    f = open('stopwords.txt', 'r')
    stopwords = set([])
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    # Convert the Python set into the Lucene CharArraySet the analyzer needs.
    temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp
    # Create index
    logger.info("Creating Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    writer = IndexWriter(dir, writerConfig)
    logger.info("Currently there are %d documents in the index..." % writer.numDocs())
    # Index documents
    onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
    for f in onlyfiles:
        try:
            # Journal code is the filename without its .json extension.
            journal_code = f.split('.')[0]
            f = join(inputDir, f)
            json_data = open(f)
            data = json.load(json_data)
            # Each JSON file holds a list of entries with url/date/title keys.
            for entry in data:
                doc = Document()
                doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
                doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
                doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            json_data.close()
        except (IOError) as v:
            # IOError may or may not unpack into (errno, message); fall back
            # to treating the exception itself as the message.
            try:
                (code, message) = v
            except:
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())
    # Wrap it up
    #logger.info("About to optimize index of %d documents..." % writer.numDocs())
    #writer.optimize()
    #logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    # Dump the whole index back out as all.csv for inspection.
    reader = IndexReader.open(dir)
    with open('all.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for i in xrange(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'), \
                doc.get('title').strip().replace(',', '\,').encode('utf8')])
示例12: StandardAnalyzer
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
# Top-level snippet: index product records.  NOTE(review): this extract
# appears truncated — `data` and `keys` must be defined earlier in the
# original file, and the handling of 'product_specifications' continues
# beyond what is shown here.
lucene.initVM()
print "lucene version is:", lucene.VERSION
# Get the analyzer
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# Get index storage
indexDir = SimpleFSDirectory(File("index/"))
# Get index writer
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
writer = IndexWriter(indexDir, config);
print "%d docs in index" % writer.numDocs()
for d in data:
    rec = d['record']
    # Skip records missing the required identifying fields.
    if not rec['product_name'] or not rec['uniq_id']:
        logging.info ("Incomplete product ... skipping")
        logging.debug(rec)
        continue
    else:
        doc = Document()
        for k,v in rec.iteritems():
            # Plain fields listed in `keys` are indexed directly; nested
            # structures like product_specifications need special handling.
            if k in keys:
                doc.add(Field(k, v, Field.Store.YES, Field.Index.ANALYZED))
            else:
                if (k == 'product_specifications'):
                    specs = v['product_specification']
示例13: IndexFiles
# 需要导入模块: from org.apache.lucene.index import IndexWriter [as 别名]
# 或者: from org.apache.lucene.index.IndexWriter import numDocs [as 别名]
class IndexFiles(object):
    """Indexes crawled HTML pages (one JSON record per line) into a
    Lucene index, skipping exact duplicates and flagging near-duplicates."""

    def __init__(self, indexDir):
        """Open (or create) an index at `indexDir` in append mode."""
        if not os.path.exists(indexDir):
            os.mkdir(indexDir)
        store = SimpleFSDirectory(File(indexDir))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        # Append to an existing index when one is already present.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        self.writer = IndexWriter(store, config)

    def index(self, file, duplicates):
        """Index every JSON-line document in `file`.

        `duplicates` is a list of {'duplicate': url, 'sim': score} dicts:
        sim == 1 means an exact duplicate (skipped); sim < 1 means a
        near-duplicate (indexed but flagged).  Returns the number of
        documents now in the index.
        """
        exact = [duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] == 1]
        near = [duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] < 1]
        with open(file) as file:
            for document in file:
                data = json.loads(document)
                if (data['url'] in exact):
                    continue
                doc = self.createDoc(data['url'], data['html'], data['url'] in near)
                self.writer.addDocument(doc)
                # store_outlinks is defined elsewhere in the project.
                store_outlinks(data['url'], data['outlinks'])
        self.writer.commit()
        return self.writer.numDocs()

    def createDoc(self, url, html, duplicate):
        """Build a Lucene Document for one page; `duplicate` marks it as a
        near-duplicate via a stored "duplicate" flag field."""
        title, contents = self.parseHtml(url, html)
        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(StringField("duplicate", str(duplicate).lower(), Field.Store.YES))
        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print "Warning: No content in %s" % url
        return doc

    def close(self):
        """Close the underlying IndexWriter."""
        self.writer.close()

    def parseHtml(self, url, html):
        """Return (title, body text) extracted from the page HTML."""
        soup = BeautifulSoup(html, 'lxml')
        title = self.getTitle(url, soup)
        body = self.getBody(soup)
        return title, body

    def getTitle(self, url, soup):
        # Prefer <title>, then the first <h1>, else the last URL segment.
        if soup.title:
            title = soup.title.get_text().strip()
        elif soup.find("h1"):
            title = " ".join(soup.find("h1").get_text().split())
        else:
            title = url.split("/")[-1]
        return title

    def getBody(self, soup):
        # Strip HTML comments, <style> and <script> before extracting text.
        comments = soup.findAll(text=lambda text:isinstance(text, Comment))
        [comment.extract() for comment in comments]
        [style.decompose() for style in soup.find_all('style')]
        [script.decompose() for script in soup.find_all('script')]
        if soup.body:
            return soup.body.get_text(" ", strip=True)
        else:
            return soup.get_text(" ", strip=True)