Python IndexWriter.numDocs Method Code Examples

This article collects typical code examples of the Python method org.apache.lucene.index.IndexWriter.numDocs. If you are wondering what IndexWriter.numDocs does in Python, how to call it, or how it is used in practice, the curated examples below should help. You can also explore further usage examples of org.apache.lucene.index.IndexWriter, the class this method belongs to.


The following presents 13 code examples of the IndexWriter.numDocs method, collected from open-source projects and ordered by popularity.
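
Before the examples, here is a minimal sketch of the call itself, assuming PyLucene 4.10.x (Version.LUCENE_4_10_1) as most of the snippets below do; the index path and sample field are hypothetical. numDocs() returns the current document count of the index, including documents still buffered in RAM, which is why the examples print it before, during, and after indexing.

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

lucene.initVM()
index_dir = SimpleFSDirectory(File("index/"))  # hypothetical index path
config = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(index_dir, config)

print "%d docs in index before adding" % writer.numDocs()
doc = Document()
doc.add(Field("text", "hello lucene", Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
print "%d docs in index after adding" % writer.numDocs()  # buffered docs are counted too
writer.close()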

Example 1: create_index

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def create_index():

    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Author: domarps, Project: WebNav, Lines: 27, Source: lucene_search.py

Example 2: wikipedia_indexer

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def wikipedia_indexer(storage, wikipedia_file) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open(wikipedia_file)

	for i, line in enumerate(f) :
		text = line.strip().decode('utf-8').split('\t')
		title = text[0]
		if 'disambigu' in text[0] or len(text) < 2:
			continue
		text = text[1]
		doc = Document()
		doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
		doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
		doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
		writer.addDocument(doc)
		if writer.numDocs() % 1000 == 0 :
			print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)
		
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()	
Author: successar, Project: Lucene-QA, Lines: 33, Source: wikipedia_indexer.py

Example 3: create_index

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Author: successar, Project: Lucene-QA, Lines: 30, Source: sentence_indexer.py

Example 4: __init__

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
class LuceneIndexer:

    def __init__(self, path_to_save):
        self.path_to_save = path_to_save
        self.num_docs = 0
        lucene.initVM()
        self.indexDir = SimpleFSDirectory(File(self.path_to_save))
        self.analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
        self.analyzer2 = WhitespaceAnalyzer(Version.LUCENE_4_10_1)
        self.writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, self.analyzer2)
        self.writer = IndexWriter(self.indexDir, self.writerConfig)

    def add_document(self, fields, header, id_):
        doc = Document()
        if len(fields) > len(header):
            sys.stderr.write('SKIPPED_DOC\tunexpected_num_lines\t%s\n' % str(id_))
            for field in fields:
                sys.stderr.write('%s\n' % field)
            return
        for idx, field in enumerate(fields):
            fname, fieldtype = header[idx]
            if fieldtype is IntField:
                field = int(field)
            doc.add(fieldtype(fname, field, Field.Store.YES))
        self.writer.addDocument(doc)
        self.num_docs += 1

    def close(self):
        print 'Indexed %d lines from stdin (%d docs in index)' % (self.num_docs, self.writer.numDocs())
        self.writer.close()
Author: ChristopherWilks, Project: ncbi_indexing, Lines: 32, Source: lucene_indexer.py
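
For context, a hypothetical driver for the LuceneIndexer class above might look like the following; the output path, the header schema, and the tab-separated stdin format are assumptions inferred from how add_document() consumes its arguments.

import sys
from org.apache.lucene.document import IntField, TextField

indexer = LuceneIndexer("lucene_index/")             # assumed output path
header = [("name", TextField), ("count", IntField)]  # assumed (field_name, field_type) schema
for line_no, line in enumerate(sys.stdin):
    fields = line.rstrip("\n").split("\t")           # one tab-separated record per line
    indexer.add_document(fields, header, line_no)
indexer.close()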

Example 5: indexer

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Author: arrudamichel, Project: Systems-Engineering, Lines: 13, Source: Indexer.py

Example 6: lucene_indexing

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def lucene_indexing():
    lucene.initVM()
    index_dir = os.getcwd()
    dir = SimpleFSDirectory(File(index_dir))
    analyzer = StandardAnalyzer(Version.LUCENE_48)
    index_writer_config = IndexWriterConfig(Version.LUCENE_48, analyzer);
    index_writer = IndexWriter(dir, index_writer_config)

    for tfile in glob.glob(os.path.join(index_dir, '*.txt')):
        print "Indexing: ", tfile
        document = Document()
        with open(tfile, 'r') as f:
            content = f.read()
        document.add(Field("text", content, Field.Store.YES,
                           Field.Index.ANALYZED))
        document.add(Field("title", tfile, Field.Store.YES,
                           Field.Index.ANALYZED))
        index_writer.addDocument(document)
    print index_writer.numDocs()
    index_writer.close()
Author: hzatarain, Project: somali, Lines: 22, Source: somali.py

Example 7: create_index

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def create_index(index) :
	indexDir = SimpleFSDirectory(File(index))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	f = open('f:/nlp/data/questions/combine.txt')
	for line in f :
		line = get_data_from_text(line.decode('utf-8'))
		doc = Document()
		field = Field("text", line, Field.Store.YES, Field.Index.ANALYZED)
		field.setBoost(2.0)
		doc.add(field)
		writer.addDocument(doc)
	
	print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
Author: successar, Project: Lucene-QA, Lines: 26, Source: statements_indexer.py

Example 8: SimpleFSDirectory

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

LUCENE_TYPES={'i':IntField,'s':StringField,'t':TextField}

 
if __name__ == "__main__":
  lucene.initVM()
  indexDir = SimpleFSDirectory(File("lucene/"))
  writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
  writer = IndexWriter(indexDir, writerConfig)
 
  print "%d docs in index" % writer.numDocs()
  print "Reading lines from sys.stdin..."
  header=[]
  for n, l in enumerate(sys.stdin):
    doc = Document()
    fields = l.rstrip().split("\t")
    for (idx,field) in enumerate(fields):
        if n == 0:
            typechar = field[-1]
            if typechar not in set(['t','s','i']):
                sys.stderr.write("unexpected type char in last character position of header field: %s\n" % (field))
                exit(-1) 
            header.append([field,LUCENE_TYPES[typechar]])
        else:
            (fname,fieldtype) = header[idx]
            if fieldtype is IntField:
Author: PhaniGaddipati, Project: snaptron, Lines: 33, Source: lucene_indexer.py

Example 9: IndexWriterConfig

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
	writer = IndexWriter(indexDir, writerConfig)

	try:
		con = mdb.connect('localhost', 'root', '', 'cs246')
		cur = con.cursor()
		cur.execute("SELECT * FROM article_page;")
		rows = cur.fetchall()
		n = 0
		for row in rows:
			n = n+1
			page_id = str(row[0])
			page_title = str(row[1]).replace('_', ' ')

			doc = Document()
			doc.add(Field("title", page_title, Field.Store.YES, Field.Index.ANALYZED_NO_NORMS))
			doc.add(Field("id", page_id, Field.Store.YES, Field.Index.NO))
			writer.addDocument(doc)
		print "total number of tuples", n
	except mdb.Error, e:
		print "Error %d: %s" % (e.args[0],e.args[1])
		sys.exit(1)
	finally:
		if con:    
			con.close()

	print "Created (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()

Author: dongyangli, Project: cs246, Lines: 31, Source: Indexer.py

Example 10: SimpleFSDirectory

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.index import FieldInfo, IndexWriter, IndexWriterConfig
from org.apache.lucene.document import Document, Field, FieldType
if __name__ == "__main__":
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    indexDir = "../pyFreya/freya/index/actual"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(dir, config)
    with open("data",'r') as f:
        for doc in f.read().split("newDocSep"):
            docr = Document()
            for field in doc.split("csvSep"):
                fieldData = field.split("||")
                try:docr.add(Field(fieldData[1], fieldData[2], Field.Store.YES, Field.Index.ANALYZED))
                except:print "ups"
            print "\n"
            writer.addDocument(docr)
    print >> sys.stderr, "Indexed lines from stdin (%d documents in index)" % (writer.numDocs())
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.commit()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    print >> sys.stderr, "...done closing index of %d documents" % writer.numDocs()
    writer.close()
Author: gitter-badger, Project: pyFreya, Lines: 33, Source: indexer.py

Example 11: main

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
def main(indexDir, inputDir):
	"""Creates a Lucene Index, and indexes every .json file it finds.
	It utilizes a stopwords.txt to filter out stop words"""
	lucene.initVM()

	logger.info("Loading stop words from stopwords.txt")
	f = open('stopwords.txt', 'r')
	stopwords = set([])
	for line in f:
		stopwords.add(line.strip())
	f.close()
	logger.debug('Stop words: %s' % str(stopwords))
	temp = CharArraySet(Version.LUCENE_CURRENT, 1, True)

	for stopword in stopwords:
		temp.add(stopword)

	stopwords = temp

	# Create index
	logger.info("Creating Lucene index [%s]..." % indexDir)

	dir = SimpleFSDirectory(File(indexDir))
	analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopwords)
	writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
	writer = IndexWriter(dir, writerConfig)

	logger.info("Currently there are %d documents in the index..." % writer.numDocs())

	# Index documents
	onlyfiles = [ f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json') ]
	for f in onlyfiles:
		try:
			journal_code = f.split('.')[0]
			f = join(inputDir, f)
			json_data = open(f)
			data = json.load(json_data)
			for entry in data:
				doc = Document()
				doc.add(Field("journal", journal_code, Field.Store.YES, Field.Index.NOT_ANALYZED))
				doc.add(Field("url", entry['url'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("date", entry['date'], Field.Store.YES, Field.Index.NOT_ANALYZED ))
				doc.add(Field("title", entry['title'], Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			json_data.close()
		except (IOError) as v:
			try:
				(code, message) = v
			except:
				code = 0
				message = v
			logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
	logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

	# Wrap it up
	#logger.info("About to optimize index of %d documents..." % writer.numDocs())
	#writer.optimize()
	#logger.info("...done optimizing index of %d documents" % writer.numDocs())

	logger.info("Closing index of %d documents..." % writer.numDocs())
	writer.close()

	reader = IndexReader.open(dir)
	with open('all.csv', 'wb') as csvfile:
		csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
		for i in xrange(0, reader.numDocs()):
			doc = reader.document(i)
			csvwriter.writerow([doc.get('journal'), doc.get('date'), doc.get('url').encode('utf8'), \
				doc.get('title').strip().replace(',', '\,').encode('utf8')])
Author: kinow, Project: crawlers-noticias, Lines: 71, Source: index.py

Example 12: StandardAnalyzer

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
    

    lucene.initVM()
  
    print "lucene version is:", lucene.VERSION
    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Get index storage
    indexDir = SimpleFSDirectory(File("index/"))

    # Get index writer
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer);
    writer = IndexWriter(indexDir, config);

    print "%d docs in index" % writer.numDocs()

    for d in data:
        rec = d['record']
        if not rec['product_name'] or not rec['uniq_id']:
            logging.info ("Incomplete product ... skipping")
            logging.debug(rec)
            continue
        else:
            doc = Document()
            for k,v in rec.iteritems():
                if k in keys:
                    doc.add(Field(k, v, Field.Store.YES, Field.Index.ANALYZED))
                else:
                    if (k == 'product_specifications'):
                        specs = v['product_specification']
Author: Diwahars, Project: python-scripts-1, Lines: 33, Source: pylucene_index.py

Example 13: IndexFiles

# Required module import: from org.apache.lucene.index import IndexWriter [as alias]
# Or: from org.apache.lucene.index.IndexWriter import numDocs [as alias]
class IndexFiles(object):

    def __init__(self, indexDir):
        if not os.path.exists(indexDir):
            os.mkdir(indexDir)

        store = SimpleFSDirectory(File(indexDir))

        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        
        self.writer = IndexWriter(store, config)

    def index(self, file, duplicates):
        exact = [duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] == 1]
        near = [duplicate['duplicate'] for duplicate in duplicates if duplicate['sim'] < 1]

        with open(file) as file:
            for document in file:
                data = json.loads(document)
                if (data['url'] in exact):
                    continue

                doc = self.createDoc(data['url'], data['html'], data['url'] in near)
                self.writer.addDocument(doc)
                store_outlinks(data['url'], data['outlinks'])

        self.writer.commit()

        return self.writer.numDocs()

    def createDoc(self, url, html, duplicate):
        title, contents = self.parseHtml(url, html)

        doc = Document()
        doc.add(StringField("title", title, Field.Store.YES))
        doc.add(StringField("url", url, Field.Store.YES))
        doc.add(StringField("duplicate", str(duplicate).lower(), Field.Store.YES))

        if len(contents) > 0:
            doc.add(TextField("contents", contents, Field.Store.YES))
        else:
            print "Warning: No content in %s" % url

        return doc

    def close(self):
        self.writer.close()

    def parseHtml(self, url, html):
        soup = BeautifulSoup(html, 'lxml')
        title = self.getTitle(url, soup)
        body = self.getBody(soup)

        return title, body

    def getTitle(self, url, soup):
        if soup.title:
            title = soup.title.get_text().strip()
        elif soup.find("h1"):
            title = " ".join(soup.find("h1").get_text().split())
        else:
            title = url.split("/")[-1]

        return title

    def getBody(self, soup):
        comments = soup.findAll(text=lambda text:isinstance(text, Comment))
        [comment.extract() for comment in comments]
        [style.decompose() for style in soup.find_all('style')]
        [script.decompose() for script in soup.find_all('script')]

        if soup.body:
            return soup.body.get_text(" ", strip=True)
        else:
            return soup.get_text(" ", strip=True)
Author: thoughts1053, Project: search, Lines: 80, Source: indexer.py
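
A hypothetical way to drive the IndexFiles class above; the file path and the duplicates structure are assumptions inferred from how index() reads them (one JSON object per line, each with 'url', 'html', and 'outlinks' keys). Note that index() also relies on a store_outlinks() helper that is not shown in the excerpt.

indexer = IndexFiles("crawl_index/")                              # assumed index directory
duplicates = [{'duplicate': 'http://example.com/a', 'sim': 1},    # exact duplicate: skipped
              {'duplicate': 'http://example.com/b', 'sim': 0.9}]  # near duplicate: flagged
num_indexed = indexer.index("pages.jsonl", duplicates)            # returns writer.numDocs()
print "%d docs in index" % num_indexed
indexer.close()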


Note: The org.apache.lucene.index.IndexWriter.numDocs method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.