当前位置: 首页>>代码示例>>Python>>正文


Python Document.add方法代码示例

本文整理汇总了Python中org.apache.lucene.document.Document.add方法的典型用法代码示例。如果您正苦于以下问题:Python Document.add方法的具体用法?Python Document.add怎么用?Python Document.add使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.lucene.document.Document的用法示例。


在下文中一共展示了Document.add方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: addDocumentToIndex

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
    def addDocumentToIndex(self, title, text):
        """Add one wiki page to the index as a document with stored,
        analyzed "Title" and "Text" fields."""
        page = Document()
        for field_name, field_value in (("Title", title), ("Text", text)):
            page.add(Field(field_name, field_value,
                           Field.Store.YES, Field.Index.ANALYZED))
        self.writer.addDocument(page)
开发者ID:abell25,项目名称:TestTaker,代码行数:9,代码来源:WikiPageIndex.py

示例2: create_index

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
def create_index():
    """Build a Lucene index of Wikipedia article text under
    prm.index_folder, one document per article with analyzed, stored
    "text" and "id" fields.  Any pre-existing index folder is deleted
    first.

    Relies on module-level names not visible here: prm (configuration)
    and wiki (corpus reader) -- confirm their contract at the caller.
    """
    lucene.initVM()
    # start from a clean slate: remove any previous index on disk
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."
    n = 0  # running article count; doubles as the document "id" value
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        # periodic progress report for long runs
        if n % 100000 == 0:
            print 'indexing article', n
    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
开发者ID:domarps,项目名称:WebNav,代码行数:27,代码来源:lucene_search.py

示例3: index_docs

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
    def index_docs(self, tweets, writer):
        """Index the text of each tweet into *writer*, skipping URLs.

        :param tweets: iterable of pairs where tweet[1].text holds the
            tweet body -- presumably (id, status) tuples; confirm at caller
        :param writer: Lucene IndexWriter that receives the documents
        """
        # one shared field type: indexed, stored, tokenized, positions
        # recorded, and term vectors with offsets (e.g. for highlighting)
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                # skip tweets that consisted of nothing but URLs
                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                # NOTE(review): broad catch keeps one bad tweet from
                # aborting the batch; the failure is only printed, not
                # logged or re-raised -- confirm this is intended
                print "Failed in index_docs:", e
开发者ID:ryancutter,项目名称:OnlyWorthy,代码行数:29,代码来源:onlyworthy_dev.py

示例4: indexDictionary

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
def indexDictionary(d, writer):
    """Index every (filename, content) pair of *d* into *writer* and
    return the document count the writer reports afterwards."""
    for filename, content in d.iteritems():
        entry = Document()
        # filename is a lookup key, so it is stored but not analyzed
        entry.add(Field('filename', filename, Field.Store.YES, Field.Index.NOT_ANALYZED))
        entry.add(Field('content', content, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(entry)
    return writer.numDocs()
开发者ID:ffuuugor,项目名称:deepHackQA,代码行数:9,代码来源:mlucene.py

示例5: addDoc

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
    def addDoc(self, writer, text):
        """Wrap *text* in a one-field stored document and add it to *writer*."""
        document = Document()
        document.add(Field(self.FIELD_NAME, text, TextField.TYPE_STORED))
        writer.addDocument(document)
开发者ID:devs4v,项目名称:devs4v-information-retrieval15,代码行数:9,代码来源:test_Highlighter.py

示例6: addDocument

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
    def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
        """
            Add a document to the index. Does this using direct Lucene access.

            :param writer: Lucene IndexWriter that receives the document
            :param new_doc: dict of fields with values
            :type new_doc:dict
            :param metadata: ditto; must contain a "year" key
            :type metadata:dict
            :param fields_to_process: only add these fields from the doc dict
            :type fields_to_process:list
            :param bow_info: bag-of-words stats; "total_numterms" drives boosting
            :type bow_info:dict
        """
        doc = Document()
        total_numTerms=bow_info["total_numterms"]
        # each BOW now comes with its field
        for field in fields_to_process:
            field_object=Field(field, new_doc[field], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)
##            boost=math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            # uniform down-weight by 1/sqrt(total term count); 0 guards division by zero
            boost=1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
            field_object.setBoost(float(boost))
            doc.add(field_object)

        json_metadata=json.dumps(metadata)
        # NOTE(review): `guid` is neither a parameter nor a local of this
        # method -- it resolves to a global or raises NameError.  Possibly
        # metadata["guid"] was intended; confirm before changing.
        doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
        doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
        doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
开发者ID:danieldmm,项目名称:minerva,代码行数:29,代码来源:lucene_index.py

示例7: testCompressionTools

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
    def testCompressionTools(self):
        """Round-trip a binary value and a string value through Lucene's
        CompressionTools via stored fields and verify both decompress
        back to the original."""
        raw = JArray('byte')(self.binaryValCompressed)
        binary_field = StoredField("binaryCompressed", CompressionTools.compress(raw))
        string_field = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed))

        doc = Document()
        doc.add(binary_field)
        doc.add(string_field)

        # write the single document to the index
        writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
        writer.addDocument(doc)
        writer.close()

        # read the document back
        reader = self.getReader()
        stored = reader.document(0)
        self.assert_(stored is not None)

        # the decompressed binary field must equal the original value
        decompressed = CompressionTools.decompress(stored.getBinaryValue("binaryCompressed"))
        self.assertEqual(decompressed.string_, self.binaryValCompressed)
        # ... and so must the decompressed string field
        self.assertEqual(CompressionTools.decompressString(stored.getBinaryValue("stringCompressed")), self.binaryValCompressed)

        reader.close()
开发者ID:devs4v,项目名称:devs4v-information-retrieval15,代码行数:30,代码来源:test_BinaryDocument.py

示例8: create_index

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
def create_index(storage, paths) :
	lucene.initVM()
	indexDir = SimpleFSDirectory(File(storage))
	stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
	for s in stopwords :
		stops.add(s)
	analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
	writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
	writer = IndexWriter(indexDir, writerConfig)

	print "%d docs in index" % writer.numDocs()
	print "Reading Documents"

	import os
	for path in paths :
		for filen in os.listdir(path) :
			text = sent_tokenize(get_data_from_file(path + filen))
			total_sent = len(text)
			for i in range(0, total_sent, 3) :
				doc = Document()
				a = i-5 if i-5 > 0 else 0
				sentence = ' '.join(text[a:i+5])
				doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
				writer.addDocument(doc)
			print("Done %s" % (path+filen))
			print "Indexed (%d docs in index)" % (writer.numDocs())
	print "Closing index of %d docs..." % writer.numDocs()
	writer.close()
开发者ID:successar,项目名称:Lucene-QA,代码行数:30,代码来源:sentence_indexer.py

示例9: rewrite

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
	def rewrite(data_string):
		"""Replace the stored document encoded in *data_string* (JSON)
		with a copy patched by the closure-level *update* JSON.

		Returns 101 when a primary-key field is missing from the patched
		data, 106 when the update would collide with an existing
		document's primary keys; falls through (None) on success.

		NOTE(review): depends on names from the enclosing scope that are
		not visible in this chunk -- update, primary_keys_map, analyzer,
		writer, searcher, MAX_RESULTS, add_field_if_not_exists,
		to_be_compressed_input -- confirm their definitions there.
		"""
		data=json.loads(data_string)
		toupdate=json.loads(update)

		# build a query matching the current document on all primary keys;
		# used below to delete it before re-adding the modified version
		query=BooleanQuery()
		for key in primary_keys_map:
			temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
			query.add(BooleanClause(temp,BooleanClause.Occur.MUST))

		# apply the updates; keys absent from data are added only when
		# add_field_if_not_exists is enabled (the default), else ignored
		for key,value in toupdate.items():
			if add_field_if_not_exists==False:
				if key in data.keys():
					data[key]=value
			else:
				data[key]=value

		# if the update touches any primary key, abort with 106 when a
		# document already exists under the new primary-key values --
		# this is why the deletion is intentionally deferred until here
		primary_key_update=False
		for key in toupdate.keys():
			if key in primary_keys_map:
				primary_key_update=True
				break
		if primary_key_update == True:
			query_search=BooleanQuery()
			for key in primary_keys_map:
				temp=QueryParser(Version.LUCENE_CURRENT,key,analyzer).parse(data[key])
				query_search.add(BooleanClause(temp,BooleanClause.Occur.MUST))
			hits=searcher.search(query_search,MAX_RESULTS).scoreDocs
			if len(hits) > 0:
				return 106
		writer.deleteDocuments(query)

		# re-index the modified document, keyed on its primary-key fields
		doc=Document()
		for primary_key in primary_keys_map:
			try:
				field=Field(primary_key,data[primary_key],Field.Store.NO,Field.Index.ANALYZED)
				doc.add(field)
			except:
				# NOTE(review): bare except -- presumably guards a missing
				# primary-key field (KeyError); 101 signals that failure
				return 101
		# store the full record base64-encoded, snappy-compressed when
		# compression is switched on
		if to_be_compressed_input==True:
			temp=json.dumps(data)
			data_string=base64.b64encode(snappy.compress(temp))
		else:
			temp=json.dumps(data)
			data_string=base64.b64encode(temp)

		field=Field("$DATA$",data_string,Field.Store.YES,Field.Index.ANALYZED)
		doc.add(field)
		writer.addDocument(doc)
开发者ID:NitinJamadagni,项目名称:Mini-NoSQL-Database,代码行数:61,代码来源:Handler_callable.py

示例10: _addDocument

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
 def _addDocument(self, identifier, isformatof, sort=None):
     """Add a document carrying optional numeric doc-values fields,
     then commit so it is immediately visible."""
     doc = Document()
     for field_name, value in (("__isformatof__", isformatof), ("__sort__", sort)):
         if value:
             doc.add(NumericDocValuesField(field_name, long(value)))
     consume(self.lucene.addDocument(identifier, doc))
     # commit explicitly; not strictly required since commitCount=1
     self.lucene.commit()
开发者ID:jerryba,项目名称:meresco-lucene,代码行数:10,代码来源:dedupfiltercollectortest.py

示例11: create_document

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
 def create_document(self, contents):
     """Build a Lucene Document from *contents*: a list of field specs,
     each a dict with keys 'field_name', 'field_type', and
     'field_value'."""
     doc = Document()
     for spec in contents:
         lucene_type = self.ldf.get_field(spec['field_type'])
         doc.add(Field(spec['field_name'], spec['field_value'], lucene_type))
     return doc
开发者ID:renespeck,项目名称:TAGME_Reproducibility,代码行数:11,代码来源:lucene_tools.py

示例12: index_text

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
 def index_text(self, sha1, full_text):
     """Map *full_text* to its source *sha1* in the index; empty text is
     logged and skipped."""
     if not full_text:
         logging.info("No text for sha1 %s", sha1)
         return
     document = Document()
     document.add(Field("sha1", sha1, ImageIndexer.hash_field))
     document.add(Field("full_text", full_text, ImageIndexer.text_field))
     self.writer.updateDocument(Term("sha1", sha1), document)
开发者ID:BitCurator,项目名称:bca-webtools,代码行数:11,代码来源:text_indexer.py

示例13: indexer

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
开发者ID:arrudamichel,项目名称:Systems-Engineering,代码行数:13,代码来源:Indexer.py

示例14: setUp

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
    def setUp(self):
        """Index one five-word document, then prepare the searcher and a
        PhraseQuery builder used by the test cases."""
        super(PhraseQueryTestCase, self).setUp()

        writer = self.getWriter()
        doc = Document()
        doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
        self.builder = PhraseQuery.Builder()
开发者ID:svn2github,项目名称:pylucene,代码行数:13,代码来源:test_PhraseQuery.py

示例15: dummyIndex

# 需要导入模块: from org.apache.lucene.document import Document [as 别名]
# 或者: from org.apache.lucene.document.Document import add [as 别名]
 def dummyIndex(self):
     """Seed a freshly created index with a single placeholder document
     so later update operations never run against a missing index."""
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
     writer = IndexWriter(self.indexDir, config)
     placeholder = Document()
     placeholder.add(Field('uid', 'dummy', StringField.TYPE_STORED))
     writer.addDocument(placeholder)
     writer.commit()
     writer.close()
     return
开发者ID:andersardo,项目名称:gedMerge,代码行数:15,代码来源:luceneDB.py


注:本文中的org.apache.lucene.document.Document.add方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。