This article collects typical usage examples of the Document.add method from the Python class org.apache.lucene.document.Document. If you are wondering what Document.add does, how to call it, or simply want to see it in real code, the curated samples below should help. You can also follow up on the containing class, org.apache.lucene.document.Document, for more usage examples.
The following shows 15 code examples of Document.add, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
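Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: create a Document, add one or more Field objects to it, and hand it to an IndexWriter. The index path, analyzer, and field names are placeholders, and the exact constructor signatures vary by Lucene version (4.x expects a Version argument in IndexWriterConfig; 6+ expects a java.nio.file Path for SimpleFSDirectory), so treat this as a sketch rather than version-exact code.

import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
directory = SimpleFSDirectory(File("/tmp/example-index"))        # placeholder path
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))

doc = Document()
doc.add(Field("title", "Hello Lucene", TextField.TYPE_STORED))   # analyzed and stored
doc.add(Field("body", "A minimal Document.add example.", TextField.TYPE_STORED))
writer.addDocument(doc)
writer.close()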
Example 1: addDocumentToIndex
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def addDocumentToIndex(self, title, text):
    doc = Document()
    # store and analyze both fields so they can be searched and retrieved later
    doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))
    self.writer.addDocument(doc)
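As a rough follow-up sketch (not part of the original snippet), documents indexed this way can later be retrieved through an IndexSearcher. The field names match the example above; the directory and analyzer objects are assumed to exist elsewhere, and on Lucene 4.x the QueryParser constructor also takes a Version argument.

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

reader = DirectoryReader.open(directory)            # the same directory the writer used
searcher = IndexSearcher(reader)
query = QueryParser("Text", analyzer).parse("lucene")
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc).get("Title"))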
Example 2: create_index
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."

    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n

    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example 3: index_docs
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def index_docs(self, tweets, writer):
    # field type: indexed, stored, tokenized, with positions and term vectors
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in index_docs:", e
Example 4: indexDictionary
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        # the filename is stored as-is for exact lookup; the content is analyzed for full-text search
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
Example 5: addDoc
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def addDoc(self, writer, text):
    d = Document()
    f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)
    d.add(f)
    writer.addDocument(d)
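TextField.TYPE_STORED analyzes the text and also stores the original value. When a value should instead be kept as a single exact token (an ID or a file name, say), StringField is the usual companion; a small illustrative sketch with made-up field names:

from org.apache.lucene.document import Document, Field, StringField, TextField

d = Document()
d.add(Field("id", "doc-42", StringField.TYPE_STORED))            # untokenized, exact match
d.add(Field("body", "some analyzed full text", TextField.TYPE_STORED))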
Example 6: addDocument
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
    """
    Add a document to the index. Does this using direct Lucene access.

    :param new_doc: dict of fields with values
    :type new_doc: dict
    :param metadata: ditto
    :type metadata: dict
    :param fields_to_process: only add these fields from the doc dict
    :type fields_to_process: list
    """
    doc = Document()
    total_numTerms = bow_info["total_numterms"]

    # each BOW now comes with its field
    for field in fields_to_process:
        field_object = Field(field, new_doc[field], Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)
        ## boost = math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        boost = 1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        field_object.setBoost(float(boost))
        doc.add(field_object)

    json_metadata = json.dumps(metadata)
    # "guid" is expected to be defined in the enclosing scope of the original code (not shown in this snippet)
    doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
    doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
    doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
Example 7: testCompressionTools
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def testCompressionTools(self):
    bytes = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes))
    stringFldCompressed = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a RAM index
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assert_(docFromReader is not None)

    # fetch the binary compressed field and compare its content with the original one
    bytes = CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = bytes.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
    self.assertEqual(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed)
    reader.close()
Example 8: create_index
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # index overlapping windows of sentences, stepping three sentences at a time
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))

    print "Indexed (%d docs in index)" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example 9: rewrite
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def rewrite(data_string):
    data = json.loads(data_string)
    toupdate = json.loads(update)
    #primary_key_modified=False

    # build the query that identifies the document to delete
    query = BooleanQuery()
    for key in primary_keys_map:
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    # modify the values
    for key, value in toupdate.items():
        # if the key is not present, either add and update that key in data, or just ignore it
        # (add_field_if_not_exists defaults to True)
        if add_field_if_not_exists == False:
            if key in data.keys():
                data[key] = value
        else:
            data[key] = value

    # the deletion statement has intentionally been placed after this check:
    # the update only proceeds if the modified data's primary keys do not already exist
    primary_key_update = False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update = True
            break
    if primary_key_update == True:
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    writer.deleteDocuments(query)

    # add the newly modified document
    doc = Document()
    # index fields with respect to the primary keys
    for primary_key in primary_keys_map:
        try:
            field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101

    # compress the data using snappy if compression is on
    if to_be_compressed_input == True:
        temp = json.dumps(data)
        data_string = base64.b64encode(snappy.compress(temp))
    else:
        temp = json.dumps(data)
        data_string = base64.b64encode(temp)
    field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
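In newer Lucene releases BooleanQuery is immutable and assembled through a builder; a roughly equivalent construction of the primary-key query above, assuming such a version and the same surrounding variables, would be:

from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanClause, BooleanQuery

builder = BooleanQuery.Builder()
for key in primary_keys_map:
    term_query = QueryParser(key, analyzer).parse(data[key])
    builder.add(term_query, BooleanClause.Occur.MUST)
query = builder.build()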
Example 10: _addDocument
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def _addDocument(self, identifier, isformatof, sort=None):
    doc = Document()
    if isformatof:
        doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
    if sort:
        doc.add(NumericDocValuesField("__sort__", long(sort)))
    consume(self.lucene.addDocument(identifier, doc))
    self.lucene.commit()  # explicit, not required, since commitCount=1
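A NumericDocValuesField is normally consumed at search time, for example for sorting, rather than returned as a stored value. A hedged usage sketch, reusing the __sort__ field name from above and assuming an IndexSearcher is already open:

from org.apache.lucene.search import MatchAllDocsQuery, Sort, SortField

sort = Sort(SortField("__sort__", SortField.Type.LONG))
topDocs = searcher.search(MatchAllDocsQuery(), 10, sort)   # results ordered by the doc values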
Example 11: create_document
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def create_document(self, contents):
    """Create a Lucene document from the specified contents.

    Contents is a list of fields to be indexed, each represented as a dictionary
    with keys 'field_name', 'field_type', and 'field_value'."""
    doc = Document()
    for f in contents:
        doc.add(Field(f['field_name'], f['field_value'],
                      self.ldf.get_field(f['field_type'])))
    return doc
Example 12: index_text
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def index_text(self, sha1, full_text):
    """Index the full text and map it to the source sha1."""
    document = Document()
    document.add(Field("sha1", sha1, ImageIndexer.hash_field))
    if full_text:
        document.add(Field("full_text", full_text, ImageIndexer.text_field))
        self.writer.updateDocument(Term("sha1", sha1), document)
    else:
        logging.info("No text for sha1 %s", sha1)
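updateDocument(Term, doc) atomically deletes any existing documents whose sha1 term matches and then adds the new one, which is why the sha1 field doubles as the key here. A standalone sketch of the same pattern with illustrative values (the writer is assumed to be open elsewhere):

from org.apache.lucene.document import Document, Field, StringField, TextField
from org.apache.lucene.index import Term

doc = Document()
doc.add(Field("sha1", "abc123", StringField.TYPE_STORED))          # exact-match key field
doc.add(Field("full_text", "extracted text ...", TextField.TYPE_NOT_STORED))
writer.updateDocument(Term("sha1", "abc123"), doc)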
Example 13: indexer
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example 14: setUp
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def setUp(self):
    super(PhraseQueryTestCase, self).setUp()

    doc = Document()
    doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
    writer = self.getWriter()
    writer.addDocument(doc)
    writer.close()

    self.searcher = self.getSearcher()
    self.builder = PhraseQuery.Builder()
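To complete the picture, a sketch of how the builder prepared in setUp is typically used later in such a test; the method name and terms are illustrative, matching the sentence indexed above:

from org.apache.lucene.index import Term

def testPhrase(self):                                   # hypothetical test method
    self.builder.add(Term("field", "one"))
    self.builder.add(Term("field", "two"))
    query = self.builder.build()                        # phrase query for "one two"
    hits = self.searcher.search(query, 10).scoreDocs
    self.assertEqual(1, len(hits))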
Example 15: dummyIndex
# Module to import: from org.apache.lucene.document import Document [as alias]
# Or: from org.apache.lucene.document.Document import add [as alias]
def dummyIndex(self):
    """
    Create a dummy index - to avoid problems updating it
    """
    config = IndexWriterConfig(self.analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(self.indexDir, config)

    doc = Document()
    doc.add(Field('uid', 'dummy', StringField.TYPE_STORED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()
    return