This page collects typical usage examples of the Python org.apache.lucene.document.Document class. If you are wondering what the Document class is for and how to use it, the selected examples below may help.
The following 15 code examples of the Document class are listed, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code samples.
Example 1: addDoc
def addDoc(self, writer, text):
    d = Document()
    f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)
    d.add(f)
    writer.addDocument(d)
Example 2: create_index
def create_index(storage, paths):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    import os
    for path in paths:
        for filen in os.listdir(path):
            text = sent_tokenize(get_data_from_file(path + filen))
            total_sent = len(text)
            # index overlapping windows of sentences, stepping three sentences at a time
            for i in range(0, total_sent, 3):
                doc = Document()
                a = i - 5 if i - 5 > 0 else 0
                sentence = ' '.join(text[a:i + 5])
                doc.add(Field("text", sentence, Field.Store.YES, Field.Index.ANALYZED))
                writer.addDocument(doc)
            print("Done %s" % (path + filen))

    print "Indexed (%d docs in index)" % writer.numDocs()
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
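
For completeness, here is a minimal, hypothetical sketch (not part of the original example) of how an index built this way might be queried with the same Lucene 4.10 PyLucene API; the "text" field name matches the example above, while the query string and result count are placeholders.

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

# open a searcher over the directory written by create_index
reader = DirectoryReader.open(SimpleFSDirectory(File(storage)))
searcher = IndexSearcher(reader)
# parse a free-text query against the "text" field, using the same analyzer
query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse("example query")
for hit in searcher.search(query, 10).scoreDocs:
    print searcher.doc(hit.doc).get("text")
reader.close()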
Example 3: testCompressionTools
def testCompressionTools(self):
    bytes = JArray('byte')(self.binaryValCompressed)
    binaryFldCompressed = StoredField("binaryCompressed", CompressionTools.compress(bytes))
    stringFldCompressed = StoredField("stringCompressed", CompressionTools.compressString(self.binaryValCompressed))

    doc = Document()
    doc.add(binaryFldCompressed)
    doc.add(stringFldCompressed)

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer(Version.LUCENE_CURRENT))
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assert_(docFromReader is not None)

    # fetch the binary compressed field and compare its content with
    # the original one
    bytes = CompressionTools.decompress(docFromReader.getBinaryValue("binaryCompressed"))
    binaryFldCompressedTest = bytes.string_
    self.assertEqual(binaryFldCompressedTest, self.binaryValCompressed)
    self.assertEqual(CompressionTools.decompressString(docFromReader.getBinaryValue("stringCompressed")), self.binaryValCompressed)

    reader.close()
Example 4: create_index
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."

    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n

    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example 5: addDocumentToIndex
def addDocumentToIndex(self, title, text):
    doc = Document()
    doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))
    self.writer.addDocument(doc)
Example 6: indexDictionary
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
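
A small, hypothetical usage sketch (assuming an IndexWriter set up as in the earlier examples): the dictionary maps file names to their text content, and the return value is the writer's document count.

contents = {'a.txt': 'first file content', 'b.txt': 'second file content'}
total_docs = indexDictionary(contents, writer)
print "index now holds %d docs" % total_docs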
Example 7: index_docs
def index_docs(self, tweets, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in index_docs:", e
Example 8: rewrite
def rewrite(data_string):
    data = json.loads(data_string)
    toupdate = json.loads(update)
    #primary_key_modified=False

    # build a query that matches the document to be deleted
    query = BooleanQuery()
    for key in primary_keys_map:
        temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
        query.add(BooleanClause(temp, BooleanClause.Occur.MUST))

    # modify the values
    for key, value in toupdate.items():
        # if the key is not present, either add it to data or ignore it
        # (add_field_if_not_exists defaults to True)
        if add_field_if_not_exists == False:
            if key in data.keys():
                data[key] = value
        else:
            data[key] = value

    # the deletion is intentionally deferred: the update only proceeds if the
    # modified primary keys do not already exist in the index
    primary_key_update = False
    for key in toupdate.keys():
        if key in primary_keys_map:
            primary_key_update = True
            break
    if primary_key_update == True:
        query_search = BooleanQuery()
        for key in primary_keys_map:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    writer.deleteDocuments(query)

    # add the newly modified document
    doc = Document()
    # index fields with respect to the primary keys
    for primary_key in primary_keys_map:
        try:
            field = Field(primary_key, data[primary_key], Field.Store.NO, Field.Index.ANALYZED)
            doc.add(field)
        except:
            # primary_keys_map.pop(collection_name)
            return 101

    # compress data using snappy if compression is on
    if to_be_compressed_input == True:
        temp = json.dumps(data)
        data_string = base64.b64encode(snappy.compress(temp))
    else:
        temp = json.dumps(data)
        data_string = base64.b64encode(temp)

    field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
    doc.add(field)
    writer.addDocument(doc)
Example 9: _addDocument
def _addDocument(self, identifier, isformatof, sort=None):
    doc = Document()
    if isformatof:
        doc.add(NumericDocValuesField("__isformatof__", long(isformatof)))
    if sort:
        doc.add(NumericDocValuesField("__sort__", long(sort)))
    consume(self.lucene.addDocument(identifier, doc))
    self.lucene.commit()  # Explicitly, not required: since commitCount=1.
Example 10: index_text
def index_text(self, sha1, full_text):
    """Index the full text and map it to the source sha1."""
    document = Document()
    document.add(Field("sha1", sha1, ImageIndexer.hash_field))
    if full_text:
        document.add(Field("full_text", full_text, ImageIndexer.text_field))
        self.writer.updateDocument(Term("sha1", sha1), document)
    else:
        logging.info("No text for sha1 %s", sha1)
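
Assuming ImageIndexer.hash_field indexes the sha1 verbatim, which the updateDocument(Term("sha1", ...), ...) call above suggests, a later lookup by hash might look like this hypothetical sketch (the searcher is assumed to be open on the same index):

from org.apache.lucene.index import Term
from org.apache.lucene.search import TermQuery

# exact-match lookup on the hash field
hits = searcher.search(TermQuery(Term("sha1", sha1)), 1).scoreDocs
if hits:
    # reading full_text back requires ImageIndexer.text_field to be a stored field
    print searcher.doc(hits[0].doc).get("full_text")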
Example 11: create_document
def create_document(self, contents):
    """Create a Lucene document from the specified contents.

    Contents is a list of fields to be indexed, represented as a dictionary
    with keys 'field_name', 'field_type', and 'field_value'."""
    doc = Document()
    for f in contents:
        doc.add(Field(f['field_name'], f['field_value'],
                      self.ldf.get_field(f['field_type'])))
    return doc
Example 12: indexDocs
def indexDocs(self, root, writer):
    f = codecs.open('picIndex.txt', 'r', encoding='utf-8')
    picDict = {}
    for line in f.xreadlines():
        ls = line.split('seg^*')
        url = ls[0]
        title = ls[1]
        src = ls[2]
        alt = ls[3]
        picDict[src] = [url, title, alt]
    f.close()

    for src in picDict:
        doc = Document()
        doc.add(Field("src", src,
                      Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("url", picDict[src][0],
                      Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("title", picDict[src][1],
                      Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field("alt", picDict[src][2],
                      Field.Store.YES,
                      Field.Index.ANALYZED))
        writer.addDocument(doc)
Example 13: index
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)

        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue

        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    print "indexed %s docs" % (i+1)
    writer.close()
Example 14: indexer
def indexer(docNumber, docText):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, PorterStemmerAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    doc = Document()
    doc.add(Field("docNumber", docNumber, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("docText", docText, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
Example 15: setUp
def setUp(self):
    super(PhraseQueryTestCase, self).setUp()

    doc = Document()
    doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
    writer = self.getWriter()
    writer.addDocument(doc)
    writer.close()

    self.searcher = self.getSearcher()
    self.builder = PhraseQuery.Builder()
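
A hedged sketch of how a test built on this fixture might use the builder; the terms and the expected hit count are assumptions based on the single document added in setUp, not part of the original example.

from org.apache.lucene.index import Term

def testExactPhrase(self):
    # "one two" appears verbatim in the indexed field, so exactly one document should match
    self.builder.add(Term("field", "one"))
    self.builder.add(Term("field", "two"))
    query = self.builder.build()
    topDocs = self.searcher.search(query, 5)
    self.assertEqual(1, len(topDocs.scoreDocs))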