This article collects typical usage examples of the Python class org.apache.lucene.document.FieldType (used from PyLucene). If you are wondering what FieldType is for, how to use it, or what real code that uses it looks like, the curated class examples below should help.
Below are 15 code examples of the FieldType class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
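Most of the examples below were written against the Lucene 4.x API, where FieldType.setIndexed(bool) still existed; in Lucene 5+ that setter was removed and indexing is controlled through the IndexOptions enum instead. As a reference point, here is a minimal sketch of building a custom FieldType the newer way, assuming a recent PyLucene (6+); the field and value names are illustrative:

import lucene
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM()

t = FieldType()
t.setStored(True)                                # keep the original value retrievable
t.setTokenized(True)                             # run the value through the analyzer
t.setIndexOptions(IndexOptions.DOCS_AND_FREQS)   # index documents and term frequencies
t.freeze()                                       # make the type immutable before use

doc = Document()
doc.add(Field("body", "some analyzed, stored text", t))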
Example 1: run
def run(self):
    print "Booting lucene driver worker...."
    lucene.initVM()
    # indexed and tokenized, but not stored
    self.fieldType1 = FieldType()
    self.fieldType1.setIndexed(True)
    self.fieldType1.setStored(False)
    self.fieldType1.setTokenized(True)
    # indexed and stored as a single term
    self.fieldType2 = FieldType()
    self.fieldType2.setIndexed(True)
    self.fieldType2.setStored(True)
    self.fieldType2.setTokenized(False)
    while True:
        data = self.queue.get()
        da = data[1]
        response = None
        try:
            self.fil = File(da['data']['indexdir'])
            self.d = NIOFSDirectory(self.fil)
            self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
            self.conf = IndexWriterConfig(
                Version.LUCENE_CURRENT,
                self.analyzer)
            # dispatch to the method named by the request
            response = getattr(self, da['action'])(da['data'])
            self.d.close()
        except Exception as e:
            print e
        if response is None:
            response = {}
        self.ret[data[0]] = response
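For context, a hypothetical producer for this worker might look as follows; the "index" action name, the worker variable, and the request id are assumptions inferred from the handler's getattr dispatch and its queue/ret protocol:

worker.queue.put(("req-1", {
    "action": "index",                          # resolved via getattr(self, da['action'])
    "data": {"indexdir": "/tmp/lucene-index"},  # opened as an NIOFSDirectory
}))
# after the worker has processed the request, the response is keyed by id:
result = worker.ret["req-1"]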
Example 2: index
def index(self, root):
    t = FieldType()
    t.setIndexed(True)
    t.setStored(True)
    t.setTokenized(True)
    t.setStoreTermVectors(True)
    for path, dirs, files in os.walk(root):
        for file in files:
            filePath = os.path.join(path, file)
            fd = open(filePath)
            content = unicode(fd.read(), 'iso-8859-1')
            fd.close()
            doc = Document()
            doc.add(Field('name', file, StringField.TYPE_STORED))
            parent = os.path.split(path)[1]
            doc.add(Field('parent', parent, StringField.TYPE_STORED))
            if len(content) > 0:
                doc.add(Field('content', content, t))
            print 'Indexing %s' % file
            self.mWriter.addDocument(doc)
    self.mWriter.commit()
    self.mWriter.close()
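The example relies on self.mWriter being an IndexWriter created elsewhere. A minimal sketch of constructing one against the same (pre-5.0, Version-based) API generation, with an assumed index_dir path:

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
mWriter = IndexWriter(SimpleFSDirectory(File(index_dir)), config)  # becomes self.mWriter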
Example 3: create_field_types
def create_field_types(self):
    """ Create the field types that will be used to specify
    what actions lucene should take on the various fields
    supplied to index.
    """
    self.field_clean = FieldType()
    self.field_clean.setIndexed(True)
    self.field_clean.setStored(True)
    self.field_clean.setTokenized(False)
    self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    self.field_dirty = FieldType()
    self.field_dirty.setIndexed(True)
    self.field_dirty.setStored(False)
    self.field_dirty.setTokenized(True)
    self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
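A hedged sketch of how these two types might then be applied when indexing (the field names, values, and writer are illustrative): field_clean suits exact-match values that must be retrievable, field_dirty suits analyzed free text that is searchable but not stored:

doc = Document()
doc.add(Field("doc_id", raw_id, self.field_clean))   # exact-match, stored
doc.add(Field("body", raw_text, self.field_dirty))   # analyzed, searchable only
writer.addDocument(doc)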
Example 4: __init__
def __init__(self):
    """Init possible field types."""
    # FIELD_ID: stored, indexed, non-tokenized
    self.field_id = FieldType()
    self.field_id.setIndexed(True)
    self.field_id.setStored(True)
    self.field_id.setTokenized(False)

    # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
    # for storing IDs with term vector info
    self.field_id_tv = FieldType()
    self.field_id_tv.setIndexed(True)
    self.field_id_tv.setStored(True)
    self.field_id_tv.setTokenized(False)
    self.field_id_tv.setStoreTermVectors(True)

    # FIELD_TEXT: stored, indexed, tokenized, with positions
    self.field_text = FieldType()
    self.field_text.setIndexed(True)
    self.field_text.setStored(True)
    self.field_text.setTokenized(True)

    # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
    self.field_text_tv = FieldType()
    self.field_text_tv.setIndexed(True)
    self.field_text_tv.setStored(True)
    self.field_text_tv.setTokenized(True)
    self.field_text_tv.setStoreTermVectors(True)

    # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
    # (but no character offsets)
    self.field_text_tvp = FieldType()
    self.field_text_tvp.setIndexed(True)
    self.field_text_tvp.setStored(True)
    self.field_text_tvp.setTokenized(True)
    self.field_text_tvp.setStoreTermVectors(True)
    self.field_text_tvp.setStoreTermVectorPositions(True)

    # FIELD_TEXT_NTV: not stored, indexed, tokenized, with term vectors (without positions)
    self.field_text_ntv = FieldType()
    self.field_text_ntv.setIndexed(True)
    self.field_text_ntv.setStored(False)
    self.field_text_ntv.setTokenized(True)
    self.field_text_ntv.setStoreTermVectors(True)

    # FIELD_TEXT_NTVP: not stored, indexed, tokenized, with term vectors and positions
    # (but no character offsets)
    self.field_text_ntvp = FieldType()
    self.field_text_ntvp.setIndexed(True)
    self.field_text_ntvp.setStored(False)
    self.field_text_ntvp.setTokenized(True)
    self.field_text_ntvp.setStoreTermVectors(True)
    self.field_text_ntvp.setStoreTermVectorPositions(True)
Example 5: testBinaryFieldInIndex
def testBinaryFieldInIndex(self):
    ft = FieldType()
    ft.setStored(True)

    bytes = JArray('byte')(self.binaryValStored)
    binaryFldStored = StoredField("binaryStored", bytes)
    stringFldStored = Field("stringStored", self.binaryValStored, ft)

    doc = Document()
    doc.add(binaryFldStored)
    doc.add(stringFldStored)

    # test for field count
    self.assertEqual(2, doc.fields.size())

    # add the doc to a ram index
    writer = self.getWriter(analyzer=StandardAnalyzer())
    writer.addDocument(doc)
    writer.close()

    # open a reader and fetch the document
    reader = self.getReader()
    docFromReader = reader.document(0)
    self.assertTrue(docFromReader is not None)

    # fetch the binary stored field and compare its content with the
    # original one
    bytes = docFromReader.getBinaryValue("binaryStored")
    binaryFldStoredTest = bytes.bytes.bytes_
    self.assertEqual(binaryFldStoredTest, self.binaryValStored)

    # fetch the string field and compare its content with the original
    # one
    stringFldStoredTest = docFromReader.get("stringStored")
    self.assertEqual(stringFldStoredTest, self.binaryValStored.decode())

    reader.close()
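The JArray('byte') call above is the JCC/PyLucene way of turning a Python byte string into a Java byte[], which is what StoredField expects for binary content. A standalone sketch of that conversion (values illustrative):

from lucene import JArray
payload = JArray('byte')('\x00\x01\x02')     # Python byte string -> Java byte[]
blob_field = StoredField("blob", payload)    # stored verbatim, not indexed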
Example 6: setUp
def setUp(self):
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()
    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
Example 7: setUp
def setUp(self):
    # Variant of Example 6 written against the older, Version-based Lucene API.
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setStored(False)
    ftype.setIndexed(True)
    ftype.setStoreTermVectors(True)
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
Example 8: lazyImport
def lazyImport():
    global imported
    if imported:
        return

    from meresco.pylucene import getJVM
    getJVM()

    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer

    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)

    imported = True
    globals().update(locals())
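A hypothetical use of UNINDEXED_TYPE after lazyImport() has run: with IndexOptions.NONE the value is stored for retrieval but never indexed, so it cannot be matched by queries (the document and value are illustrative):

lazyImport()
doc = Document()
doc.add(Field("original_record", "<record>...</record>", UNINDEXED_TYPE))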
Example 9: index_docs
def index_docs(self, train_set, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for ii in train_set:
        doc = Document()
        doc.add(Field("answer", ii['Answer'], t1))
        doc.add(Field("qid", ii['Question ID'], t1))
        doc.add(Field("category", ii['category'], t1))
        doc.add(Field("position", ii['Sentence Position'], t1))
        doc.add(Field("question", ii['Question Text'], t2))
        doc.add(Field("wiki_plain",
                      self.wiki_reader.get_text(ii['Answer']), t2))
        writer.addDocument(doc)
Example 10: indexTable
def indexTable(self, writer):
    # connection
    con = None
    # define the index of all the fields
    # ---------step 2: connect to mysql----------
    con = mdb.connect('localhost', 'root', 'testgce', 'douban_movie_v3')

    # t_num = FieldType.NumericType  # it is wrong!!
    t_num = FieldType()
    t_num.setStored(False)

    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    maxDict = utils.maxDict
    # range for the document boost value
    base = DOC_BOOST_RANGE[0]
    upper = DOC_BOOST_RANGE[1]

    with con:
        # Careful with codecs
        con.set_character_set('utf8')
        cur = con.cursor()
        # Again the codecs
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        # ------step 3: choose the right table------
        cur.execute("SELECT * FROM movie_items")
        numrows = int(cur.rowcount)
        print 'numrows:', numrows
        for i in range(numrows):
            print
            row = cur.fetchone()

            # ------step 4: index your fields------
            summary = row[SUMMARY]
            subject_id = row[SUBJECT_ID]
            print 'id' + subject_id

            year = utils.formatYear(row[YEAR])
            wtfFile = open('wtf.txt', 'a')  # opened before the try so the except clause can use it
            try:
                date = DateTools.stringToDate(year.replace('-', ' '))
                dateStr = DateTools.dateToString(date, DateTools.Resolution.DAY)
            except:
                wtfFile.write(year + '\n')

            doc = Document()
            # boosting
            boostProb = utils.calcBoostProb(row, maxDict, dateStr)
            boost = base + boostProb * (upper - base)

            doc.add(FloatField("boost", boost, Field.Store.YES))
            doc.add(StringField("year", dateStr, Field.Store.YES))
            print 'dateStr:' + dateStr

            # A text field is a sequence of terms that has been tokenized, while a
            # string field is a single term (although it can also be multivalued).
            do_count = row[DO_COUNT] if row[DO_COUNT] is not None else 0
            wish_count = row[WISH_COUNT] if row[WISH_COUNT] is not None else 0

            # fields which should not be analyzed
            doc.add(FloatField("rating_average", float(row[RATING_AVERAGE]), Field.Store.YES))
            doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES))
            doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES))
            # doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost))
            doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES))
            doc.add(IntField("do_count", int(do_count), Field.Store.YES))
            doc.add(IntField("wish_count", int(wish_count), Field.Store.YES))
            doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES))
            doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES))
            doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES))
            doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES))
            # fields which should be analyzed with WhitespaceAnalyzer
            # attention!!! don't use a long sentence like:
            # ......... part of the code is omitted here .........
Example 11: indexDocs
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        # traverse through the doc directory
        for filename in filenames:
            # only if this file ends with '.c'
            if not filename.endswith('.c'):
                continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(root, filename)
                print "adding file : ", path
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                # if len(contents) > 0:
                #     doc.add(Field("contents", contents, t2))
                # else:
                #     print "warning: no content in ", filename
                writer.addDocument(doc)
            except Exception, e:
                print "failed in indexDocs:", e
Example 12: indexDocs
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'iso-8859-1')
                file.close()
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example 13: LuceneDocumentField
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Init possible field types"""
        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")
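A hedged usage sketch of this handler; the Lucene.FIELDTYPE_* constants and the content value are assumed to be defined by the surrounding codebase:

fields = LuceneDocumentField()
doc = Document()
doc.add(Field("content", content_text,
              fields.get_field(Lucene.FIELDTYPE_TEXT_TVP)))  # tokenized + term vectors + positions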
Example 14: indexDocs
def indexDocs(root, writer):
    """
    indexed: name, title, content
    stored: date, name, title, summary
    :param root:
    :param writer:
    :return:
    """
    # indexed and stored
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # indexed only, not stored
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # stored only
    t3 = FieldType()
    t3.setIndexed(False)
    t3.setStored(True)
    t3.setTokenized(False)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    for root, dirnames, filenames in os.walk(root):
        print filenames
        for filename in filenames:
            if not filename.endswith('.md'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()
                date, name = get_date_name(filename)
                title, content = get_post_title_content(contents)
                summary = content[:200] if content else ''
                print date, name, title
                doc = Document()
                doc.add(Field('date', date, t3))
                doc.add(Field('name', name, t1))
                doc.add(Field('title', title, t1))
                doc.add(Field('content', content, t2))
                doc.add(Field('summary', summary, t3))
                # doc.add(Field("name", filename, t1))
                # doc.add(Field("path", root, t1))
                # if len(contents) > 0:
                #     doc.add(Field("contents", contents, t2))
                # else:
                #     print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
Example 15: indexDocs
def indexDocs(self, url, writer):
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(True)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    # Read feeds
    feeds = feedparser.parse(url)
    for item in feeds["entries"]:
        print "adding", item["title"]
        try:
            link = item["link"]
            contents = item["description"].encode("utf-8")
            contents = re.sub('<[^<]+?>', '', ''.join(contents))
            title = item["title"]
            doc = Document()
            doc.add(Field("url", link, type1))
            doc.add(Field("title", title, type1))
            if len(contents) > 0:
                doc.add(Field("contents", contents, type2))
            else:
                print "warning: no content in %s" % item["title"]
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e