本文整理汇总了Python中org.apache.lucene.document.FieldType.setStoreTermVectorPositions方法的典型用法代码示例。如果您正苦于以下问题:Python FieldType.setStoreTermVectorPositions方法的具体用法?Python FieldType.setStoreTermVectorPositions怎么用?Python FieldType.setStoreTermVectorPositions使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.document.FieldType
的用法示例。
在下文中一共展示了FieldType.setStoreTermVectorPositions方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
def index_docs(self, train_set, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for ii in train_set:
doc = Document()
doc.add(Field("answer", ii['Answer'], t1))
doc.add(Field("qid", ii['Question ID'], t1))
doc.add(Field("category", ii['category'], t1))
doc.add(Field("position", ii['Sentence Position'], t1))
doc.add(Field("question", ii['Question Text'], t2))
doc.add(Field("wiki_plain",
self.wiki_reader.get_text(ii['Answer']), t2))
writer.addDocument(doc)
示例2: LuceneDocumentField
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
class LuceneDocumentField(object):
"""Internal handler class for possible field types"""
def __init__(self):
"""Init possible field types"""
# FIELD_ID: stored, indexed, non-tokenized
self.field_id = FieldType()
self.field_id.setIndexed(True)
self.field_id.setStored(True)
self.field_id.setTokenized(False)
# FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
# for storing IDs with term vector info
self.field_id_tv = FieldType()
self.field_id_tv.setIndexed(True)
self.field_id_tv.setStored(True)
self.field_id_tv.setTokenized(False)
self.field_id_tv.setStoreTermVectors(True)
# FIELD_TEXT: stored, indexed, tokenized, with positions
self.field_text = FieldType()
self.field_text.setIndexed(True)
self.field_text.setStored(True)
self.field_text.setTokenized(True)
# FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
self.field_text_tv = FieldType()
self.field_text_tv.setIndexed(True)
self.field_text_tv.setStored(True)
self.field_text_tv.setTokenized(True)
self.field_text_tv.setStoreTermVectors(True)
# FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
# (but no character offsets)
self.field_text_tvp = FieldType()
self.field_text_tvp.setIndexed(True)
self.field_text_tvp.setStored(True)
self.field_text_tvp.setTokenized(True)
self.field_text_tvp.setStoreTermVectors(True)
self.field_text_tvp.setStoreTermVectorPositions(True)
def get_field(self, type):
"""Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
if type == Lucene.FIELDTYPE_ID:
return self.field_id
elif type == Lucene.FIELDTYPE_ID_TV:
return self.field_id_tv
elif type == Lucene.FIELDTYPE_TEXT:
return self.field_text
elif type == Lucene.FIELDTYPE_TEXT_TV:
return self.field_text_tv
elif type == Lucene.FIELDTYPE_TEXT_TVP:
return self.field_text_tvp
else:
raise Exception("Unknown field type")
示例3: tweetIndexer
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
def tweetIndexer(self, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
x = 0
for i in range(0,500):
if not os.path.isfile("json/tweets-" + str(i) + ".json"):
break
print "adding tweets-" + str(i) + ".json"
tweets = open("json/tweets-" + str(i) + ".json", "r")
for line in tweets.readlines():
tweet = json.loads(line)
if 'limit' in tweet:
continue
try:
doc = Document()
doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
sname = tweet['user']['screen_name']
tid = str(tweet['id'])
text = tweet['text']
uname = tweet['user']['name']
created = tweet['created_at']
tstamp = tweet['timestamp_ms']
place = ""
if tweet['place']:
place = tweet['place']['full_name'] + ", " + tweet['place']['country']
lat = ""
lng = ""
titles = ""
urls = ""
exist = "false"
if tweet['coordinates']:
lat = str(tweet['coordinates']['coordinates'][1])
lng = str(tweet['coordinates']['coordinates'][0])
else:
lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)
if len(tweet['entities']['urls']) != 0:
exist = "true"
for index in range(len(tweet['entities']['urls'])):
title = tweet['entities']['urls'][index]['url_title']
if title == None:
titles += ",-"
else:
title = title.encode('ascii','ignore')
titles += "," + str(title)
urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])
searchable = text + " " + urls + " " + uname + " " + sname + " " + place
doc.add(Field("lookup", searchable, t2))
doc.add(Field("text", text, t2))
doc.add(Field("user_name", uname, t2))
doc.add(Field("screen_name", sname, t2))
doc.add(Field("tweet_id", tid, t2))
doc.add(Field("created_at", created, t2))
doc.add(Field("geo_lat", lat, t2))
doc.add(Field("geo_lng", lng, t2))
doc.add(Field("url_exist", exist, t2))
doc.add(Field("url_url", urls, t2))
doc.add(Field("url_title", titles, t2))
doc.add(Field("timestamp", tstamp, t2))
writer.addDocument(doc)
x += 1
except Exception, e:
pass
tweets.close()
示例4: RAMDirectory
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
from org.apache.lucene.index import \
IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions
if __name__ == '__main__':
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)
ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
ts = ["this bernhard is the text to be index text",
"this claudia is the text to be indexed"]
for t in ts:
doc = Document()
doc.add(Field("fieldname", t, ft))
iwriter.addDocument(doc)
iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)
for doc in xrange(0, len(ts)):
tv = ireader.getTermVector(doc, "fieldname")
示例5: open_searcher
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
return writer
def open_searcher(writer):
from org.apache.lucene.search import IndexSearcher
reader = writer.getReader()
searcher = IndexSearcher(reader)
return reader, searcher
from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import Term
vectorFieldType = FieldType(TextField.TYPE_NOT_STORED)
vectorFieldType.setIndexed(True)
vectorFieldType.setTokenized(True)
vectorFieldType.setStoreTermVectors(True)
vectorFieldType.setStoreTermVectorPositions(False)
writer = open_writer('data/index')
def addToIndex(lxmlNode):
uri = xpathFirst(lxmlNode, '//oa:hasTarget/@rdf:resource')
print uri
seen = set()
doc = Document()
for fieldName in FIELD_NAMES:
seen.clear()
for subpath in [
'', '/*/rdfs:label', '/*/skos:prefLabel', '/*/skos:altLabel',
'/*/dcterms:title', '/*/foaf:name']:
for value in xpath(lxmlNode, '//%(fieldName)s%(subpath)s/text()' % locals()):
if value in seen: