This article collects typical usage examples of the Python method org.apache.lucene.document.FieldType.setStoreTermVectors (from the PyLucene bindings). If you have been wondering what FieldType.setStoreTermVectors does, how to call it, or what real code using it looks like, the curated examples below should help. You can also explore the enclosing class, org.apache.lucene.document.FieldType, further.
The following 9 code examples of FieldType.setStoreTermVectors are sorted by popularity by default.
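As background for the examples: setStoreTermVectors(True) makes Lucene store a per-document term vector for the field, which can later be read back with IndexReader.getTermVector. Here is a minimal read-side sketch, assuming a Lucene 5+ PyLucene build (4.x builds pass a reuse argument to iterator()), an already-populated `directory`, and a field named "contents" indexed with term vectors; these names are assumptions for illustration, not taken from the examples below:

from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.util import BytesRefIterator

reader = DirectoryReader.open(directory)
# Term vector of document 0's "contents" field; None if the field
# was not indexed with setStoreTermVectors(True).
terms = reader.getTermVector(0, "contents")
if terms is not None:
    termsEnum = terms.iterator()
    # PyLucene exposes TermsEnum through BytesRefIterator; cast_ lets us loop over it.
    for term in BytesRefIterator.cast_(termsEnum):
        print(term.utf8ToString(), termsEnum.totalTermFreq())
reader.close()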
Example 1: index_docs
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
def index_docs(self, train_set, writer):
    # t1: stored, indexed, not tokenized -- for ID-like fields
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2: stored, indexed, tokenized, with full term vectors
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    for ii in train_set:
        doc = Document()
        doc.add(Field("answer", ii['Answer'], t1))
        doc.add(Field("qid", ii['Question ID'], t1))
        doc.add(Field("category", ii['category'], t1))
        doc.add(Field("position", ii['Sentence Position'], t1))
        doc.add(Field("question", ii['Question Text'], t2))
        doc.add(Field("wiki_plain",
                      self.wiki_reader.get_text(ii['Answer']), t2))
        writer.addDocument(doc)
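Note that setIndexed and FieldInfo.IndexOptions are Lucene 4.x APIs. From Lucene 5 onward, IndexOptions lives in org.apache.lucene.index, setIndexed is removed, and a field counts as indexed whenever its index options are not NONE. A rough equivalent of t2 above under the newer API (a sketch under that assumption, not part of the original example):

from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

t2 = FieldType()
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectors(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
# Indexing is implied by any option other than IndexOptions.NONE.
t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
t2.freeze()  # optional: make the FieldType immutable before reuse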
Example 2: index_docs
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
def index_docs(self, tweets, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)
    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception as e:
            print("Failed in index_docs:", e)
Example 3: index
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
def index(self, root):
    t = FieldType()
    t.setIndexed(True)
    t.setStored(True)
    t.setTokenized(True)
    t.setStoreTermVectors(True)
    for path, dirs, files in os.walk(root):
        for file in files:
            filePath = os.path.join(path, file)
            fd = open(filePath, encoding='iso-8859-1')
            content = fd.read()
            fd.close()
            doc = Document()
            doc.add(Field('name', file, StringField.TYPE_STORED))
            parent = os.path.split(path)[1]
            doc.add(Field('parent', parent, StringField.TYPE_STORED))
            if len(content) > 0:
                doc.add(Field('content', content, t))
            print('Indexing %s' % file)
            self.mWriter.addDocument(doc)
    self.mWriter.commit()
    self.mWriter.close()
Example 4: setUp
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
def setUp(self):
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setStored(False)
    ftype.setIndexed(True)
    ftype.setStoreTermVectors(True)
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
Example 5: setUp
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
def setUp(self):
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()
    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
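Examples 4 and 5 are the same test fixture written against two API generations: the former uses StandardAnalyzer(Version.LUCENE_CURRENT) and setIndexed(True) from Lucene 4.x, while the latter drops the Version argument, takes IndexOptions from org.apache.lucene.index, and calls freeze() to make the FieldType immutable before it is used in a Field.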
Example 6: LuceneDocumentField
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Init possible field types"""
        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")
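A plausible way this handler is consumed (a sketch only; the Lucene wrapper class with its FIELDTYPE_* constants is referenced by get_field above but not shown, and the Document/Field imports are assumed to be in scope):

# Pick a FieldType by its internal constant and attach a field that
# stores term vectors with positions.
fields = LuceneDocumentField()
doc = Document()
doc.add(Field("content", "some tokenized text",
              fields.get_field(Lucene.FIELDTYPE_TEXT_TVP)))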
Example 7: tweetIndexer
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
def tweetIndexer(self, writer):
    # t1: stored, indexed, not tokenized -- for exact-match fields
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2: stored, indexed, tokenized, with full term vectors
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    x = 0
    for i in range(0, 500):
        if not os.path.isfile("json/tweets-" + str(i) + ".json"):
            break
        print("adding tweets-" + str(i) + ".json")
        tweets = open("json/tweets-" + str(i) + ".json", "r")
        for line in tweets.readlines():
            tweet = json.loads(line)
            if 'limit' in tweet:
                continue
            try:
                doc = Document()
                doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
                sname = tweet['user']['screen_name']
                tid = str(tweet['id'])
                text = tweet['text']
                uname = tweet['user']['name']
                created = tweet['created_at']
                tstamp = tweet['timestamp_ms']
                place = ""
                if tweet['place']:
                    place = tweet['place']['full_name'] + ", " + tweet['place']['country']
                lat = ""
                lng = ""
                titles = ""
                urls = ""
                exist = "false"
                # prefer exact coordinates; otherwise fall back to the
                # centre of the place's bounding box
                if tweet['coordinates']:
                    lat = str(tweet['coordinates']['coordinates'][1])
                    lng = str(tweet['coordinates']['coordinates'][0])
                else:
                    lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1]
                               + tweet['place']['bounding_box']['coordinates'][0][2][1]) / 2)
                    lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0]
                               + tweet['place']['bounding_box']['coordinates'][0][2][0]) / 2)
                if len(tweet['entities']['urls']) != 0:
                    exist = "true"
                    for index in range(len(tweet['entities']['urls'])):
                        title = tweet['entities']['urls'][index]['url_title']
                        if title is None:
                            titles += ",-"
                        else:
                            title = title.encode('ascii', 'ignore').decode('ascii')
                            titles += "," + title
                        urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])
                searchable = text + " " + urls + " " + uname + " " + sname + " " + place
                doc.add(Field("lookup", searchable, t2))
                doc.add(Field("text", text, t2))
                doc.add(Field("user_name", uname, t2))
                doc.add(Field("screen_name", sname, t2))
                doc.add(Field("tweet_id", tid, t2))
                doc.add(Field("created_at", created, t2))
                doc.add(Field("geo_lat", lat, t2))
                doc.add(Field("geo_lng", lng, t2))
                doc.add(Field("url_exist", exist, t2))
                doc.add(Field("url_url", urls, t2))
                doc.add(Field("url_title", titles, t2))
                doc.add(Field("timestamp", tstamp, t2))
                writer.addDocument(doc)
                x += 1
            except Exception:
                # skip tweets with missing or unexpected fields
                pass
        tweets.close()
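Once built, the catch-all "lookup" field can be searched like any other tokenized field. A minimal query sketch (an illustration, not part of the original example; `directory` stands for wherever `writer` wrote the index, the query string is made up, and a Lucene 5+ style QueryParser constructor is assumed):

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

reader = DirectoryReader.open(directory)
searcher = IndexSearcher(reader)
query = QueryParser("lookup", StandardAnalyzer()).parse("earthquake")
for hit in searcher.search(query, 10).scoreDocs:
    # each stored t2 field can be read back from the matching document
    print(searcher.doc(hit.doc).get("text"))
reader.close()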
Example 8: RAMDirectory
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
import lucene

from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = RAMDirectory()
    iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
    iwriter = IndexWriter(directory, iconfig)
    ft = FieldType()
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be indexed"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, ft))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()
    ireader = DirectoryReader.open(directory)
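The listing stops right after opening ireader; a plausible continuation that walks the vectors it just stored (a sketch reusing the BytesRefIterator import already present above):

# Iterate every document's term vector for "fieldname" and print each
# term with its in-document frequency.
for docid in range(ireader.maxDoc()):
    terms = ireader.getTermVector(docid, "fieldname")
    if terms is None:
        continue
    termsEnum = terms.iterator()
    for term in BytesRefIterator.cast_(termsEnum):
        print(docid, term.utf8ToString(), termsEnum.totalTermFreq())
ireader.close()
directory.close()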
Example 9: IndexWriter
# Required import: from org.apache.lucene.document import FieldType [as alias]
# Or: from org.apache.lucene.document.FieldType import setStoreTermVectors [as alias]
    writer = IndexWriter(directory, config)
    return writer

def open_searcher(writer):
    from org.apache.lucene.search import IndexSearcher
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return reader, searcher

from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import Term

vectorFieldType = FieldType(TextField.TYPE_NOT_STORED)
vectorFieldType.setIndexed(True)
vectorFieldType.setTokenized(True)
vectorFieldType.setStoreTermVectors(True)
vectorFieldType.setStoreTermVectorPositions(False)

writer = open_writer('data/index')

def addToIndex(lxmlNode):
    uri = xpathFirst(lxmlNode, '//oa:hasTarget/@rdf:resource')
    print(uri)
    seen = set()
    doc = Document()
    for fieldName in FIELD_NAMES:
        seen.clear()
        for subpath in [
                '', '/*/rdfs:label', '/*/skos:prefLabel', '/*/skos:altLabel',
                '/*/dcterms:title', '/*/foaf:name']:
            for value in xpath(lxmlNode, '//%(fieldName)s%(subpath)s/text()' % locals()):