本文整理汇总了Python中org.apache.lucene.document.FieldType.setIndexOptions方法的典型用法代码示例。如果您正苦于以下问题：Python FieldType.setIndexOptions方法的具体用法？Python FieldType.setIndexOptions怎么用？Python FieldType.setIndexOptions使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.document.FieldType的用法示例。
在下文中一共展示了FieldType.setIndexOptions方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def indexDocs(self, root, writer):
    """Index every ``.html`` file found under ``root``.

    Each file becomes one document with a ``name`` and a ``path`` field
    (stored, not tokenized) and, when the file is not empty, a
    ``contents`` field (tokenized, not stored) decoded as ISO-8859-1.
    A failure on one file is reported and the walk continues.

    NOTE: relies on the Python 2 ``unicode`` builtin for decoding.

    :param root: directory walked recursively with ``os.walk``
    :param writer: object whose ``addDocument`` receives each document
    """
    # Field type for the file name and directory: stored, not tokenized.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Field type for the file body: tokenized, not stored.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # A separate loop name keeps the ``root`` argument from being rebound.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            print("adding %s" % filename)
            try:
                path = os.path.join(dirpath, filename)
                # The with-block closes the handle even when read() or
                # the decode raises.
                with open(path) as handle:
                    contents = unicode(handle.read(), 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", dirpath, t1))
                if contents:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Best effort: report the file and keep indexing the rest.
                print("Failed in indexDocs: %s" % (e,))
示例2: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def index_docs(self, tweets, writer):
    """Index the text of each tweet with URL tokens removed.

    Every tweet whose remaining text is non-empty becomes one document
    with a single ``contents`` field. A failure on one tweet is reported
    and the loop continues.

    :param tweets: iterable of items whose element ``[1]`` exposes a
        ``text`` attribute
    :param writer: object whose ``addDocument`` receives each document
    """
    # Stored, tokenized field type with term vectors and their offsets.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            words = [word for word in tweet[1].text.split()
                     if not word.startswith(("http://", "https://"))]
            contents = " ".join(words)
            if not contents:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception as e:
            # Best effort: report the tweet and keep indexing the rest.
            print("Failed in index_docs: %s" % (e,))
示例3: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def index_docs(self, train_set, writer):
    """Add one document per training record to ``writer``.

    The ``answer``, ``qid``, ``category`` and ``position`` fields use a
    stored, untokenized field type. The ``question`` and ``wiki_plain``
    fields use a stored, tokenized field type with term vectors.

    :param train_set: iterable of mappings with the keys ``'Answer'``,
        ``'Question ID'``, ``'category'``, ``'Sentence Position'`` and
        ``'Question Text'``
    :param writer: object whose ``addDocument`` receives each document
    """
    # Field type for the metadata values: stored, not tokenized.
    exact_type = FieldType()
    exact_type.setIndexed(True)
    exact_type.setStored(True)
    exact_type.setTokenized(False)
    exact_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Field type for the free text: stored, tokenized, with term vectors
    # carrying offsets, payloads and positions.
    text_type = FieldType()
    text_type.setIndexed(True)
    text_type.setStored(True)
    text_type.setTokenized(True)
    text_type.setStoreTermVectorOffsets(True)
    text_type.setStoreTermVectorPayloads(True)
    text_type.setStoreTermVectorPositions(True)
    text_type.setStoreTermVectors(True)
    text_type.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # (index field name, record key) pairs, in the order they are added.
    exact_fields = (
        ("answer", 'Answer'),
        ("qid", 'Question ID'),
        ("category", 'category'),
        ("position", 'Sentence Position'),
    )

    for record in train_set:
        doc = Document()
        for field_name, key in exact_fields:
            doc.add(Field(field_name, record[key], exact_type))
        doc.add(Field("question", record['Question Text'], text_type))
        wiki_text = self.wiki_reader.get_text(record['Answer'])
        doc.add(Field("wiki_plain", wiki_text, text_type))
        writer.addDocument(doc)
示例4: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def indexDocs(self, url, writer):
    """Parse the feed at ``url`` and index each of its entries.

    Each entry becomes one document with a ``url`` and a ``title`` field
    (stored, not tokenized) and, when the description is non-empty after
    tag removal, a ``contents`` field (stored, tokenized). A failure on
    one entry is reported and the loop continues.

    :param url: feed location handed to ``feedparser.parse``
    :param writer: object whose ``addDocument`` receives each document
    """
    # Field type for link and title: stored, not tokenized.
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Field type for the description text: stored, tokenized.
    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(True)
    type2.setTokenized(True)
    type2.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    # Read Feeds
    feeds = feedparser.parse(url)
    for item in feeds["entries"]:
        print("adding %s" % item["title"])
        try:
            link = item["link"]
            contents = item["description"].encode("utf-8")
            # Remove anything that looks like a markup tag.
            contents = re.sub(r'<[^<]+?>', '', contents)
            title = item["title"]
            doc = Document()
            doc.add(Field("url", link, type1))
            doc.add(Field("title", title, type1))
            if contents:
                doc.add(Field("contents", contents, type2))
            else:
                print("warning: no content in %s" % item["title"])
            writer.addDocument(doc)
        except Exception as e:
            # Best effort: report the entry and keep indexing the rest.
            print("Failed in indexDocs: %s" % (e,))
示例5: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def indexDocs(self, root, writer):
    """Index every file found under ``root``.

    Each file becomes one document with a ``name`` and a ``path`` field
    (stored, not tokenized) and, when the file is not empty, a
    ``contents`` field (tokenized, not stored) decoded as ISO-8859-1.
    A failure on one file is reported and the walk continues.

    NOTE: relies on the Python 2 ``unicode`` builtin for decoding.

    :param root: directory walked recursively with ``os.walk``
    :param writer: object whose ``addDocument`` receives each document
    """
    t1 = FieldType()  # for short items, e.g. file name.
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)  # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS

    t2 = FieldType()  # for content
    t2.setIndexed(True)
    t2.setStored(False)  # don't store the original text
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # A separate loop name keeps the ``root`` argument from being rebound.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding %s" % filename)
            try:
                path = os.path.join(dirpath, filename)
                # The with-block closes the handle even when read() or
                # the decode raises.
                with open(path) as handle:
                    contents = unicode(handle.read(), 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", dirpath, t1))
                if contents:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Best effort: report the file and keep indexing the rest.
                print("Failed in indexDocs: %s" % (e,))
示例6: _createNoTermsFrequencyFieldType
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def _createNoTermsFrequencyFieldType():
    """Return a frozen FieldType configured with DOCS_ONLY index options.

    The type is set to indexed, tokenized and norm-omitting before it is
    frozen.
    """
    field_type = FieldType()
    # All three switches are turned on, in this order.
    for enable in (field_type.setIndexed,
                   field_type.setTokenized,
                   field_type.setOmitNorms):
        enable(True)
    field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    field_type.freeze()
    return field_type
示例7: Indexer
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
class Indexer(object):
    """Thin wrapper around an ``IndexWriter`` opened in CREATE mode."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
            (defaults to ``"index"``); created when it does not exist
        :param analyzer: The analyzer to work with (defaults to a
            ``StandardAnalyzer``); it is wrapped in a
            ``LimitTokenCountAnalyzer`` with a limit of 1048576
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            # makedirs also creates missing parent directories, so a
            # nested index path works.
            os.makedirs(self.output)

        # Build the default analyzer only when the caller gave none.
        analyzer = kwargs.get("analyzer")
        if analyzer is None:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        Failures are logged and not re-raised.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.

        Closes the writer; failures are logged and not re-raised.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.

        ``field_clean`` is stored and not tokenized; ``field_dirty`` is
        tokenized and not stored.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例8: index_article
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def index_article(writer, art_id, art_body):
    """Add one article to the index as a two-field document.

    :param writer: object whose ``addDocument`` receives the document
    :param art_id: article identifier; converted with ``str`` and placed
        in the stored, untokenized ``art_id`` field
    :param art_body: article text placed in the stored, tokenized
        ``art_body`` field
    """
    # Identifier type: stored, not tokenized.
    id_type = FieldType()
    id_type.setIndexed(True)
    id_type.setStored(True)
    id_type.setTokenized(False)
    id_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Body type: stored, tokenized.
    body_type = FieldType()
    body_type.setIndexed(True)
    body_type.setStored(True)
    body_type.setTokenized(True)
    body_type.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    article = Document()
    id_field = Field("art_id", str(art_id), id_type)
    article.add(id_field)
    body_field = Field("art_body", art_body, body_type)
    article.add(body_field)
    writer.addDocument(article)
示例9: setUp
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def setUp(self):
    """Write a single-document index for the test.

    The document has an ``all`` field using a frozen, tokenized field
    type with term vectors, and an ``id`` field of type
    ``StringField.TYPE_NOT_STORED``. The writer is closed afterwards.
    """
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()
    writer = self.getWriter(analyzer=self.analyzer)

    # Field type for the text field; frozen once configured.
    vector_type = FieldType()
    vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    vector_type.setTokenized(True)
    vector_type.setStoreTermVectors(True)
    vector_type.freeze()

    document = Document()
    document.add(Field("all", "blah blah blah Gesundheit", vector_type))
    document.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    writer.addDocument(document)
    writer.close()
示例10: lazyImport
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def lazyImport():
    """Run the Lucene imports on first call and publish them as globals.

    Later calls return immediately because the module-level ``imported``
    flag is already set.
    """
    global imported
    if imported:
        # An earlier call already did the work.
        return
    from meresco.pylucene import getJVM
    # Called before the java.* / org.apache.* imports below.
    # NOTE(review): presumably those imports need the JVM this call
    # provides - confirm against meresco.pylucene.
    getJVM()
    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    # Field type set to IndexOptions.NONE, stored, not tokenized.
    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)
    imported = True
    # Every local name bound above (the imports and UNINDEXED_TYPE) is
    # copied into the module globals, so renaming or adding a local in
    # this function changes what the module exposes.
    globals().update(locals())
示例11: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def indexDocs(self, root, writer):
    """Index every ``.txt`` file found under ``root``.

    Each file becomes one document with a ``name`` and a ``path`` field
    (stored, not tokenized) and, when the file is not empty, a
    ``contents`` field (stored, tokenized) holding the text as read.
    A failure on one file is reported and the walk continues.

    :param root: directory walked recursively with ``os.walk``
    :param writer: object whose ``addDocument`` receives each document
    """
    # Field type for the file name and directory: stored, not tokenized.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)  # value is not passed through the Analyzer
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Field type for the file body: stored, tokenized.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)  # value is analyzed by the Analyzer
    t2.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    # A separate loop name keeps the ``root`` argument from being rebound.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print('adding %s' % filename)
            try:
                path = os.path.join(dirpath, filename)
                # The with-block closes the handle even when read() raises.
                with open(path) as handle:
                    contents = handle.read()
                doc = Document()
                doc.add(Field('name', filename, t1))
                doc.add(Field('path', dirpath, t1))
                if contents:
                    doc.add(Field('contents', contents, t2))
                    print('length of content is %d' % len(contents))
                else:
                    print('warning: no content in %s' % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Best effort: report the file and keep indexing the rest.
                print('Failed in indexDocs: %s' % (e,))
示例12: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def indexDocs(self, root, writer):
    """Index the name and directory of every ``.c`` file under ``root``.

    Each file becomes one document with a ``name`` and a ``path`` field.
    The file is read and decoded as UTF-8, but its text is not added to
    the document. A failure on one file is reported and the walk
    continues.

    NOTE: relies on the Python 2 ``unicode`` builtin for decoding.

    :param root: directory walked recursively with ``os.walk``
    :param writer: object whose ``addDocument`` receives each document
    """
    # Field type for name and path: stored, tokenized.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Field type for file contents; only referenced by the disabled code
    # further down.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # traverse through the doc directory; a separate loop name keeps the
    # ``root`` argument from being rebound
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # only if this file ends with '.c'
            if not filename.endswith('.c'):
                continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(dirpath, filename)
                # Two spaces after the colon keep the existing output format.
                print("adding file :  %s" % path)
                # The with-block closes the handle even when read() or
                # the decode raises.
                # NOTE(review): contents is not indexed, yet a file that
                # cannot be read or decoded is skipped because of this
                # step - confirm that is intended.
                with open(path) as handle:
                    contents = unicode(handle.read(), 'utf-8')
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", dirpath, t1))
                # Content indexing is currently disabled:
                # if len(contents) > 0:
                #     doc.add(Field("contents", contents, t2))
                # else:
                #     print("warning: no content in  %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Best effort: report the file and keep indexing the rest.
                print("failed in indexDocs: %s" % (e,))
示例13: index_wiki
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def index_wiki(wiki_xmlfile, index_directory_name):
    """Build a fresh index from the documents of a wiki corpus file.

    Each item yielded by ``wikicorpusxml(wiki_xmlfile)`` becomes one
    document with ``contents``, ``title`` and ``url`` fields. The writer
    is opened in CREATE mode and is committed and closed on success.
    If indexing fails, the writer is rolled back and the error is
    re-raised.

    :param wiki_xmlfile: corpus file handed to ``wikicorpusxml``
    :param index_directory_name: directory in which the index is created
    """
    lucene.initVM()

    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)

    # Create the writer configuration.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    def _stored_field_type(tokenized):
        # Indexed + stored type with positions; only tokenization varies.
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return field_type

    content_fieldtype = _stored_field_type(True)
    title_fieldtype = _stored_field_type(True)
    url_fieldtype = _stored_field_type(False)

    try:
        for xmldoc in wikicorpusxml(wiki_xmlfile):
            # Body is the text between the first '>' and the next '<';
            # title and url are read from the attributes of the same item.
            content = xmldoc.partition('>')[2].partition('<')[0].strip()
            title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
            url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
            doc = Document()
            doc.add(Field("contents", content, content_fieldtype))
            doc.add(Field("title", title, title_fieldtype))
            doc.add(Field("url", url, url_fieldtype))
            writer.addDocument(doc)
        writer.commit()
    except Exception:
        # Without this the writer would stay open after a failure.
        # NOTE(review): rollback() is expected to close the writer and
        # discard uncommitted changes - confirm for the Lucene version in use.
        writer.rollback()
        raise
    else:
        writer.close()
示例14: tweetIndexer
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def tweetIndexer(self, writer):
    """Index tweets stored one JSON object per line in ``json/tweets-<i>.json``.

    Files are processed for ``i`` = 0..499, stopping at the first index
    whose file does not exist. Lines containing a ``'limit'`` key are
    skipped. A tweet that cannot be turned into a document is reported
    and skipped. A line that is not valid JSON still raises.

    :param writer: object whose ``addDocument`` receives each document
    """
    # Field type for the source file name: stored, not tokenized.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # Field type for every tweet-derived field: stored, tokenized, with
    # term vectors carrying offsets, payloads and positions.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for i in range(0, 500):
        filename = "json/tweets-" + str(i) + ".json"
        if not os.path.isfile(filename):
            break
        print("adding tweets-" + str(i) + ".json")
        # The with-block closes the file even when a line fails to parse;
        # iterating the handle avoids loading the whole file at once.
        with open(filename, "r") as tweets:
            for line in tweets:
                tweet = json.loads(line)
                if 'limit' in tweet:
                    continue
                try:
                    doc = Document()
                    doc.add(Field("file", filename, t1))
                    sname = tweet['user']['screen_name']
                    tid = str(tweet['id'])
                    text = tweet['text']
                    uname = tweet['user']['name']
                    created = tweet['created_at']
                    tstamp = tweet['timestamp_ms']

                    place = ""
                    if tweet['place']:
                        place = (tweet['place']['full_name'] + ", "
                                 + tweet['place']['country'])

                    lat = ""
                    lng = ""
                    if tweet['coordinates']:
                        # The pair is read as [lng, lat].
                        lat = str(tweet['coordinates']['coordinates'][1])
                        lng = str(tweet['coordinates']['coordinates'][0])
                    else:
                        # Fall back to the midpoint between corners 0 and 2
                        # of the place bounding box.
                        # NOTE(review): a tweet with neither coordinates
                        # nor place fails here and is skipped - confirm
                        # that is intended.
                        corners = tweet['place']['bounding_box']['coordinates'][0]
                        lat = str((corners[0][1] + corners[2][1]) / 2)
                        lng = str((corners[0][0] + corners[2][0]) / 2)

                    titles = ""
                    urls = ""
                    exist = "false"
                    url_entries = tweet['entities']['urls']
                    if url_entries:
                        exist = "true"
                        for entry in url_entries:
                            title = entry['url_title']
                            if title is None:
                                titles += ",-"
                            else:
                                title = title.encode('ascii', 'ignore')
                                titles += "," + str(title)
                            urls += " " + str(entry['expanded_url'])

                    searchable = (text + " " + urls + " " + uname + " "
                                  + sname + " " + place)
                    doc.add(Field("lookup", searchable, t2))
                    doc.add(Field("text", text, t2))
                    doc.add(Field("user_name", uname, t2))
                    doc.add(Field("screen_name", sname, t2))
                    doc.add(Field("tweet_id", tid, t2))
                    doc.add(Field("created_at", created, t2))
                    doc.add(Field("geo_lat", lat, t2))
                    doc.add(Field("geo_lng", lng, t2))
                    doc.add(Field("url_exist", exist, t2))
                    doc.add(Field("url_url", urls, t2))
                    doc.add(Field("url_title", titles, t2))
                    doc.add(Field("timestamp", tstamp, t2))
                    writer.addDocument(doc)
                except Exception as e:
                    # Still best effort, but the skipped tweet is reported
                    # instead of being dropped silently.
                    print("Failed in tweetIndexer: %s" % (e,))
示例15: __init__
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 别名]
def __init__(self):
    """Set up the index writer and the per-field types from ``settings``.

    Reads directories, analyzers, open mode and the optional similarity
    from ``settings.ADMINS_ENGINE``, opens an ``IndexWriter`` on the
    index directory, and stores one field type per field name in
    ``self.mFieldTypes``.
    """
    self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
    self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
    self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()

    # ---------------------- Writer configuration ----------------------
    # Per-field analyzers; 'content' uses the default analyzer.
    analyzerMap = HashMap()
    analyzerMap.put('name', self.mAnalyzers['name'])
    analyzerMap.put('parent', self.mAnalyzers['parent'])
    analyzerMap.put('content', self.mAnalyzers['default'])
    analyzerMap.put('id', self.mAnalyzers['id'])
    analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'],
                                              analyzerMap)

    self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                           analyzerWrapper)
    self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)
    if settings.ADMINS_ENGINE.mSimilarity is not None:
        self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)

    directory = SimpleFSDirectory(File(self.mIndexDirectory))
    self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)

    # ---------------------- FieldType preparation ---------------------
    def _storedFieldType(tokenized, indexOptions):
        # Indexed + stored type; tokenization and index options vary.
        fieldType = FieldType()
        fieldType.setIndexed(True)
        fieldType.setStored(True)
        fieldType.setTokenized(tokenized)
        fieldType.setIndexOptions(indexOptions)
        return fieldType

    docsOnly = FieldInfo.IndexOptions.DOCS_ONLY
    withOffsets = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
    self.mFieldTypes = {
        'name': _storedFieldType(True, docsOnly),
        'parent': _storedFieldType(True, docsOnly),
        'content': _storedFieldType(True, withOffsets),
        'id': _storedFieldType(False, docsOnly),
    }

    self.mLog = ""