本文整理匯總了Python中org.apache.lucene.document.FieldType.setIndexOptions方法的典型用法代碼示例。如果您正苦於以下問題:Python FieldType.setIndexOptions方法的具體用法?Python FieldType.setIndexOptions怎麽用?Python FieldType.setIndexOptions使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.lucene.document.FieldType
的用法示例。
在下文中一共展示了FieldType.setIndexOptions方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def indexDocs(self, root, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.html'):
continue
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
if len(contents) > 0:
doc.add(Field("contents", contents, t2))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例2: index_docs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def index_docs(self, tweets, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(True)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
t1.setStoreTermVectors(True)
t1.setStoreTermVectorOffsets(True)
# add each tweet to the index
for tweet in tweets:
try:
# strip out URLs because they provide false index matches
contents = []
for word in tweet[1].text.split():
if word.startswith("http://") or word.startswith("https://"):
continue
contents.append(word)
contents = " ".join(contents)
if len(contents) == 0: continue
doc = Document()
doc.add(Field("contents", contents, t1))
writer.addDocument(doc)
except Exception, e:
print "Failed in index_docs:", e
示例3: index_docs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def index_docs(self, train_set, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for ii in train_set:
doc = Document()
doc.add(Field("answer", ii['Answer'], t1))
doc.add(Field("qid", ii['Question ID'], t1))
doc.add(Field("category", ii['category'], t1))
doc.add(Field("position", ii['Sentence Position'], t1))
doc.add(Field("question", ii['Question Text'], t2))
doc.add(Field("wiki_plain",
self.wiki_reader.get_text(ii['Answer']), t2))
writer.addDocument(doc)
示例4: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def indexDocs(self, url, writer):
type1 = FieldType()
type1.setIndexed(True)
type1.setStored(True)
type1.setTokenized(False)
type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
type2 = FieldType()
type2.setIndexed(True)
type2.setStored(True)
type2.setTokenized(True)
type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
# Read Feeds
feeds = feedparser.parse(url)
for item in feeds["entries"]:
print "adding", item["title"]
try:
link = item["link"]
contents = item["description"].encode("utf-8")
contents = re.sub('<[^<]+?>', '', ''.join(contents))
title = item["title"]
doc = Document()
doc.add(Field("url", link, type1))
doc.add(Field("title", title, type1))
if len(contents) > 0:
doc.add(Field("contents", contents, type2))
else:
print "warning: no content in %s" % item["title"]
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例5: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def indexDocs(self, root, writer):
t1 = FieldType() # for short items, e.g. file name.
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS
t2 = FieldType() # for content
t2.setIndexed(True)
t2.setStored(False) # don't store the original text
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
if len(contents) > 0:
doc.add(Field("contents", contents, t2))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例6: _createNoTermsFrequencyFieldType
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def _createNoTermsFrequencyFieldType():
f = FieldType()
f.setIndexed(True)
f.setTokenized(True)
f.setOmitNorms(True)
f.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
f.freeze()
return f
示例7: Indexer
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
class Indexer(object):
def __init__(self, **kwargs):
""" Initialize a new instance of the Indexer
:param output: The output directory of the underlying index
:param anaylzer: The overloaded analyzer to work with
"""
self.output = kwargs.get("root", "index")
if not os.path.exists(self.output):
os.mkdir(self.output)
self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
self.store = SimpleFSDirectory(File(self.output))
self.writer = IndexWriter(self.store, self.config)
self.create_field_types()
def index(self, document):
""" Given a new document, add it to the index.
:param document: The document to add to the indexer
"""
try:
self.writer.addDocument(document)
except Exception:
logger.exception("Failed to index the supplied document")
def shutdown(self):
""" Shutdown the currently processing indexer.
"""
try:
# self.writer.optimize()
self.writer.close()
except Exception:
logger.exception("Failed to shutdown the indexer correctly")
def create_field_types(self):
""" Create the field types that will be used to specify
what actions lucene should take on the various fields
supplied to index.
"""
self.field_clean = FieldType()
self.field_clean.setIndexed(True)
self.field_clean.setStored(True)
self.field_clean.setTokenized(False)
self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
self.field_dirty = FieldType()
self.field_dirty.setIndexed(True)
self.field_dirty.setStored(False)
self.field_dirty.setTokenized(True)
self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例8: index_article
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def index_article(writer, art_id, art_body):
art_id_field = FieldType()
art_id_field.setIndexed(True)
art_id_field.setStored(True)
art_id_field.setTokenized(False)
art_id_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
art_body_field = FieldType()
art_body_field.setIndexed(True)
art_body_field.setStored(True)
art_body_field.setTokenized(True)
art_body_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
doc = Document()
doc.add(Field("art_id", str(art_id), art_id_field))
doc.add(Field("art_body", art_body, art_body_field))
writer.addDocument(doc)
示例9: setUp
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def setUp(self):
super(Test_Bug1842, self).setUp()
self.analyzer = StandardAnalyzer()
w1 = self.getWriter(analyzer=self.analyzer)
doc1 = Document()
ftype = FieldType()
ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
ftype.setTokenized(True)
ftype.setStoreTermVectors(True)
ftype.freeze()
doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
w1.addDocument(doc1)
w1.close()
示例10: lazyImport
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def lazyImport():
global imported
if imported:
return
from meresco.pylucene import getJVM
getJVM()
from java.nio.file import Paths
from org.apache.lucene.document import Document, StringField, Field, FieldType
from org.apache.lucene.search import IndexSearcher, TermQuery
from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
UNINDEXED_TYPE = FieldType()
UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
UNINDEXED_TYPE.setStored(True)
UNINDEXED_TYPE.setTokenized(False)
imported = True
globals().update(locals())
示例11: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def indexDocs(self, root, writer):
#Create a new FieldType with default properties.
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer.
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
#Create a new FieldType with default properties.
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer.
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.txt'):
continue
print 'adding', filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = file.read()
file.close()
doc = Document()
doc.add(Field('name', filename, t1))
doc.add(Field('path', root, t1))
if len(contents) > 0:
doc.add(Field('contents', contents, t2))
print 'length of content is %d'%(len(contents))
else:
print 'warning: no content in %s' % filename
writer.addDocument(doc)
except Exception, e:
print 'Failed in indexDocs:', e
示例12: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def indexDocs(self,root,writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(True)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames,filenames in os.walk(root):
# traverse through the doc directory
for filename in filenames:
# only if this file ends with '.c'
if not filename.endswith('.c'):
continue
try:
# only add the filename and path for indexing
path = os.path.join(root,filename)
print "adding file : ",path
file = open(path)
contents = unicode(file.read(),'utf-8')
file.close()
doc = Document()
doc.add(Field("name",filename,t1))
doc.add(Field("path",root,t1))
# if len(contents) > 0:
# doc.add(Field("contents",contents,t2))
# else:
# print "warning: no content in ",filename
writer.addDocument(doc)
except Exception,e:
print "failed in indexDocs:",e
示例13: index_wiki
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def index_wiki(wiki_xmlfile, index_directory_name):
lucene.initVM()
# Initialize index directory and analyzer.
version = Version.LUCENE_CURRENT
store = FSDirectory.open(File(index_directory_name))
analyzer = StandardAnalyzer(version)
# Creates config file.
config = IndexWriterConfig(version, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
# Set document content field type.
content_fieldtype = FieldType()
content_fieldtype.setIndexed(True)
content_fieldtype.setStored(True)
content_fieldtype.setTokenized(True)
content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
# Set document title field type.
title_fieldtype = FieldType()
title_fieldtype.setIndexed(True)
title_fieldtype.setStored(True)
title_fieldtype.setTokenized(True)
title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
# Set document url field type.
url_fieldtype = FieldType()
url_fieldtype.setIndexed(True)
url_fieldtype.setStored(True)
url_fieldtype.setTokenized(False)
url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for xmldoc in wikicorpusxml((wiki_xmlfile)):
content = xmldoc.partition('>')[2].partition('<')[0].strip()
title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
doc = Document()
doc.add(Field("contents", content, content_fieldtype))
doc.add(Field("title", title, title_fieldtype))
doc.add(Field("url", url, url_fieldtype))
writer.addDocument(doc)
writer.commit()
writer.close()
示例14: tweetIndexer
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def tweetIndexer(self, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
x = 0
for i in range(0,500):
if not os.path.isfile("json/tweets-" + str(i) + ".json"):
break
print "adding tweets-" + str(i) + ".json"
tweets = open("json/tweets-" + str(i) + ".json", "r")
for line in tweets.readlines():
tweet = json.loads(line)
if 'limit' in tweet:
continue
try:
doc = Document()
doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1))
sname = tweet['user']['screen_name']
tid = str(tweet['id'])
text = tweet['text']
uname = tweet['user']['name']
created = tweet['created_at']
tstamp = tweet['timestamp_ms']
place = ""
if tweet['place']:
place = tweet['place']['full_name'] + ", " + tweet['place']['country']
lat = ""
lng = ""
titles = ""
urls = ""
exist = "false"
if tweet['coordinates']:
lat = str(tweet['coordinates']['coordinates'][1])
lng = str(tweet['coordinates']['coordinates'][0])
else:
lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2)
lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2)
if len(tweet['entities']['urls']) != 0:
exist = "true"
for index in range(len(tweet['entities']['urls'])):
title = tweet['entities']['urls'][index]['url_title']
if title == None:
titles += ",-"
else:
title = title.encode('ascii','ignore')
titles += "," + str(title)
urls += " " + str(tweet['entities']['urls'][index]['expanded_url'])
searchable = text + " " + urls + " " + uname + " " + sname + " " + place
doc.add(Field("lookup", searchable, t2))
doc.add(Field("text", text, t2))
doc.add(Field("user_name", uname, t2))
doc.add(Field("screen_name", sname, t2))
doc.add(Field("tweet_id", tid, t2))
doc.add(Field("created_at", created, t2))
doc.add(Field("geo_lat", lat, t2))
doc.add(Field("geo_lng", lng, t2))
doc.add(Field("url_exist", exist, t2))
doc.add(Field("url_url", urls, t2))
doc.add(Field("url_title", titles, t2))
doc.add(Field("timestamp", tstamp, t2))
writer.addDocument(doc)
x += 1
except Exception, e:
pass
tweets.close()
示例15: __init__
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setIndexOptions [as 別名]
def __init__(self):
self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory
self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory
self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers()
############################# Writer Configurattion #####################################
map = HashMap()
map.put('name', self.mAnalyzers['name'])
map.put('parent', self.mAnalyzers['parent'])
map.put('content', self.mAnalyzers['default'])
map.put('id', self.mAnalyzers['id'])
analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map)
self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper)
self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode)
if settings.ADMINS_ENGINE.mSimilarity != None:
self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity)
########################################################################################
directory = SimpleFSDirectory(File(self.mIndexDirectory))
self.mIndexWriter = IndexWriter(directory, self.mWriterConfig)
############################# FieldType Prepration #####################
nameField = FieldType()
nameField.setIndexed(True)
nameField.setStored(True)
nameField.setTokenized(True)
nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
parentField = FieldType()
parentField.setIndexed(True)
parentField.setStored(True)
parentField.setTokenized(True)
parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
contentField = FieldType()
contentField.setIndexed(True)
contentField.setStored(True)
contentField.setTokenized(True)
contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
idField = FieldType()
idField.setIndexed(True)
idField.setStored(True)
idField.setTokenized(False)
idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
self.mFieldTypes = {
'name' : nameField,
'parent' : parentField,
'content' : contentField,
'id' : idField
}
#######################################################################
self.mLog = ""