本文整理汇总了Python中org.apache.lucene.document.FieldType.setIndexed方法的典型用法代码示例。如果您正苦于以下问题:Python FieldType.setIndexed方法的具体用法?Python FieldType.setIndexed怎么用?Python FieldType.setIndexed使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.document.FieldType
的用法示例。
在下文中一共展示了FieldType.setIndexed方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index_docs(self, tweets, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(True)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
t1.setStoreTermVectors(True)
t1.setStoreTermVectorOffsets(True)
# add each tweet to the index
for tweet in tweets:
try:
# strip out URLs because they provide false index matches
contents = []
for word in tweet[1].text.split():
if word.startswith("http://") or word.startswith("https://"):
continue
contents.append(word)
contents = " ".join(contents)
if len(contents) == 0: continue
doc = Document()
doc.add(Field("contents", contents, t1))
writer.addDocument(doc)
except Exception, e:
print "Failed in index_docs:", e
示例2: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def indexDocs(self, root, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.html'):
continue
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
if len(contents) > 0:
doc.add(Field("contents", contents, t2))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例3: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index_docs(self, train_set, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for ii in train_set:
doc = Document()
doc.add(Field("answer", ii['Answer'], t1))
doc.add(Field("qid", ii['Question ID'], t1))
doc.add(Field("category", ii['category'], t1))
doc.add(Field("position", ii['Sentence Position'], t1))
doc.add(Field("question", ii['Question Text'], t2))
doc.add(Field("wiki_plain",
self.wiki_reader.get_text(ii['Answer']), t2))
writer.addDocument(doc)
示例4: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def indexDocs(self, root, writer):
t1 = FieldType() # for short items, e.g. file name.
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS
t2 = FieldType() # for content
t2.setIndexed(True)
t2.setStored(False) # don't store the original text
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
if len(contents) > 0:
doc.add(Field("contents", contents, t2))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例5: index
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index(self, root):
t = FieldType()
t.setIndexed(True)
t.setStored(True)
t.setTokenized(True)
t.setStoreTermVectors(True)
for path, dirs, files in os.walk(root):
for file in files:
filePath = os.path.join(path, file)
fd = open(filePath)
content = unicode(fd.read(), 'iso-8859-1')
fd.close()
doc = Document()
doc.add(Field('name', file, StringField.TYPE_STORED))
parent = os.path.split(path)[1]
doc.add(Field('parent', parent, StringField.TYPE_STORED))
if len(content) > 0:
doc.add(Field('content', content, t))
print 'Indexing %s' % file
self.mWriter.addDocument(doc)
self.mWriter.commit()
self.mWriter.close()
示例6: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def indexDocs(self, url, writer):
type1 = FieldType()
type1.setIndexed(True)
type1.setStored(True)
type1.setTokenized(False)
type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
type2 = FieldType()
type2.setIndexed(True)
type2.setStored(True)
type2.setTokenized(True)
type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
# Read Feeds
feeds = feedparser.parse(url)
for item in feeds["entries"]:
print "adding", item["title"]
try:
link = item["link"]
contents = item["description"].encode("utf-8")
contents = re.sub('<[^<]+?>', '', ''.join(contents))
title = item["title"]
doc = Document()
doc.add(Field("url", link, type1))
doc.add(Field("title", title, type1))
if len(contents) > 0:
doc.add(Field("contents", contents, type2))
else:
print "warning: no content in %s" % item["title"]
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例7: _createNoTermsFrequencyFieldType
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def _createNoTermsFrequencyFieldType():
f = FieldType()
f.setIndexed(True)
f.setTokenized(True)
f.setOmitNorms(True)
f.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
f.freeze()
return f
示例8: LuceneDocumentField
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
class LuceneDocumentField(object):
"""Internal handler class for possible field types"""
def __init__(self):
"""Init possible field types"""
# FIELD_ID: stored, indexed, non-tokenized
self.field_id = FieldType()
self.field_id.setIndexed(True)
self.field_id.setStored(True)
self.field_id.setTokenized(False)
# FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
# for storing IDs with term vector info
self.field_id_tv = FieldType()
self.field_id_tv.setIndexed(True)
self.field_id_tv.setStored(True)
self.field_id_tv.setTokenized(False)
self.field_id_tv.setStoreTermVectors(True)
# FIELD_TEXT: stored, indexed, tokenized, with positions
self.field_text = FieldType()
self.field_text.setIndexed(True)
self.field_text.setStored(True)
self.field_text.setTokenized(True)
# FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
self.field_text_tv = FieldType()
self.field_text_tv.setIndexed(True)
self.field_text_tv.setStored(True)
self.field_text_tv.setTokenized(True)
self.field_text_tv.setStoreTermVectors(True)
# FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
# (but no character offsets)
self.field_text_tvp = FieldType()
self.field_text_tvp.setIndexed(True)
self.field_text_tvp.setStored(True)
self.field_text_tvp.setTokenized(True)
self.field_text_tvp.setStoreTermVectors(True)
self.field_text_tvp.setStoreTermVectorPositions(True)
def get_field(self, type):
"""Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
if type == Lucene.FIELDTYPE_ID:
return self.field_id
elif type == Lucene.FIELDTYPE_ID_TV:
return self.field_id_tv
elif type == Lucene.FIELDTYPE_TEXT:
return self.field_text
elif type == Lucene.FIELDTYPE_TEXT_TV:
return self.field_text_tv
elif type == Lucene.FIELDTYPE_TEXT_TVP:
return self.field_text_tvp
else:
raise Exception("Unknown field type")
示例9: Indexer
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
class Indexer(object):
def __init__(self, **kwargs):
""" Initialize a new instance of the Indexer
:param output: The output directory of the underlying index
:param anaylzer: The overloaded analyzer to work with
"""
self.output = kwargs.get("root", "index")
if not os.path.exists(self.output):
os.mkdir(self.output)
self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
self.store = SimpleFSDirectory(File(self.output))
self.writer = IndexWriter(self.store, self.config)
self.create_field_types()
def index(self, document):
""" Given a new document, add it to the index.
:param document: The document to add to the indexer
"""
try:
self.writer.addDocument(document)
except Exception:
logger.exception("Failed to index the supplied document")
def shutdown(self):
""" Shutdown the currently processing indexer.
"""
try:
# self.writer.optimize()
self.writer.close()
except Exception:
logger.exception("Failed to shutdown the indexer correctly")
def create_field_types(self):
""" Create the field types that will be used to specify
what actions lucene should take on the various fields
supplied to index.
"""
self.field_clean = FieldType()
self.field_clean.setIndexed(True)
self.field_clean.setStored(True)
self.field_clean.setTokenized(False)
self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
self.field_dirty = FieldType()
self.field_dirty.setIndexed(True)
self.field_dirty.setStored(False)
self.field_dirty.setTokenized(True)
self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例10: setUp
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def setUp(self):
super(Test_Bug1842, self).setUp()
self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
w1 = self.getWriter(analyzer=self.analyzer)
doc1 = Document()
ftype = FieldType()
ftype.setStored(False)
ftype.setIndexed(True)
ftype.setStoreTermVectors(True)
doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
w1.addDocument(doc1)
w1.close()
示例11: index_article
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index_article(writer, art_id, art_body):
art_id_field = FieldType()
art_id_field.setIndexed(True)
art_id_field.setStored(True)
art_id_field.setTokenized(False)
art_id_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
art_body_field = FieldType()
art_body_field.setIndexed(True)
art_body_field.setStored(True)
art_body_field.setTokenized(True)
art_body_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
doc = Document()
doc.add(Field("art_id", str(art_id), art_id_field))
doc.add(Field("art_body", art_body, art_body_field))
writer.addDocument(doc)
示例12: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def indexDocs(self, root, writer):
#Create a new FieldType with default properties.
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer.
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
#Create a new FieldType with default properties.
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer.
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.txt'):
continue
print 'adding', filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = file.read()
file.close()
doc = Document()
doc.add(Field('name', filename, t1))
doc.add(Field('path', root, t1))
if len(contents) > 0:
doc.add(Field('contents', contents, t2))
print 'length of content is %d'%(len(contents))
else:
print 'warning: no content in %s' % filename
writer.addDocument(doc)
except Exception, e:
print 'Failed in indexDocs:', e
示例13: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def indexDocs(self,root,writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(True)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames,filenames in os.walk(root):
# traverse through the doc directory
for filename in filenames:
# only if this file ends with '.c'
if not filename.endswith('.c'):
continue
try:
# only add the filename and path for indexing
path = os.path.join(root,filename)
print "adding file : ",path
file = open(path)
contents = unicode(file.read(),'utf-8')
file.close()
doc = Document()
doc.add(Field("name",filename,t1))
doc.add(Field("path",root,t1))
# if len(contents) > 0:
# doc.add(Field("contents",contents,t2))
# else:
# print "warning: no content in ",filename
writer.addDocument(doc)
except Exception,e:
print "failed in indexDocs:",e
示例14: index_wiki
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index_wiki(wiki_xmlfile, index_directory_name):
lucene.initVM()
# Initialize index directory and analyzer.
version = Version.LUCENE_CURRENT
store = FSDirectory.open(File(index_directory_name))
analyzer = StandardAnalyzer(version)
# Creates config file.
config = IndexWriterConfig(version, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
# Set document content field type.
content_fieldtype = FieldType()
content_fieldtype.setIndexed(True)
content_fieldtype.setStored(True)
content_fieldtype.setTokenized(True)
content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
# Set document title field type.
title_fieldtype = FieldType()
title_fieldtype.setIndexed(True)
title_fieldtype.setStored(True)
title_fieldtype.setTokenized(True)
title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
# Set document url field type.
url_fieldtype = FieldType()
url_fieldtype.setIndexed(True)
url_fieldtype.setStored(True)
url_fieldtype.setTokenized(False)
url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for xmldoc in wikicorpusxml((wiki_xmlfile)):
content = xmldoc.partition('>')[2].partition('<')[0].strip()
title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
doc = Document()
doc.add(Field("contents", content, content_fieldtype))
doc.add(Field("title", title, title_fieldtype))
doc.add(Field("url", url, url_fieldtype))
writer.addDocument(doc)
writer.commit()
writer.close()
示例15: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def indexDocs(self, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for folder in os.listdir(rawDir):
for fileName in os.listdir(rawDir+'/'+folder):
if not fileName.endswith('.json'):
continue
print fileName
subject_id = fileName.split('_')[0]
# print rawDict
print 'id:'+subject_id
print 'folder:'+str(folder)
rawPath = rawDir+'/' +folder +'/'+subject_id+'_res.json'
adjPath = adjDir+'/' +folder +'/'+subject_id+'_adj.json'
#tfidfPath = tf_idfDir + '/' + subject_id+'_tfidf.json'
print adjPath
rawFile = open(rawPath,'r')
raw = rawFile.read()
if raw =='':
with open(baseDir+'/'+'err_no_raw_content.txt','a') as err:
err.write(subject_id+'\n')
# if subject_id =='6018943':
# rawFile.seek(0)
rawFile.close()
adjFile = open(adjPath,'r')
adj = adjFile.read()
adjFile.close()
raw = getRidOfBOM(raw)
adj = getRidOfBOM(adj)
if raw != '':
rawDict = json.loads(raw)
adjDict = json.loads(adj)
rawAll = rawDict['summary'] +' '+ rawDict['user_tags'] + ' '+rawDict['comments']
summary_adjs = adjDict['summary_adjs']
comments_adjs = adjDict['comments_adjs']
title = adjDict['title']
rating_average = adjDict['rating_average']
comments_count = adjDict['comments_count']
doc = Document()
doc.add(Field("folder",folder,t1))
doc.add(Field("title", title, t1))
doc.add(Field("subject_id", subject_id, t1))
doc.add(IntField("comments_count", comments_count, Field.Store.YES))
doc.add(FloatField("rating_average", rating_average, Field.Store.YES))
if len(summary_adjs)>0:
print summary_adjs
exit()
doc.add(Field("summary_adjs", summary_adjs, t2))
#if len(comments_adjs)>0:
doc.add(Field("comments_adjs", comments_adjs, t2))
writer.addDocument(doc)