本文整理汇总了Python中org.apache.lucene.document.FieldType.setTokenized方法的典型用法代码示例。如果您正苦于以下问题:Python FieldType.setTokenized方法的具体用法?Python FieldType.setTokenized怎么用?Python FieldType.setTokenized使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.document.FieldType的用法示例。
在下文中一共展示了FieldType.setTokenized方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: index
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index(self, root):
    """Index every file found below *root*, then commit and close the writer.

    Each document receives:

    * ``name``    -- the file name (``StringField.TYPE_STORED``)
    * ``parent``  -- the last path component of the containing directory
      (``StringField.TYPE_STORED``)
    * ``content`` -- the file text decoded as ISO-8859-1; indexed, stored,
      tokenized, with term vectors.  Skipped when the file is empty.

    :param root: directory that is walked recursively with ``os.walk``.
    """
    content_type = FieldType()
    content_type.setIndexed(True)
    content_type.setStored(True)
    content_type.setTokenized(True)
    content_type.setStoreTermVectors(True)

    for path, dirs, files in os.walk(root):
        # Same for every file in this directory, so compute it once.
        parent = os.path.split(path)[1]
        for file_name in files:
            file_path = os.path.join(path, file_name)
            # ``with`` guarantees the handle is closed even if reading or
            # decoding raises.
            with open(file_path) as fd:
                content = unicode(fd.read(), 'iso-8859-1')

            doc = Document()
            doc.add(Field('name', file_name, StringField.TYPE_STORED))
            doc.add(Field('parent', parent, StringField.TYPE_STORED))
            if content:
                doc.add(Field('content', content, content_type))
            print('Indexing %s' % file_name)
            self.mWriter.addDocument(doc)

    # NOTE(review): the scraped original lost its indentation; commit/close
    # are assumed to run once, after the whole tree has been walked.
    self.mWriter.commit()
    self.mWriter.close()
示例2: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def indexDocs(self, root, writer):
    """Add every ``.html`` file found below *root* to *writer*.

    Each document gets a stored, untokenized ``name`` (file name) and
    ``path`` (containing directory) field.  Non-empty files also get a
    tokenized, unstored ``contents`` field holding the file text decoded
    as ISO-8859-1.  A failure on one file is printed and the walk goes on.

    :param root: directory that is walked recursively with ``os.walk``.
    :param writer: object whose ``addDocument`` receives each document.
    """
    # Short metadata values: indexed as a single term and stored.
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # File body: tokenized with positions, original text not stored.
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # A separate loop name keeps the ``root`` argument intact.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue
            print("adding %s" % filename)
            try:
                path = os.path.join(dirpath, filename)
                # ``with`` closes the handle even if read/decode raises.
                with open(path) as source:
                    contents = unicode(source.read(), 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", dirpath, t1))
                if contents:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Deliberately broad: one bad file must not abort the run.
                print("Failed in indexDocs: %s" % (e,))
示例3: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index_docs(self, train_set, writer):
    """Turn every row of *train_set* into one document and add it to *writer*.

    The ``answer``, ``qid``, ``category`` and ``position`` fields are stored
    and indexed untokenized.  ``question`` and ``wiki_plain`` are stored and
    tokenized, with term vectors including offsets, payloads and positions.
    ``wiki_plain`` is whatever ``self.wiki_reader.get_text`` returns for the
    row's answer.

    :param train_set: iterable of mappings with the keys ``'Answer'``,
        ``'Question ID'``, ``'category'``, ``'Sentence Position'`` and
        ``'Question Text'``.
    :param writer: object whose ``addDocument`` receives each document.
    """
    exact_type = FieldType()
    exact_type.setIndexed(True)
    exact_type.setStored(True)
    exact_type.setTokenized(False)
    exact_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    text_type = FieldType()
    text_type.setIndexed(True)
    text_type.setStored(True)
    text_type.setTokenized(True)
    text_type.setStoreTermVectorOffsets(True)
    text_type.setStoreTermVectorPayloads(True)
    text_type.setStoreTermVectorPositions(True)
    text_type.setStoreTermVectors(True)
    text_type.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # (field name in the index, key in the training row), in insertion order.
    exact_fields = (
        ("answer", 'Answer'),
        ("qid", 'Question ID'),
        ("category", 'category'),
        ("position", 'Sentence Position'),
    )

    for row in train_set:
        document = Document()
        for field_name, row_key in exact_fields:
            document.add(Field(field_name, row[row_key], exact_type))
        document.add(Field("question", row['Question Text'], text_type))
        wiki_text = self.wiki_reader.get_text(row['Answer'])
        document.add(Field("wiki_plain", wiki_text, text_type))
        writer.addDocument(document)
示例4: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def indexDocs(self, url, writer):
    """Parse the feed at *url* and add one document per entry to *writer*.

    Each document gets a stored, untokenized ``url`` (entry link) and
    ``title`` field.  Entries whose description is non-empty after tag
    stripping also get a stored, tokenized ``contents`` field indexed with
    positions and offsets.  A failure on one entry is printed and the
    loop continues.

    :param url: feed location handed to ``feedparser.parse``.
    :param writer: object whose ``addDocument`` receives each document.
    """
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(True)
    type2.setTokenized(True)
    type2.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    # Matches anything that looks like a markup tag; compiled once because
    # it is applied to every entry.
    tag_pattern = re.compile(r'<[^<]+?>')

    # Read feeds
    feeds = feedparser.parse(url)
    for item in feeds["entries"]:
        print("adding %s" % item["title"])
        try:
            link = item["link"]
            # The description is UTF-8 encoded before the tags are removed
            # (the original's extra ''.join() on the string was a no-op).
            contents = tag_pattern.sub('', item["description"].encode("utf-8"))
            title = item["title"]
            doc = Document()
            doc.add(Field("url", link, type1))
            doc.add(Field("title", title, type1))
            if contents:
                doc.add(Field("contents", contents, type2))
            else:
                print("warning: no content in %s" % item["title"])
            writer.addDocument(doc)
        except Exception as e:
            # Deliberately broad: one bad entry must not abort the feed.
            print("Failed in indexDocs: %s" % (e,))
示例5: index_docs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index_docs(self, tweets, writer):
    """Add the URL-free text of every tweet to *writer*.

    Each document has a single ``contents`` field: stored, tokenized,
    indexed with positions, with term vectors and term-vector offsets.
    Tweets whose text is empty once URLs are removed are skipped.  A
    failure on one tweet is printed and the loop continues.

    :param tweets: iterable of sequences whose element ``[1]`` exposes a
        ``text`` attribute.
    :param writer: object whose ``addDocument`` receives each document.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    url_prefixes = ("http://", "https://")

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            words = [word for word in tweet[1].text.split()
                     if not word.startswith(url_prefixes)]
            contents = " ".join(words)
            if not contents:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception as e:
            # Deliberately broad: one bad tweet must not abort the run.
            print("Failed in index_docs: %s" % (e,))
示例6: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def indexDocs(self, root, writer):
    """Add every file found below *root* to *writer*.

    Each document gets a stored, untokenized ``name`` (file name) and
    ``path`` (containing directory) field.  Non-empty files also get a
    tokenized, unstored ``contents`` field holding the file text decoded
    as ISO-8859-1.  A failure on one file is printed and the walk goes on.

    :param root: directory that is walked recursively with ``os.walk``.
    :param writer: object whose ``addDocument`` receives each document.
    """
    t1 = FieldType()  # for short items, e.g. file name.
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()  # for content
    t2.setIndexed(True)
    t2.setStored(False)  # don't store the original text
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # A separate loop name keeps the ``root`` argument intact.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print("adding %s" % filename)
            try:
                path = os.path.join(dirpath, filename)
                # ``with`` closes the handle even if read/decode raises.
                with open(path) as source:
                    contents = unicode(source.read(), 'iso-8859-1')
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", dirpath, t1))
                if contents:
                    doc.add(Field("contents", contents, t2))
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Deliberately broad: one bad file must not abort the run.
                print("Failed in indexDocs: %s" % (e,))
示例7: _createNoTermsFrequencyFieldType
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def _createNoTermsFrequencyFieldType():
    """Return a frozen FieldType that is indexed and tokenized, omits norms
    and uses the ``DOCS_ONLY`` index options."""
    field_type = FieldType()
    field_type.setIndexed(True)
    field_type.setTokenized(True)
    field_type.setOmitNorms(True)
    field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    # freeze() is called last, after every setter.
    field_type.freeze()
    return field_type
示例8: LuceneDocumentField
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
class LuceneDocumentField(object):
    """Internal holder of the FieldType objects for each supported field kind."""

    def __init__(self):
        """Create one FieldType per ``Lucene.FIELDTYPE_*`` kind.

        All of them are indexed and stored; they differ in tokenization and
        in which term-vector options are switched on.
        """
        # ID: not tokenized, no term vectors requested.
        self.field_id = self._stored_indexed_type(tokenized=False)
        # ID with term vectors.
        self.field_id_tv = self._stored_indexed_type(
            tokenized=False, term_vectors=True)
        # Text: tokenized, no term vectors requested.
        self.field_text = self._stored_indexed_type(tokenized=True)
        # Text with term vectors.
        self.field_text_tv = self._stored_indexed_type(
            tokenized=True, term_vectors=True)
        # Text with term vectors and term-vector positions.
        self.field_text_tvp = self._stored_indexed_type(
            tokenized=True, term_vectors=True, tv_positions=True)

    @staticmethod
    def _stored_indexed_type(tokenized, term_vectors=False, tv_positions=False):
        """Build an indexed, stored FieldType with the requested options."""
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        if term_vectors:
            field_type.setStoreTermVectors(True)
        if tv_positions:
            field_type.setStoreTermVectorPositions(True)
        return field_type

    def get_field(self, type):
        """Return the FieldType matching an internal ``Lucene.FIELDTYPE_*`` value.

        :raises Exception: if *type* is none of the known values.
        """
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        if type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        if type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        if type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        if type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        raise Exception("Unknown field type")
示例9: Indexer
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
class Indexer(object):
    """Wrapper that owns a Lucene ``IndexWriter`` and the field types used
    when building documents for it."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
            (defaults to ``"index"``); created if it does not exist
        :param analyzer: The analyzer to work with; a ``StandardAnalyzer``
            is used when omitted or ``None``
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            # makedirs also creates missing parents, so nested index paths
            # work as well.
            os.makedirs(self.output)

        # Build the default analyzer only when the caller supplied none.
        analyzer = kwargs.get("analyzer")
        if analyzer is None:
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        # 1048576 is the token-count limit handed to LimitTokenCountAnalyzer.
        self.analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)

        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        Failures are logged, not raised.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.

        Closes the writer; failures are logged, not raised.
        """
        try:
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.

        ``field_clean`` is indexed, stored and not tokenized;
        ``field_dirty`` is indexed, tokenized (with positions) and not stored.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例10: index_article
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index_article(writer, art_id, art_body):
    """Add one article to *writer* as a two-field document.

    ``art_id`` holds ``str(art_id)``: stored, indexed, not tokenized.
    ``art_body`` holds *art_body*: stored, tokenized, indexed with positions.

    :param writer: object whose ``addDocument`` receives the document.
    :param art_id: article identifier; converted with ``str``.
    :param art_body: article text.
    """
    id_type = FieldType()
    id_type.setIndexed(True)
    id_type.setStored(True)
    id_type.setTokenized(False)
    id_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    body_type = FieldType()
    body_type.setIndexed(True)
    body_type.setStored(True)
    body_type.setTokenized(True)
    body_type.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    article = Document()
    id_field = Field("art_id", str(art_id), id_type)
    article.add(id_field)
    body_field = Field("art_body", art_body, body_type)
    article.add(body_field)
    writer.addDocument(article)
示例11: setUp
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def setUp(self):
    """Build a one-document index for the test.

    The document has an ``all`` field (tokenized, indexed with positions,
    with term vectors) and an ``id`` field of type
    ``StringField.TYPE_NOT_STORED``.  The writer is closed afterwards.
    """
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()
    index_writer = self.getWriter(analyzer=self.analyzer)

    vector_type = FieldType()
    vector_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    vector_type.setTokenized(True)
    vector_type.setStoreTermVectors(True)
    vector_type.freeze()

    document = Document()
    document.add(Field("all", "blah blah blah Gesundheit", vector_type))
    document.add(Field('id', '1', StringField.TYPE_NOT_STORED))

    index_writer.addDocument(document)
    index_writer.close()
示例12: lazyImport
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def lazyImport():
    """Import the Lucene classes on first call and publish them as globals.

    Later calls return immediately because the module-level ``imported``
    flag is set.  Every name bound locally in this function (the imported
    classes and ``UNINDEXED_TYPE``) is copied into the module globals by
    the final ``globals().update(locals())``, so the local names here are
    the module's exported names -- do not rename them.
    """
    global imported
    if imported:
        return
    # getJVM() is called before any org.apache.lucene / java import below.
    from meresco.pylucene import getJVM
    getJVM()
    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    # Field type for values that are stored but not indexed or tokenized.
    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)
    # ``imported`` is declared global, so it is set on the module directly
    # and is not part of locals().
    imported = True
    globals().update(locals())
示例13: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def indexDocs(self, root, writer):
    """Add every ``.txt`` file found below *root* to *writer*.

    Each document gets a stored, untokenized ``name`` (file name) and
    ``path`` (containing directory) field.  Non-empty files also get a
    stored, tokenized ``contents`` field indexed with positions and
    offsets; the file content is passed on exactly as read, without
    decoding.  A failure on one file is printed and the walk goes on.

    :param root: directory that is walked recursively with ``os.walk``.
    :param writer: object whose ``addDocument`` receives each document.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    # setTokenized: True if this field's value should be analyzed by the
    # Analyzer.
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    # A separate loop name keeps the ``root`` argument intact.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print('adding %s' % filename)
            try:
                path = os.path.join(dirpath, filename)
                # ``with`` closes the handle even if reading raises.
                with open(path) as source:
                    contents = source.read()
                doc = Document()
                doc.add(Field('name', filename, t1))
                doc.add(Field('path', dirpath, t1))
                if contents:
                    doc.add(Field('contents', contents, t2))
                    print('length of content is %d' % len(contents))
                else:
                    print('warning: no content in %s' % filename)
                writer.addDocument(doc)
            except Exception as e:
                # Deliberately broad: one bad file must not abort the run.
                print('Failed in indexDocs: %s' % (e,))
示例14: indexDocs
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def indexDocs(self, root, writer):
    """Add the name and directory of every ``.c`` file below *root* to *writer*.

    Each document gets a stored, tokenized ``name`` (file name) and
    ``path`` (containing directory) field.  File contents are not indexed.
    A failure on one file is printed and the walk goes on.

    :param root: directory that is walked recursively with ``os.walk``.
    :param writer: object whose ``addDocument`` receives each document.
    """
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # traverse through the doc directory; a separate loop name keeps the
    # ``root`` argument intact.
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # only if this file ends with '.c'
            if not filename.endswith('.c'):
                continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(dirpath, filename)
                print("adding file :  %s" % path)
                # NOTE(review): the decoded text is not indexed, but the
                # read is kept because a file that cannot be opened or
                # decoded as UTF-8 is skipped by the except below, as in
                # the original.  Confirm whether that filter is intended.
                with open(path) as source:
                    unicode(source.read(), 'utf-8')
                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", dirpath, t1))
                writer.addDocument(doc)
            except Exception as e:
                # Deliberately broad: one bad file must not abort the run.
                print("failed in indexDocs: %s" % (e,))
示例15: index_wiki
# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index_wiki(wiki_xmlfile, index_directory_name):
    """Build a fresh Lucene index from the documents in *wiki_xmlfile*.

    Starts the JVM, opens an ``IndexWriter`` in ``CREATE`` mode on
    *index_directory_name*, and adds one document per item yielded by
    ``wikicorpusxml``.  Each document has stored ``contents``, ``title``
    and ``url`` fields; ``url`` is the only one that is not tokenized.
    The writer is committed and closed at the end.

    :param wiki_xmlfile: value handed to ``wikicorpusxml``.
    :param index_directory_name: path of the index directory.
    """
    lucene.initVM()

    # Index directory, analyzer and writer configuration.
    lucene_version = Version.LUCENE_CURRENT
    directory = FSDirectory.open(File(index_directory_name))
    std_analyzer = StandardAnalyzer(lucene_version)
    writer_config = IndexWriterConfig(lucene_version, std_analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(directory, writer_config)

    def positional_type(tokenized):
        # Indexed + stored type with positions; only tokenization varies.
        field_type = FieldType()
        field_type.setIndexed(True)
        field_type.setStored(True)
        field_type.setTokenized(tokenized)
        field_type.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return field_type

    content_fieldtype = positional_type(True)
    title_fieldtype = positional_type(True)
    url_fieldtype = positional_type(False)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        # Body text: everything between the first '>' and the next '<'.
        after_first_tag = xmldoc.partition('>')[2]
        content = after_first_tag.partition('<')[0].strip()
        # Attribute values: text between the attribute opener and the
        # following double quote.
        after_title = xmldoc.partition(' title="')[2]
        title = after_title.partition('"')[0].strip()
        after_url = xmldoc.partition(' url="')[2]
        url = after_url.partition('"')[0].strip()

        document = Document()
        document.add(Field("contents", content, content_fieldtype))
        document.add(Field("title", title, title_fieldtype))
        document.add(Field("url", url, url_fieldtype))
        index_writer.addDocument(document)

    index_writer.commit()
    index_writer.close()