本文整理匯總了Python中org.apache.lucene.document.FieldType.setTokenized方法的典型用法代碼示例。如果您正苦於以下問題:Python FieldType.setTokenized方法的具體用法?Python FieldType.setTokenized怎麽用?Python FieldType.setTokenized使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類org.apache.lucene.document.FieldType
的用法示例。
在下文中一共展示了FieldType.setTokenized方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: index
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def index(self, root):
t = FieldType()
t.setIndexed(True)
t.setStored(True)
t.setTokenized(True)
t.setStoreTermVectors(True)
for path, dirs, files in os.walk(root):
for file in files:
filePath = os.path.join(path, file)
fd = open(filePath)
content = unicode(fd.read(), 'iso-8859-1')
fd.close()
doc = Document()
doc.add(Field('name', file, StringField.TYPE_STORED))
parent = os.path.split(path)[1]
doc.add(Field('parent', parent, StringField.TYPE_STORED))
if len(content) > 0:
doc.add(Field('content', content, t))
print 'Indexing %s' % file
self.mWriter.addDocument(doc)
self.mWriter.commit()
self.mWriter.close()
示例2: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def indexDocs(self, root, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.html'):
continue
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
if len(contents) > 0:
doc.add(Field("contents", contents, t2))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例3: index_docs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def index_docs(self, train_set, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectorOffsets(True)
t2.setStoreTermVectorPayloads(True)
t2.setStoreTermVectorPositions(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for ii in train_set:
doc = Document()
doc.add(Field("answer", ii['Answer'], t1))
doc.add(Field("qid", ii['Question ID'], t1))
doc.add(Field("category", ii['category'], t1))
doc.add(Field("position", ii['Sentence Position'], t1))
doc.add(Field("question", ii['Question Text'], t2))
doc.add(Field("wiki_plain",
self.wiki_reader.get_text(ii['Answer']), t2))
writer.addDocument(doc)
示例4: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def indexDocs(self, url, writer):
type1 = FieldType()
type1.setIndexed(True)
type1.setStored(True)
type1.setTokenized(False)
type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
type2 = FieldType()
type2.setIndexed(True)
type2.setStored(True)
type2.setTokenized(True)
type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
# Read Feeds
feeds = feedparser.parse(url)
for item in feeds["entries"]:
print "adding", item["title"]
try:
link = item["link"]
contents = item["description"].encode("utf-8")
contents = re.sub('<[^<]+?>', '', ''.join(contents))
title = item["title"]
doc = Document()
doc.add(Field("url", link, type1))
doc.add(Field("title", title, type1))
if len(contents) > 0:
doc.add(Field("contents", contents, type2))
else:
print "warning: no content in %s" % item["title"]
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例5: index_docs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def index_docs(self, tweets, writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(True)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
t1.setStoreTermVectors(True)
t1.setStoreTermVectorOffsets(True)
# add each tweet to the index
for tweet in tweets:
try:
# strip out URLs because they provide false index matches
contents = []
for word in tweet[1].text.split():
if word.startswith("http://") or word.startswith("https://"):
continue
contents.append(word)
contents = " ".join(contents)
if len(contents) == 0: continue
doc = Document()
doc.add(Field("contents", contents, t1))
writer.addDocument(doc)
except Exception, e:
print "Failed in index_docs:", e
示例6: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def indexDocs(self, root, writer):
t1 = FieldType() # for short items, e.g. file name.
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS
t2 = FieldType() # for content
t2.setIndexed(True)
t2.setStored(False) # don't store the original text
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
print "adding", filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = unicode(file.read(), 'iso-8859-1')
file.close()
doc = Document()
doc.add(Field("name", filename, t1))
doc.add(Field("path", root, t1))
if len(contents) > 0:
doc.add(Field("contents", contents, t2))
else:
print "warning: no content in %s" % filename
writer.addDocument(doc)
except Exception, e:
print "Failed in indexDocs:", e
示例7: _createNoTermsFrequencyFieldType
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def _createNoTermsFrequencyFieldType():
f = FieldType()
f.setIndexed(True)
f.setTokenized(True)
f.setOmitNorms(True)
f.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
f.freeze()
return f
示例8: LuceneDocumentField
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
class LuceneDocumentField(object):
"""Internal handler class for possible field types"""
def __init__(self):
"""Init possible field types"""
# FIELD_ID: stored, indexed, non-tokenized
self.field_id = FieldType()
self.field_id.setIndexed(True)
self.field_id.setStored(True)
self.field_id.setTokenized(False)
# FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
# for storing IDs with term vector info
self.field_id_tv = FieldType()
self.field_id_tv.setIndexed(True)
self.field_id_tv.setStored(True)
self.field_id_tv.setTokenized(False)
self.field_id_tv.setStoreTermVectors(True)
# FIELD_TEXT: stored, indexed, tokenized, with positions
self.field_text = FieldType()
self.field_text.setIndexed(True)
self.field_text.setStored(True)
self.field_text.setTokenized(True)
# FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
self.field_text_tv = FieldType()
self.field_text_tv.setIndexed(True)
self.field_text_tv.setStored(True)
self.field_text_tv.setTokenized(True)
self.field_text_tv.setStoreTermVectors(True)
# FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
# (but no character offsets)
self.field_text_tvp = FieldType()
self.field_text_tvp.setIndexed(True)
self.field_text_tvp.setStored(True)
self.field_text_tvp.setTokenized(True)
self.field_text_tvp.setStoreTermVectors(True)
self.field_text_tvp.setStoreTermVectorPositions(True)
def get_field(self, type):
"""Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
if type == Lucene.FIELDTYPE_ID:
return self.field_id
elif type == Lucene.FIELDTYPE_ID_TV:
return self.field_id_tv
elif type == Lucene.FIELDTYPE_TEXT:
return self.field_text
elif type == Lucene.FIELDTYPE_TEXT_TV:
return self.field_text_tv
elif type == Lucene.FIELDTYPE_TEXT_TVP:
return self.field_text_tvp
else:
raise Exception("Unknown field type")
示例9: Indexer
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
class Indexer(object):
def __init__(self, **kwargs):
""" Initialize a new instance of the Indexer
:param output: The output directory of the underlying index
:param anaylzer: The overloaded analyzer to work with
"""
self.output = kwargs.get("root", "index")
if not os.path.exists(self.output):
os.mkdir(self.output)
self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
self.store = SimpleFSDirectory(File(self.output))
self.writer = IndexWriter(self.store, self.config)
self.create_field_types()
def index(self, document):
""" Given a new document, add it to the index.
:param document: The document to add to the indexer
"""
try:
self.writer.addDocument(document)
except Exception:
logger.exception("Failed to index the supplied document")
def shutdown(self):
""" Shutdown the currently processing indexer.
"""
try:
# self.writer.optimize()
self.writer.close()
except Exception:
logger.exception("Failed to shutdown the indexer correctly")
def create_field_types(self):
""" Create the field types that will be used to specify
what actions lucene should take on the various fields
supplied to index.
"""
self.field_clean = FieldType()
self.field_clean.setIndexed(True)
self.field_clean.setStored(True)
self.field_clean.setTokenized(False)
self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
self.field_dirty = FieldType()
self.field_dirty.setIndexed(True)
self.field_dirty.setStored(False)
self.field_dirty.setTokenized(True)
self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
示例10: index_article
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def index_article(writer, art_id, art_body):
art_id_field = FieldType()
art_id_field.setIndexed(True)
art_id_field.setStored(True)
art_id_field.setTokenized(False)
art_id_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
art_body_field = FieldType()
art_body_field.setIndexed(True)
art_body_field.setStored(True)
art_body_field.setTokenized(True)
art_body_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
doc = Document()
doc.add(Field("art_id", str(art_id), art_id_field))
doc.add(Field("art_body", art_body, art_body_field))
writer.addDocument(doc)
示例11: setUp
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def setUp(self):
super(Test_Bug1842, self).setUp()
self.analyzer = StandardAnalyzer()
w1 = self.getWriter(analyzer=self.analyzer)
doc1 = Document()
ftype = FieldType()
ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
ftype.setTokenized(True)
ftype.setStoreTermVectors(True)
ftype.freeze()
doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
w1.addDocument(doc1)
w1.close()
示例12: lazyImport
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def lazyImport():
global imported
if imported:
return
from meresco.pylucene import getJVM
getJVM()
from java.nio.file import Paths
from org.apache.lucene.document import Document, StringField, Field, FieldType
from org.apache.lucene.search import IndexSearcher, TermQuery
from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
UNINDEXED_TYPE = FieldType()
UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
UNINDEXED_TYPE.setStored(True)
UNINDEXED_TYPE.setTokenized(False)
imported = True
globals().update(locals())
示例13: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def indexDocs(self, root, writer):
#Create a new FieldType with default properties.
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer.
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
#Create a new FieldType with default properties.
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(True)
t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer.
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
for root, dirnames, filenames in os.walk(root):
for filename in filenames:
if not filename.endswith('.txt'):
continue
print 'adding', filename
try:
path = os.path.join(root, filename)
file = open(path)
contents = file.read()
file.close()
doc = Document()
doc.add(Field('name', filename, t1))
doc.add(Field('path', root, t1))
if len(contents) > 0:
doc.add(Field('contents', contents, t2))
print 'length of content is %d'%(len(contents))
else:
print 'warning: no content in %s' % filename
writer.addDocument(doc)
except Exception, e:
print 'Failed in indexDocs:', e
示例14: indexDocs
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def indexDocs(self,root,writer):
t1 = FieldType()
t1.setIndexed(True)
t1.setStored(True)
t1.setTokenized(True)
t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
t2 = FieldType()
t2.setIndexed(True)
t2.setStored(False)
t2.setTokenized(True)
t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for root, dirnames,filenames in os.walk(root):
# traverse through the doc directory
for filename in filenames:
# only if this file ends with '.c'
if not filename.endswith('.c'):
continue
try:
# only add the filename and path for indexing
path = os.path.join(root,filename)
print "adding file : ",path
file = open(path)
contents = unicode(file.read(),'utf-8')
file.close()
doc = Document()
doc.add(Field("name",filename,t1))
doc.add(Field("path",root,t1))
# if len(contents) > 0:
# doc.add(Field("contents",contents,t2))
# else:
# print "warning: no content in ",filename
writer.addDocument(doc)
except Exception,e:
print "failed in indexDocs:",e
示例15: index_wiki
# 需要導入模塊: from org.apache.lucene.document import FieldType [as 別名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 別名]
def index_wiki(wiki_xmlfile, index_directory_name):
lucene.initVM()
# Initialize index directory and analyzer.
version = Version.LUCENE_CURRENT
store = FSDirectory.open(File(index_directory_name))
analyzer = StandardAnalyzer(version)
# Creates config file.
config = IndexWriterConfig(version, analyzer)
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(store, config)
# Set document content field type.
content_fieldtype = FieldType()
content_fieldtype.setIndexed(True)
content_fieldtype.setStored(True)
content_fieldtype.setTokenized(True)
content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
# Set document title field type.
title_fieldtype = FieldType()
title_fieldtype.setIndexed(True)
title_fieldtype.setStored(True)
title_fieldtype.setTokenized(True)
title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
# Set document url field type.
url_fieldtype = FieldType()
url_fieldtype.setIndexed(True)
url_fieldtype.setStored(True)
url_fieldtype.setTokenized(False)
url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
for xmldoc in wikicorpusxml((wiki_xmlfile)):
content = xmldoc.partition('>')[2].partition('<')[0].strip()
title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
doc = Document()
doc.add(Field("contents", content, content_fieldtype))
doc.add(Field("title", title, title_fieldtype))
doc.add(Field("url", url, url_fieldtype))
writer.addDocument(doc)
writer.commit()
writer.close()