Python FieldType.setTokenized方法代码示例

本文整理汇总了Python中org.apache.lucene.document.FieldType.setTokenized方法的典型用法代码示例。如果您正苦于以下问题：Python FieldType.setTokenized方法的具体用法？Python FieldType.setTokenized怎么用？Python FieldType.setTokenized使用的例子？那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.document.FieldType的用法示例。

在下文中一共展示了FieldType.setTokenized方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: index

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
	def index(self, root):

		t = FieldType()
		t.setIndexed(True)
		t.setStored(True)
		t.setTokenized(True)
		t.setStoreTermVectors(True)
		
		for path, dirs, files in os.walk(root):
			
			for file in files:
				
				filePath = os.path.join(path, file)
				fd = open(filePath)
				content = unicode(fd.read(), 'iso-8859-1')
				fd.close()
				
				doc = Document()
				doc.add(Field('name', file, StringField.TYPE_STORED))

				parent = os.path.split(path)[1]
				doc.add(Field('parent', parent, StringField.TYPE_STORED))

				if len(content) > 0:
					doc.add(Field('content', content, t))

				print 'Indexing %s' % file
				self.mWriter.addDocument(doc)

		self.mWriter.commit()
		self.mWriter.close()

开发者ID:haonguyen14，项目名称:CLIFinder，代码行数:33，代码来源:Indexer.py

示例2: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.html'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'iso-8859-1')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e

开发者ID:devs4v，项目名称:devs4v-information-retrieval15，代码行数:36，代码来源:IndexFiles.py

示例3: index_docs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
    def index_docs(self, train_set, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for ii in train_set:
            doc = Document()
            doc.add(Field("answer", ii['Answer'], t1))
            doc.add(Field("qid", ii['Question ID'], t1))
            doc.add(Field("category", ii['category'], t1))
            doc.add(Field("position", ii['Sentence Position'], t1))
            doc.add(Field("question", ii['Question Text'], t2))
            doc.add(Field("wiki_plain",
                          self.wiki_reader.get_text(ii['Answer']), t2))
            writer.addDocument(doc)

开发者ID:sangheestyle，项目名称:nlp2014，代码行数:30，代码来源:index.py

示例4: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
    def indexDocs(self, url, writer):
        type1 = FieldType()
        type1.setIndexed(True)
        type1.setStored(True)
        type1.setTokenized(False)
        type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        type2 = FieldType()
        type2.setIndexed(True)
        type2.setStored(True)
        type2.setTokenized(True)
        type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        
        # Read Feeds
        feeds = feedparser.parse(url)

        for item in feeds["entries"]:
            print "adding", item["title"] 
            try:
                link = item["link"] 
                contents = item["description"].encode("utf-8")
                contents = re.sub('<[^<]+?>', '', ''.join(contents))
                title = item["title"]
                doc = Document()
                doc.add(Field("url", link, type1))
                doc.add(Field("title", title, type1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, type2))
                else:
                    print "warning: no content in %s" % item["title"] 
                writer.addDocument(doc)
            except Exception, e:
                 print "Failed in indexDocs:", e

开发者ID:yelinkyaw，项目名称:FeedsIndexer，代码行数:35，代码来源:IndexFeeds.py

示例5: index_docs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e

开发者ID:ryancutter，项目名称:OnlyWorthy，代码行数:29，代码来源:onlyworthy_dev.py

示例6: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
 def indexDocs(self, root, writer):
     t1 = FieldType() # for short items, e.g. file name.
     t1.setIndexed(True)
     t1.setStored(True)
     t1.setTokenized(False)
     t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS
     
     t2 = FieldType() # for content
     t2.setIndexed(True)
     t2.setStored(False) # don't store the original text
     t2.setTokenized(True)
     t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
     
     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 contents = unicode(file.read(), 'iso-8859-1')
                 file.close()
                 doc = Document()
                 doc.add(Field("name", filename, t1))
                 doc.add(Field("path", root, t1))
                 if len(contents) > 0:
                     doc.add(Field("contents", contents, t2))
                 else:
                     print "warning: no content in %s" % filename
                 writer.addDocument(doc)
             except Exception, e:
                 print "Failed in indexDocs:", e

开发者ID:w2wei，项目名称:XPRC，代码行数:33，代码来源:IndexFiles.py

示例7: _createNoTermsFrequencyFieldType

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def _createNoTermsFrequencyFieldType():
    f = FieldType()
    f.setIndexed(True)
    f.setTokenized(True)
    f.setOmitNorms(True)
    f.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    f.freeze()
    return f

开发者ID:jerryba，项目名称:meresco-lucene，代码行数:10，代码来源:fieldregistry.py

示例8: LuceneDocumentField

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Init possible field types"""

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")

开发者ID:renespeck，项目名称:TAGME_Reproducibility，代码行数:58，代码来源:lucene_tools.py

示例9: Indexer

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
class Indexer(object):
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param output: The output directory of the underlying index
        :param anaylzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

开发者ID:bashwork，项目名称:common，代码行数:56，代码来源:filesearch.py

示例10: index_article

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index_article(writer, art_id, art_body):
    art_id_field = FieldType()
    art_id_field.setIndexed(True)
    art_id_field.setStored(True)
    art_id_field.setTokenized(False)
    art_id_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    art_body_field = FieldType()
    art_body_field.setIndexed(True)
    art_body_field.setStored(True)
    art_body_field.setTokenized(True)
    art_body_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    doc = Document()
    doc.add(Field("art_id", str(art_id), art_id_field))
    doc.add(Field("art_body", art_body, art_body_field))

    writer.addDocument(doc)

开发者ID:andrely，项目名称:vg-pipeline，代码行数:20，代码来源:indexing.py

示例11: setUp

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
    def setUp(self):
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()

        w1 = self.getWriter(analyzer=self.analyzer)
        doc1 = Document()

        ftype = FieldType()
        ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        ftype.setTokenized(True)
        ftype.setStoreTermVectors(True)
        ftype.freeze()

        doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
        doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        w1.addDocument(doc1)
        w1.close()

开发者ID:svn2github，项目名称:pylucene，代码行数:21，代码来源:test_bug1842.py

示例12: lazyImport

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def lazyImport():
    global imported
    if imported:
        return

    from meresco.pylucene import getJVM
    getJVM()

    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer

    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)

    imported = True
    globals().update(locals())

开发者ID:seecr，项目名称:meresco-lucene，代码行数:25，代码来源:lucenekeyvaluestore.py

示例13: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
 def indexDocs(self, root, writer): 
      
     #Create a new FieldType with default properties. 
     t1 = FieldType() 
     t1.setIndexed(True) 
     t1.setStored(True) 
     t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer. 
     t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) 
      
     #Create a new FieldType with default properties. 
     t2 = FieldType() 
     t2.setIndexed(True) 
     t2.setStored(True) 
     t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer. 
     t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) 
      
     for root, dirnames, filenames in os.walk(root): 
         for filename in filenames: 
             if not filename.endswith('.txt'): 
                 continue 
             print 'adding', filename 
             try: 
                 path = os.path.join(root, filename) 
                 file = open(path) 
                 contents = file.read() 
                 file.close() 
                 doc = Document() 
                 doc.add(Field('name', filename, t1)) 
                 doc.add(Field('path', root, t1)) 
                 if len(contents) > 0: 
                     doc.add(Field('contents', contents, t2)) 
                     print 'length of content is %d'%(len(contents)) 
                 else: 
                     print 'warning: no content in %s' % filename 
                 writer.addDocument(doc) 
             except Exception, e: 
                 print 'Failed in indexDocs:', e

开发者ID:ouceduxzk，项目名称:AI2-Kaggle，代码行数:39，代码来源:indexer.py

示例14: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
	def indexDocs(self,root,writer):
		t1 = FieldType()
		t1.setIndexed(True)
		t1.setStored(True)
		t1.setTokenized(True)
		t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		t2 = FieldType()
		t2.setIndexed(True)
		t2.setStored(False)
		t2.setTokenized(True)
		t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

		for root, dirnames,filenames in os.walk(root):
			# traverse through the doc directory
			for filename in filenames:
				# only if this file ends with '.c'
				if not filename.endswith('.c'):
					continue
				try:
					# only add the filename and path for indexing
					path = os.path.join(root,filename)
					print "adding file : ",path
					file = open(path)
					contents = unicode(file.read(),'utf-8')
					file.close()
					doc = Document()
					doc.add(Field("name",filename,t1))
					doc.add(Field("path",root,t1))
				#	if len(contents) > 0:
				#		doc.add(Field("contents",contents,t2))
				#	else:
				#		print "warning: no content in ",filename
					writer.addDocument(doc)
				except Exception,e:
					print "failed in indexDocs:",e

开发者ID:zz-mars，项目名称:simple-search，代码行数:38，代码来源:indexer.py

示例15: index_wiki

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setTokenized [as 别名]
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)
    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    
    
    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)
     
    writer.commit()
    writer.close()

开发者ID:alvations，项目名称:Wikicorpus，代码行数:46，代码来源:WikiIndex.py

注：本文中的org.apache.lucene.document.FieldType.setTokenized方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。