当前位置: 首页>>代码示例>>Python>>正文


Python FieldType.setIndexed方法代码示例

本文整理汇总了Python中org.apache.lucene.document.FieldType.setIndexed方法的典型用法代码示例。如果您正苦于以下问题:Python FieldType.setIndexed方法的具体用法?Python FieldType.setIndexed怎么用?Python FieldType.setIndexed使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.apache.lucene.document.FieldType的用法示例。


在下文中一共展示了FieldType.setIndexed方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: index_docs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
    def index_docs(self, tweets, writer):
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        t1.setStoreTermVectors(True)
        t1.setStoreTermVectorOffsets(True)

        # add each tweet to the index
        for tweet in tweets:
            try:
                # strip out URLs because they provide false index matches
                contents = []
                for word in tweet[1].text.split():
                    if word.startswith("http://") or word.startswith("https://"):
                        continue
                    contents.append(word)
                contents = " ".join(contents)

                if len(contents) == 0: continue

                doc = Document()
                doc.add(Field("contents", contents, t1))
                writer.addDocument(doc)
            except Exception, e:
                print "Failed in index_docs:", e
开发者ID:ryancutter,项目名称:OnlyWorthy,代码行数:29,代码来源:onlyworthy_dev.py

示例2: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
    def indexDocs(self, root, writer):

        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        
        for root, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.html'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'iso-8859-1')
                    file.close()
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", root, t1))
                    if len(contents) > 0:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
开发者ID:devs4v,项目名称:devs4v-information-retrieval15,代码行数:36,代码来源:IndexFiles.py

示例3: index_docs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
    def index_docs(self, train_set, writer):
        """Index each training record as a single Lucene document.

        Metadata (answer, qid, category, position) is stored verbatim;
        the question text and the answer's wiki page body are fully
        analyzed with term vectors, payloads and positions.
        """
        keyword = FieldType()
        keyword.setIndexed(True)
        keyword.setStored(True)
        keyword.setTokenized(False)
        keyword.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        fulltext = FieldType()
        fulltext.setIndexed(True)
        fulltext.setStored(True)
        fulltext.setTokenized(True)
        fulltext.setStoreTermVectorOffsets(True)
        fulltext.setStoreTermVectorPayloads(True)
        fulltext.setStoreTermVectorPositions(True)
        fulltext.setStoreTermVectors(True)
        fulltext.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for record in train_set:
            doc = Document()
            # Verbatim metadata fields, in the original add order.
            for field_name, key in (("answer", 'Answer'),
                                    ("qid", 'Question ID'),
                                    ("category", 'category'),
                                    ("position", 'Sentence Position')):
                doc.add(Field(field_name, record[key], keyword))
            doc.add(Field("question", record['Question Text'], fulltext))
            doc.add(Field("wiki_plain",
                          self.wiki_reader.get_text(record['Answer']), fulltext))
            writer.addDocument(doc)
开发者ID:sangheestyle,项目名称:nlp2014,代码行数:30,代码来源:index.py

示例4: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
 def indexDocs(self, root, writer):
     """Walk *root* and add every file found to the index.

     "name"/"path" use t1 (stored, not tokenized); "contents" uses t2
     (tokenized, not stored), decoded as ISO-8859-1. Note that the loop
     rebinds *root* to each visited directory, which is what makes the
     "path" field the file's directory rather than the original root.
     """
     t1 = FieldType() # for short items, e.g. file name.
     t1.setIndexed(True)
     t1.setStored(True)
     t1.setTokenized(False)
     t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # DOCS_AND_FREQS_AND_POSITIONS_OFFSETS

     t2 = FieldType() # for content
     t2.setIndexed(True)
     t2.setStored(False) # don't store the original text
     t2.setTokenized(True)
     t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

     for root, dirnames, filenames in os.walk(root):
         for filename in filenames:
             print "adding", filename
             try:
                 path = os.path.join(root, filename)
                 file = open(path)
                 contents = unicode(file.read(), 'iso-8859-1')
                 file.close()
                 doc = Document()
                 doc.add(Field("name", filename, t1))
                 doc.add(Field("path", root, t1))
                 if len(contents) > 0:
                     doc.add(Field("contents", contents, t2))
                 else:
                     print "warning: no content in %s" % filename
                 writer.addDocument(doc)
             except Exception, e:
                 # Best-effort: log the failure and continue with the next file.
                 print "Failed in indexDocs:", e
开发者ID:w2wei,项目名称:XPRC,代码行数:33,代码来源:IndexFiles.py

示例5: index

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
	def index(self, root):

		t = FieldType()
		t.setIndexed(True)
		t.setStored(True)
		t.setTokenized(True)
		t.setStoreTermVectors(True)
		
		for path, dirs, files in os.walk(root):
			
			for file in files:
				
				filePath = os.path.join(path, file)
				fd = open(filePath)
				content = unicode(fd.read(), 'iso-8859-1')
				fd.close()
				
				doc = Document()
				doc.add(Field('name', file, StringField.TYPE_STORED))

				parent = os.path.split(path)[1]
				doc.add(Field('parent', parent, StringField.TYPE_STORED))

				if len(content) > 0:
					doc.add(Field('content', content, t))

				print 'Indexing %s' % file
				self.mWriter.addDocument(doc)

		self.mWriter.commit()
		self.mWriter.close()
开发者ID:haonguyen14,项目名称:CLIFinder,代码行数:33,代码来源:Indexer.py

示例6: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
    def indexDocs(self, url, writer):
        type1 = FieldType()
        type1.setIndexed(True)
        type1.setStored(True)
        type1.setTokenized(False)
        type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
        
        type2 = FieldType()
        type2.setIndexed(True)
        type2.setStored(True)
        type2.setTokenized(True)
        type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        
        # Read Feeds
        feeds = feedparser.parse(url)

        for item in feeds["entries"]:
            print "adding", item["title"] 
            try:
                link = item["link"] 
                contents = item["description"].encode("utf-8")
                contents = re.sub('<[^<]+?>', '', ''.join(contents))
                title = item["title"]
                doc = Document()
                doc.add(Field("url", link, type1))
                doc.add(Field("title", title, type1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, type2))
                else:
                    print "warning: no content in %s" % item["title"] 
                writer.addDocument(doc)
            except Exception, e:
                 print "Failed in indexDocs:", e
开发者ID:yelinkyaw,项目名称:FeedsIndexer,代码行数:35,代码来源:IndexFeeds.py

示例7: _createNoTermsFrequencyFieldType

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def _createNoTermsFrequencyFieldType():
    """Return a frozen FieldType: indexed and tokenized, norms omitted,
    postings limited to document IDs only (no term frequencies)."""
    field_type = FieldType()
    field_type.setTokenized(True)
    field_type.setIndexed(True)
    field_type.setOmitNorms(True)
    field_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    # Freeze so the shared instance cannot be mutated by callers.
    field_type.freeze()
    return field_type
开发者ID:jerryba,项目名称:meresco-lucene,代码行数:10,代码来源:fieldregistry.py

示例8: LuceneDocumentField

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Build the five FieldType variants used by the indexer."""

        def build(stored, tokenized, term_vectors=False, tv_positions=False):
            # Every variant is indexed; storage/tokenization/vectors vary.
            ft = FieldType()
            ft.setIndexed(True)
            ft.setStored(stored)
            ft.setTokenized(tokenized)
            if term_vectors:
                ft.setStoreTermVectors(True)
            if tv_positions:
                ft.setStoreTermVectorPositions(True)
            return ft

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = build(True, False)
        # FIELD_ID_TV: like FIELD_ID plus term vectors (without positions)
        self.field_id_tv = build(True, False, term_vectors=True)
        # FIELD_TEXT: stored, indexed, tokenized
        self.field_text = build(True, True)
        # FIELD_TEXT_TV: tokenized with term vectors (without positions)
        self.field_text_tv = build(True, True, term_vectors=True)
        # FIELD_TEXT_TVP: term vectors and positions (no character offsets)
        self.field_text_tvp = build(True, True, term_vectors=True,
                                    tv_positions=True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        lookup = {
            Lucene.FIELDTYPE_ID: self.field_id,
            Lucene.FIELDTYPE_ID_TV: self.field_id_tv,
            Lucene.FIELDTYPE_TEXT: self.field_text,
            Lucene.FIELDTYPE_TEXT_TV: self.field_text_tv,
            Lucene.FIELDTYPE_TEXT_TVP: self.field_text_tvp,
        }
        result = lookup.get(type)
        if result is None:
            raise Exception("Unknown field type")
        return result

示例9: Indexer

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
class Indexer(object):
    """Thin wrapper around a Lucene IndexWriter plus two prebuilt field types."""

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Indexer

        :param root: The output directory of the underlying index
            (NOTE(review): earlier docs called this "output", but the code
            reads kwargs["root"] — confirm which name callers actually use)
        :param analyzer: The overloaded analyzer to work with
        """
        self.output = kwargs.get("root", "index")
        # Side effect: creates the index directory if it does not exist.
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        # Cap tokens per document so huge inputs cannot blow up the index.
        self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT))
        self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        # CREATE mode: any existing index at this path is overwritten.
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.store = SimpleFSDirectory(File(self.output))
        self.writer = IndexWriter(self.store, self.config)
        self.create_field_types()

    def index(self, document):
        """ Given a new document, add it to the index.

        :param document: The document to add to the indexer
        """
        try:
            self.writer.addDocument(document)
        except Exception:
            # Best-effort: log and keep the writer usable for later documents.
            logger.exception("Failed to index the supplied document")

    def shutdown(self):
        """ Shutdown the currently processing indexer.
        """
        try:
            # self.writer.optimize()
            self.writer.close()
        except Exception:
            logger.exception("Failed to shutdown the indexer correctly")

    def create_field_types(self):
        """ Create the field types that will be used to specify
        what actions lucene should take on the various fields
        supplied to index.
        """
        # "clean": stored verbatim keys (not tokenized), docs+freqs only.
        self.field_clean = FieldType()
        self.field_clean.setIndexed(True)
        self.field_clean.setStored(True)
        self.field_clean.setTokenized(False)
        self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # "dirty": analyzed full text, not stored, with positions.
        self.field_dirty = FieldType()
        self.field_dirty.setIndexed(True)
        self.field_dirty.setStored(False)
        self.field_dirty.setTokenized(True)
        self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

示例10: setUp

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
    def setUp(self):
        """Regression fixture for bug 1842: build a one-document index whose
        "all" field is indexed (not stored) with term vectors enabled."""
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        # getWriter is provided by the PyLucene test base class.
        w1 = self.getWriter(analyzer=self.analyzer)
        doc1 = Document()

        ftype = FieldType()
        ftype.setStored(False)
        ftype.setIndexed(True)
        ftype.setStoreTermVectors(True)
        doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
        doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        w1.addDocument(doc1)
        w1.close()
开发者ID:devs4v,项目名称:devs4v-information-retrieval15,代码行数:19,代码来源:test_bug1842.py

示例11: index_article

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index_article(writer, art_id, art_body):
    """Add one article to the index via *writer*.

    The id is stored verbatim (not tokenized); the body is analyzed
    with positions so phrase queries work.
    """
    id_type = FieldType()
    id_type.setIndexed(True)
    id_type.setStored(True)
    id_type.setTokenized(False)
    id_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    body_type = FieldType()
    body_type.setIndexed(True)
    body_type.setStored(True)
    body_type.setTokenized(True)
    body_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    document = Document()
    document.add(Field("art_id", str(art_id), id_type))
    document.add(Field("art_body", art_body, body_type))

    writer.addDocument(document)
开发者ID:andrely,项目名称:vg-pipeline,代码行数:20,代码来源:indexing.py

示例12: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
 def indexDocs(self, root, writer): 
      
     #Create a new FieldType with default properties. 
     t1 = FieldType() 
     t1.setIndexed(True) 
     t1.setStored(True) 
     t1.setTokenized(False)#True if this field's value should be analyzed by the Analyzer. 
     t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) 
      
     #Create a new FieldType with default properties. 
     t2 = FieldType() 
     t2.setIndexed(True) 
     t2.setStored(True) 
     t2.setTokenized(True)#True if this field's value should be analyzed by the Analyzer. 
     t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) 
      
     for root, dirnames, filenames in os.walk(root): 
         for filename in filenames: 
             if not filename.endswith('.txt'): 
                 continue 
             print 'adding', filename 
             try: 
                 path = os.path.join(root, filename) 
                 file = open(path) 
                 contents = file.read() 
                 file.close() 
                 doc = Document() 
                 doc.add(Field('name', filename, t1)) 
                 doc.add(Field('path', root, t1)) 
                 if len(contents) > 0: 
                     doc.add(Field('contents', contents, t2)) 
                     print 'length of content is %d'%(len(contents)) 
                 else: 
                     print 'warning: no content in %s' % filename 
                 writer.addDocument(doc) 
             except Exception, e: 
                 print 'Failed in indexDocs:', e 
开发者ID:ouceduxzk,项目名称:AI2-Kaggle,代码行数:39,代码来源:indexer.py

示例13: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
	def indexDocs(self,root,writer):
		"""Walk *root* and index only the name and path of each .c file.

		Content indexing (field type t2) is currently disabled — the
		doc.add for "contents" is commented out below — so t2 and the
		decoded file contents are read but unused at present.
		"""
		t1 = FieldType()
		t1.setIndexed(True)
		t1.setStored(True)
		t1.setTokenized(True)
		t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		# t2 would hold analyzed file contents (not stored, with positions).
		t2 = FieldType()
		t2.setIndexed(True)
		t2.setStored(False)
		t2.setTokenized(True)
		t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

		for root, dirnames,filenames in os.walk(root):
			# traverse through the doc directory
			for filename in filenames:
				# only if this file ends with '.c'
				if not filename.endswith('.c'):
					continue
				try:
					# only add the filename and path for indexing
					path = os.path.join(root,filename)
					print "adding file : ",path
					file = open(path)
					contents = unicode(file.read(),'utf-8')
					file.close()
					doc = Document()
					doc.add(Field("name",filename,t1))
					doc.add(Field("path",root,t1))
				#	if len(contents) > 0:
				#		doc.add(Field("contents",contents,t2))
				#	else:
				#		print "warning: no content in ",filename
					writer.addDocument(doc)
				except Exception,e:
					# Best-effort: report and continue with the next file.
					print "failed in indexDocs:",e

示例14: index_wiki

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
def index_wiki(wiki_xmlfile, index_directory_name):
    """Parse a wiki XML dump and build a fresh Lucene index from it.

    Each document gets analyzed "contents" and "title" fields plus a
    verbatim (un-tokenized) "url" field; all three are stored.
    """
    lucene.initVM()
    # Index directory, analyzer and writer setup.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)  # overwrite prior index
    writer = IndexWriter(store, config)

    def stored_fieldtype(tokenized):
        # All field types here are stored, indexed, with full positions.
        ft = FieldType()
        ft.setIndexed(True)
        ft.setStored(True)
        ft.setTokenized(tokenized)
        ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        return ft

    content_fieldtype = stored_fieldtype(True)
    title_fieldtype = stored_fieldtype(True)
    url_fieldtype = stored_fieldtype(False)  # URLs are matched verbatim

    for xmldoc in wikicorpusxml((wiki_xmlfile)):
        # Crude attribute extraction from the <doc ...> wrapper element.
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()

示例15: indexDocs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setIndexed [as 别名]
    def indexDocs(self, writer):
        """Index Douban subject JSON files found under the module-level
        rawDir/adjDir directory trees.

        Depends on module globals: rawDir, adjDir, baseDir and the
        getRidOfBOM helper — none of which are defined in this method.

        NOTE(review): if raw == '' the rawDict assignment is skipped, so
        rawAll below either raises NameError or silently reuses the
        previous iteration's rawDict — confirm intended behavior.
        NOTE(review): the `exit()` call when summary_adjs is non-empty
        looks like a debugging leftover; it terminates the whole run on
        the first subject that has summary adjectives.
        NOTE(review): rawAll is computed but never added to the document.
        """
        # t1: stored verbatim metadata fields.
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # t2: analyzed adjective text, not stored, with positions.
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)



        for folder in os.listdir(rawDir):
            for fileName in os.listdir(rawDir+'/'+folder):
                if not fileName.endswith('.json'):
                    continue


                print fileName

                # File names look like "<subject_id>_res.json".
                subject_id = fileName.split('_')[0]

                # print rawDict
                print 'id:'+subject_id
                print 'folder:'+str(folder)

                rawPath = rawDir+'/' +folder +'/'+subject_id+'_res.json'
                adjPath = adjDir+'/' +folder +'/'+subject_id+'_adj.json'
                #tfidfPath = tf_idfDir + '/' + subject_id+'_tfidf.json'

                print adjPath

                rawFile = open(rawPath,'r')
                raw = rawFile.read()
                # Record subjects whose raw file is empty for later review.
                if raw =='':
                    with open(baseDir+'/'+'err_no_raw_content.txt','a') as err:
                        err.write(subject_id+'\n')
                # if subject_id =='6018943':
                #     rawFile.seek(0)

                rawFile.close()
                adjFile = open(adjPath,'r')
                adj = adjFile.read()

                adjFile.close()

                # Strip any UTF-8 BOM before JSON parsing.
                raw = getRidOfBOM(raw)
                adj = getRidOfBOM(adj)

                if raw != '':
                    rawDict = json.loads(raw)
                adjDict = json.loads(adj)

                rawAll = rawDict['summary'] +' '+ rawDict['user_tags'] + ' '+rawDict['comments']
                summary_adjs = adjDict['summary_adjs']
                comments_adjs = adjDict['comments_adjs']
                title = adjDict['title']
                rating_average = adjDict['rating_average']
                comments_count = adjDict['comments_count']

                doc = Document()
                doc.add(Field("folder",folder,t1))
                doc.add(Field("title", title, t1))
                doc.add(Field("subject_id", subject_id, t1))
                doc.add(IntField("comments_count", comments_count, Field.Store.YES))
                doc.add(FloatField("rating_average", rating_average, Field.Store.YES))


                # See NOTE(review) in the docstring: this halts the program.
                if len(summary_adjs)>0:
                    print summary_adjs
                    exit()
                doc.add(Field("summary_adjs", summary_adjs, t2))

                #if len(comments_adjs)>0:
                doc.add(Field("comments_adjs", comments_adjs, t2))

                writer.addDocument(doc)


注:本文中的org.apache.lucene.document.FieldType.setIndexed方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。