當前位置: 首頁>>代碼示例>>Python>>正文


Python document.FieldType類代碼示例

本文整理匯總了Python中org.apache.lucene.document.FieldType的典型用法代碼示例。如果您正苦於以下問題:Python FieldType類的具體用法?Python FieldType怎麼用?Python FieldType使用的例子?那麼,這裏精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了FieldType類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: run

    def run(self):
        """Boot the JVM, then serve requests taken from ``self.queue`` forever.

        Every queue item is read as ``data[0]`` (the key under which the
        response is stored in ``self.ret``) and ``data[1]`` (a dict holding an
        ``'action'`` method name and a ``'data'`` payload that contains
        ``'indexdir'``).  The named method is looked up on ``self`` and called
        with the payload.  Its return value is stored in ``self.ret``; ``{}``
        is stored instead when it returns ``None`` or the request fails.
        """
        print("Booting lucene driver worker....")
        lucene.initVM()

        # Indexed and tokenized, but not stored.
        self.fieldType1 = FieldType()
        self.fieldType1.setIndexed(True)
        self.fieldType1.setStored(False)
        self.fieldType1.setTokenized(True)

        # Indexed and stored as a single, untokenized value.
        self.fieldType2 = FieldType()
        self.fieldType2.setIndexed(True)
        self.fieldType2.setStored(True)
        self.fieldType2.setTokenized(False)

        while True:
            data = self.queue.get()
            da = data[1]
            response = None
            directory = None
            try:
                try:
                    self.fil = File(da['data']['indexdir'])
                    directory = self.d = NIOFSDirectory(self.fil)
                    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
                    self.conf = IndexWriterConfig(
                        Version.LUCENE_CURRENT,
                        self.analyzer)

                    response = getattr(self, da['action'])(da['data'])
                finally:
                    # Release the directory even when the action raised;
                    # previously a failing action left it open.
                    if directory is not None:
                        directory.close()
            except Exception as e:
                # Best effort: report the failure and keep the worker alive.
                print(e)
            if response is None:
                response = {}

            self.ret[data[0]] = response
開發者ID:bradleyjones,項目名稱:apiary,代碼行數:34,代碼來源:lucenedriver.py

示例2: index

	def index(self, root):
		"""Index every file below ``root``, then commit and close ``self.mWriter``.

		Each file becomes one document with the fields ``name`` (file name),
		``parent`` (last component of the containing directory path) and,
		when the file is not empty, ``content`` (the file text decoded as
		ISO-8859-1; indexed, stored, tokenized, with term vectors).
		"""
		contentType = FieldType()
		contentType.setIndexed(True)
		contentType.setStored(True)
		contentType.setTokenized(True)
		contentType.setStoreTermVectors(True)

		for path, dirs, files in os.walk(root):
			# Same for every file in this directory, so compute it once.
			parent = os.path.split(path)[1]

			for fileName in files:
				filePath = os.path.join(path, fileName)
				# 'with' closes the handle even if reading or decoding fails.
				with open(filePath) as fd:
					content = unicode(fd.read(), 'iso-8859-1')

				doc = Document()
				doc.add(Field('name', fileName, StringField.TYPE_STORED))
				doc.add(Field('parent', parent, StringField.TYPE_STORED))

				if content:
					doc.add(Field('content', content, contentType))

				print('Indexing %s' % fileName)
				self.mWriter.addDocument(doc)

		self.mWriter.commit()
		self.mWriter.close()
開發者ID:haonguyen14,項目名稱:CLIFinder,代碼行數:31,代碼來源:Indexer.py

示例3: create_field_types

    def create_field_types(self):
        """Build the two FieldType presets used when indexing.

        ``field_clean`` is indexed, stored and untokenized, with documents
        and term frequencies recorded.  ``field_dirty`` is indexed and
        tokenized but not stored, and additionally records positions.
        """
        # (attribute, stored, tokenized, IndexOptions member)
        presets = (
            ("field_clean", True, False, "DOCS_AND_FREQS"),
            ("field_dirty", False, True, "DOCS_AND_FREQS_AND_POSITIONS"),
        )
        for attribute, stored, tokenized, options in presets:
            field_type = FieldType()
            setattr(self, attribute, field_type)
            field_type.setIndexed(True)
            field_type.setStored(stored)
            field_type.setTokenized(tokenized)
            field_type.setIndexOptions(
                getattr(FieldInfo.IndexOptions, options))
開發者ID:bashwork,項目名稱:common,代碼行數:16,代碼來源:filesearch.py

示例4: __init__

    def __init__(self):
        """Create one FieldType preset per supported field flavour.

        Every preset is indexed; the table below lists, per attribute,
        whether it is stored, whether it is tokenized, whether term vectors
        are kept and whether term-vector positions are kept.
        """
        # (attribute, stored, tokenized, term vectors, term-vector positions)
        presets = (
            ("field_id", True, False, False, False),
            ("field_id_tv", True, False, True, False),
            ("field_text", True, True, False, False),
            ("field_text_tv", True, True, True, False),
            ("field_text_tvp", True, True, True, True),
            ("field_text_ntv", False, True, True, False),
            ("field_text_ntvp", False, True, True, True),
        )
        for attribute, stored, tokenized, vectors, positions in presets:
            field_type = FieldType()
            setattr(self, attribute, field_type)
            field_type.setIndexed(True)
            field_type.setStored(stored)
            field_type.setTokenized(tokenized)
            if vectors:
                field_type.setStoreTermVectors(True)
            if positions:
                field_type.setStoreTermVectorPositions(True)
開發者ID:hasibi,項目名稱:ELR-EntityLinkingRetrieval,代碼行數:54,代碼來源:lucene_tools.py

示例5: testBinaryFieldInIndex

    def testBinaryFieldInIndex(self):
        """Round-trip a binary stored field and a string stored field through an index."""

        storedOnly = FieldType()
        storedOnly.setStored(True)

        rawBytes = JArray('byte')(self.binaryValStored)

        doc = Document()
        doc.add(StoredField("binaryStored", rawBytes))
        doc.add(Field("stringStored", self.binaryValStored, storedOnly))

        # both fields must be present on the document
        self.assertEqual(2, doc.fields.size())

        # write the document to the test index
        writer = self.getWriter(analyzer=StandardAnalyzer())
        writer.addDocument(doc)
        writer.close()

        # read the document back
        reader = self.getReader()
        fetched = reader.document(0)
        self.assertTrue(fetched is not None)

        # the binary stored field must match the original value
        binaryRef = fetched.getBinaryValue("binaryStored")
        self.assertEqual(binaryRef.bytes.bytes_, self.binaryValStored)

        # the string field must match the decoded original value
        self.assertEqual(fetched.get("stringStored"),
                         self.binaryValStored.decode())

        reader.close()
開發者ID:svn2github,項目名稱:pylucene,代碼行數:38,代碼來源:test_BinaryDocument.py

示例6: setUp

    def setUp(self):
        """Index a single document with an "all" text field and an "id" field."""
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer()
        writer = self.getWriter(analyzer=self.analyzer)

        # tokenized, positions indexed, term vectors kept; frozen afterwards
        allType = FieldType()
        allType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        allType.setTokenized(True)
        allType.setStoreTermVectors(True)
        allType.freeze()

        document = Document()
        document.add(Field("all", "blah blah blah Gesundheit", allType))
        document.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        writer.addDocument(document)
        writer.close()
開發者ID:svn2github,項目名稱:pylucene,代碼行數:19,代碼來源:test_bug1842.py

示例7: setUp

    def setUp(self):
        """Index a single document with an "all" text field and an "id" field."""
        super(Test_Bug1842, self).setUp()

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        writer = self.getWriter(analyzer=self.analyzer)

        # indexed, not stored, term vectors kept
        vectorType = FieldType()
        vectorType.setStored(False)
        vectorType.setIndexed(True)
        vectorType.setStoreTermVectors(True)

        document = Document()
        document.add(Field("all", "blah blah blah Gesundheit", vectorType))
        document.add(Field('id', '1', StringField.TYPE_NOT_STORED))

        writer.addDocument(document)
        writer.close()
開發者ID:devs4v,項目名稱:devs4v-information-retrieval15,代碼行數:17,代碼來源:test_bug1842.py

示例8: lazyImport

def lazyImport():
    """Import the Java/Lucene names on first call and publish them as module globals.

    The module-level ``imported`` flag makes every later call return
    immediately.
    """
    global imported
    if imported:
        return

    # getJVM() is called before the java/lucene imports below; presumably it
    # starts the JVM -- TODO confirm in meresco.pylucene.
    from meresco.pylucene import getJVM
    getJVM()

    # NOTE: several of these names are not used inside this function, but
    # every local name is exported by the globals().update() call at the end,
    # so none of these imports may be removed or renamed here.
    from java.nio.file import Paths
    from org.apache.lucene.document import Document, StringField, Field, FieldType
    from org.apache.lucene.search import IndexSearcher, TermQuery
    from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer

    # Field type that is stored but not indexed (IndexOptions.NONE) and not tokenized.
    UNINDEXED_TYPE = FieldType()
    UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE)
    UNINDEXED_TYPE.setStored(True)
    UNINDEXED_TYPE.setTokenized(False)

    imported = True
    # Copy every local name bound above (the imports and UNINDEXED_TYPE) into
    # the module namespace.  ``imported`` is declared global, so it is
    # assigned directly rather than through this call.
    globals().update(locals())
開發者ID:seecr,項目名稱:meresco-lucene,代碼行數:23,代碼來源:lucenekeyvaluestore.py

示例9: index_docs

    def index_docs(self, train_set, writer):
        """Add one document per training example to ``writer``.

        The answer, question id, category and sentence position are indexed
        as single stored terms.  The question text and the text returned by
        ``self.wiki_reader.get_text`` for the answer are tokenized and stored
        with term vectors (offsets, payloads and positions).
        """
        # single-term, stored fields
        exact = FieldType()
        exact.setIndexed(True)
        exact.setStored(True)
        exact.setTokenized(False)
        exact.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # tokenized, stored fields with full term vectors
        analyzed = FieldType()
        analyzed.setIndexed(True)
        analyzed.setStored(True)
        analyzed.setTokenized(True)
        analyzed.setStoreTermVectorOffsets(True)
        analyzed.setStoreTermVectorPayloads(True)
        analyzed.setStoreTermVectorPositions(True)
        analyzed.setStoreTermVectors(True)
        analyzed.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # (lucene field name, key in the training example)
        exact_fields = (
            ("answer", 'Answer'),
            ("qid", 'Question ID'),
            ("category", 'category'),
            ("position", 'Sentence Position'),
        )

        for example in train_set:
            doc = Document()
            for field_name, key in exact_fields:
                doc.add(Field(field_name, example[key], exact))
            doc.add(Field("question", example['Question Text'], analyzed))
            wiki_text = self.wiki_reader.get_text(example['Answer'])
            doc.add(Field("wiki_plain", wiki_text, analyzed))
            writer.addDocument(doc)
開發者ID:sangheestyle,項目名稱:nlp2014,代碼行數:28,代碼來源:index.py

示例10: indexTable

    def indexTable(self, writer):
        """Read every row of the ``movie_items`` MySQL table and build a Lucene Document per row.

        The part of this method visible here connects to the
        ``douban_movie_v3`` database on localhost, selects all rows of
        ``movie_items`` and, per row, creates a Document with a computed
        ``boost`` value, a ``year`` string and several numeric/string fields.

        ``writer`` is not referenced in the visible part of this method;
        presumably the documents are added to it further down -- TODO confirm.
        """

        # connection handle (reassigned just below)
        con = None

        #define the index of all the fields
        #---------step 2:connect to mysql----------
        # NOTE(review): host, user, password and database are hard-coded here;
        # consider moving the credentials to configuration.
        con = mdb.connect('localhost','root','testgce','douban_movie_v3')

        # NOTE(review): t_num, t1, t2 and t3 are not used in the visible part
        # of this method -- confirm whether the remainder uses them.
        #t_num = FieldType.NumericType it is wrong!!
        t_num = FieldType()
        t_num.setStored(False)

        # indexed, stored, not tokenized; documents + term frequencies
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # indexed, not stored, tokenized; documents + frequencies + positions
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # indexed, stored, tokenized; documents + term frequencies
        t3 = FieldType()
        t3.setIndexed(True)
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        maxDict = utils.maxDict
        # range of the boost value: [base, upper]
        base = DOC_BOOST_RANGE[0]
        upper = DOC_BOOST_RANGE[1]

        with con:
            # Careful with codecs
            con.set_character_set('utf8')

            cur = con.cursor()
            # Again the codecs
            cur.execute('SET NAMES utf8;')
            cur.execute('SET CHARACTER SET utf8;')
            cur.execute('SET character_set_connection=utf8;')

            #------step 3: choose the right table------
            cur.execute("SELECT * FROM movie_items")

            numrows = int(cur.rowcount)
            print 'numrows:',numrows
            for i in range(numrows):
                print
                row = cur.fetchone()

                #------step 4:Index your field------
                # NOTE(review): summary is not used in the visible part of this method.
                summary = row[SUMMARY]
                subject_id = row[SUBJECT_ID]


                print 'id'+subject_id
                year = utils.formatYear(row[YEAR])
                # NOTE(review): wtfFile is only opened after stringToDate()
                # succeeded, so the first time stringToDate() raises, the
                # except branch hits an unbound wtfFile (NameError).  On any
                # failure dateStr is also left unset or stale from the previous
                # row, and wtfFile is never closed in the visible code.  The
                # bare except hides the actual error.
                try:
                    date = DateTools.stringToDate(year.replace('-',' '))
                    wtfFile = open('wtf.txt','a')
                    dateStr  = DateTools.dateToString(date,DateTools.Resolution.DAY)
                except:
                    wtfFile.write(year+'\n')



                doc = Document()

                # boost: linear interpolation between base and upper by boostProb
                boostProb = utils.calcBoostProb(row,maxDict,dateStr)
                boost = base + boostProb*(upper-base)

                doc.add(FloatField("boost",boost,Field.Store.YES))
                doc.add(StringField("year",dateStr,Field.Store.YES))
                print 'dateStr:'+dateStr
                #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.)

                # Missing counts default to 0.
                do_count = row[DO_COUNT] if row[DO_COUNT] != None else 0
                # NOTE(review): this tests row[WISH_COUNT] but reads
                # row[COLLECT_COUNT]; looks like a copy-paste slip for
                # row[WISH_COUNT] -- confirm.
                wish_count = row[COLLECT_COUNT] if row[WISH_COUNT] != None else 0

                #fields which should not be analyzed
                doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES))
                doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES))
                doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES))
                #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost))
                doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES))
                doc.add(IntField("do_count", int(do_count), Field.Store.YES))
                doc.add(IntField("wish_count", int(wish_count), Field.Store.YES))
                doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES))
                doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES))
                doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES))
                doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES))

                #fields which should be analyzed with WhitespaceAnalyzer
                #attention!!! dont use a long sentence like :
#.........這裏部分代碼省略.........
開發者ID:PhoenixZhao,項目名稱:MovieSearchService,代碼行數:101,代碼來源:IndexMysql.py

示例11: indexDocs

	def indexDocs(self,root,writer):
		"""Index the name and directory of every ``.c`` file below ``root``.

		Each matching file yields one document with the fields ``name`` (file
		name) and ``path`` (containing directory).  The file is still read and
		decoded as UTF-8, although the text is not indexed, so a file that
		cannot be read or decoded is reported and skipped.  Failures never
		abort the walk.
		"""
		# indexed, stored, tokenized; documents + term frequencies
		nameType = FieldType()
		nameType.setIndexed(True)
		nameType.setStored(True)
		nameType.setTokenized(True)
		nameType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

		for dirpath, dirnames, filenames in os.walk(root):
			for filename in filenames:
				if not filename.endswith('.c'):
					continue
				try:
					path = os.path.join(dirpath, filename)
					print("adding file :  %s" % path)
					# 'with' closes the handle even when decoding fails.
					with open(path) as source:
						unicode(source.read(), 'utf-8')
					doc = Document()
					doc.add(Field("name", filename, nameType))
					doc.add(Field("path", dirpath, nameType))
					writer.addDocument(doc)
				except Exception as e:
					print("failed in indexDocs: %s" % e)
開發者ID:zz-mars,項目名稱:simple-search,代碼行數:36,代碼來源:indexer.py

示例12: indexDocs

    def indexDocs(self, root, writer):
        """Index every ``.html`` file below ``root`` into ``writer``.

        Each matching file yields one document with the fields ``name`` (file
        name), ``path`` (containing directory) and, when the file is not
        empty, ``contents`` (file text decoded as ISO-8859-1).  A failure on
        one file is reported and does not abort the walk.
        """
        # name/path: indexed as a single term and stored
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # contents: tokenized with positions, not stored
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for dirpath, dirnames, filenames in os.walk(root):
            for filename in filenames:
                if not filename.endswith('.html'):
                    continue
                print("adding %s" % filename)
                try:
                    path = os.path.join(dirpath, filename)
                    # 'with' closes the handle even when decoding fails.
                    with open(path) as source:
                        contents = unicode(source.read(), 'iso-8859-1')
                    doc = Document()
                    doc.add(Field("name", filename, t1))
                    doc.add(Field("path", dirpath, t1))
                    if contents:
                        doc.add(Field("contents", contents, t2))
                    else:
                        print("warning: no content in %s" % filename)
                    writer.addDocument(doc)
                except Exception as e:
                    print("Failed in indexDocs: %s" % e)
開發者ID:devs4v,項目名稱:devs4v-information-retrieval15,代碼行數:34,代碼來源:IndexFiles.py

示例13: LuceneDocumentField

class LuceneDocumentField(object):
    """Holds the FieldType presets and maps internal FIELDTYPE_ values to them."""

    def __init__(self):
        """Create one FieldType preset per supported field flavour.

        Every preset is indexed and stored; the table below lists, per
        attribute, whether it is tokenized, whether term vectors are kept and
        whether term-vector positions are kept.
        """
        # (attribute, tokenized, term vectors, term-vector positions)
        presets = (
            ("field_id", False, False, False),
            ("field_id_tv", False, True, False),
            ("field_text", True, False, False),
            ("field_text_tv", True, True, False),
            ("field_text_tvp", True, True, True),
        )
        for attribute, tokenized, vectors, positions in presets:
            field_type = FieldType()
            setattr(self, attribute, field_type)
            field_type.setIndexed(True)
            field_type.setStored(True)
            field_type.setTokenized(tokenized)
            if vectors:
                field_type.setStoreTermVectors(True)
            if positions:
                field_type.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Return the FieldType preset matching the given Lucene.FIELDTYPE_ value.

        Raises Exception("Unknown field type") when no constant matches.
        """
        # (Lucene constant name, attribute on self), checked in this order;
        # both are resolved lazily, only when the entry is reached.
        lookup = (
            ("FIELDTYPE_ID", "field_id"),
            ("FIELDTYPE_ID_TV", "field_id_tv"),
            ("FIELDTYPE_TEXT", "field_text"),
            ("FIELDTYPE_TEXT_TV", "field_text_tv"),
            ("FIELDTYPE_TEXT_TVP", "field_text_tvp"),
        )
        for constant, attribute in lookup:
            if type == getattr(Lucene, constant):
                return getattr(self, attribute)
        raise Exception("Unknown field type")
開發者ID:renespeck,項目名稱:TAGME_Reproducibility,代碼行數:56,代碼來源:lucene_tools.py

示例14: indexDocs

def indexDocs(root, writer):
        """
        indexed: name title content
        stored: date name tilte sumary
        :param root:
        :param writer:
        :return:
        """
        #index and store
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        #only index, but not store
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(False)
        t2.setTokenized(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        #only store
        t3 = FieldType()
        t3.setIndexed(False)
        t3.setStored(True)
        t3.setTokenized(False)
        t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        for root, dirnames, filenames in os.walk(root):
            print filenames
            for filename in filenames:
                if not filename.endswith('.md'):
                    continue
                print "adding", filename
                try:
                    path = os.path.join(root, filename)
                    file = open(path)
                    contents = unicode(file.read(), 'utf-8')
                    file.close()

                    date, name = get_date_name(filename)
                    title, content = get_post_title_content(contents)
                    summary = content[:200] if content else ''

                    print date, name, title

                    doc = Document()
                    doc.add(Field('date', date, t3))
                    doc.add(Field('name', name, t1))
                    doc.add(Field('title', title, t1))
                    doc.add(Field('content', content, t2))
                    doc.add(Field('summary', summary, t3))


                    # doc.add(Field("name", filename, t1))
                    # doc.add(Field("path", root, t1))
                    # if len(contents) > 0:
                    #     doc.add(Field("contents", contents, t2))
                    # else:
                    #     print "warning: no content in %s" % filename
                    writer.addDocument(doc)
                except Exception, e:
                    print "Failed in indexDocs:", e
開發者ID:wasw100,項目名稱:jekyll-search,代碼行數:64,代碼來源:index.py

示例15: indexDocs

    def indexDocs(self, url, writer):
        """Parse the feed at ``url`` and add one document per entry to ``writer``.

        Each entry yields the fields ``url`` (entry link), ``title`` and,
        when the description is not empty after tag stripping, ``contents``.
        A failure while building or adding one entry is reported and does not
        stop the remaining entries.
        """
        # url/title: indexed as a single term and stored
        type1 = FieldType()
        type1.setIndexed(True)
        type1.setStored(True)
        type1.setTokenized(False)
        type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # contents: tokenized and stored, with positions and offsets
        type2 = FieldType()
        type2.setIndexed(True)
        type2.setStored(True)
        type2.setTokenized(True)
        type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

        # Read Feeds
        feeds = feedparser.parse(url)

        for item in feeds["entries"]:
            print("adding %s" % item["title"])
            try:
                link = item["link"]
                contents = item["description"].encode("utf-8")
                # Strip markup tags from the description.
                contents = re.sub('<[^<]+?>', '', contents)
                title = item["title"]
                doc = Document()
                doc.add(Field("url", link, type1))
                doc.add(Field("title", title, type1))
                if contents:
                    doc.add(Field("contents", contents, type2))
                else:
                    print("warning: no content in %s" % item["title"])
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs: %s" % e)
開發者ID:yelinkyaw,項目名稱:FeedsIndexer,代碼行數:33,代碼來源:IndexFeeds.py


注:本文中的org.apache.lucene.document.FieldType類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。