This article collects typical usage examples of the lucene.Document class in Python (PyLucene). If you are wondering what the Document class is for or how to use it, the curated examples below may help.
Fifteen code examples of the Document class are shown below, ordered by popularity by default.
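All fifteen examples share one core pattern: create a Document, attach one or more Field objects, and hand the document to an IndexWriter. Before the examples, here is a minimal sketch of that pattern in isolation; the index path and field name are illustrative, and the constructor style is the pre-4.0 PyLucene API used throughout the examples below.

import lucene
from lucene import (SimpleFSDirectory, File, StandardAnalyzer, Version,
                    IndexWriter, Document, Field)

lucene.initVM()
# hypothetical index location; any writable path works
store = SimpleFSDirectory(File("/tmp/example.index"))
analyzer = StandardAnalyzer(Version.LUCENE_30)
writer = IndexWriter(store, analyzer, True, IndexWriter.MaxFieldLength(512))

doc = Document()
# stored and analyzed: the field is both searchable and retrievable from hits
doc.add(Field("text", "hello lucene", Field.Store.YES, Field.Index.ANALYZED))
writer.addDocument(doc)
writer.close()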
Example 1: index
def index(string):
    lucene.initVM()
    indexDir = "REMOVEME.index-dir"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    try:
        # open the existing index for appending
        writer = IndexWriter(dir, analyzer, False, IndexWriter.MaxFieldLength(512))
    except lucene.JavaError:
        # no index exists yet at indexDir, so create one
        writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    doc = Document()
    doc.add(Field("text", string, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()
Example 2: index
def index(source, indexName):
    if not os.path.exists(indexName):
        os.mkdir(indexName)
    indexDir = File(indexName)
    writer = IndexWriter(SimpleFSDirectory(indexDir),
                         StandardAnalyzer(Version.LUCENE_CURRENT),
                         True, IndexWriter.MaxFieldLength.LIMITED)
    # each record is an id of the form "GH<digits>-<digits>" followed by its text
    p = re.compile("(GH\d+\-\d+)\n(.*?)\n+", re.DOTALL)
    res = p.findall(source)
    i = 0
    for pair in res:
        i += 1
        doc = Document()
        doc.add(Field("id", pair[0], Field.Store.YES, Field.Index.NO))
        for t in pair[1].split():
            # index each whitespace token verbatim, normalizing "-" to "_"
            doc.add(Field("content", t.replace("-", "_"),
                          Field.Store.NO, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print str(i) + " docs indexed"
Example 3: configure_lucene
def configure_lucene():
    f = open('clique.txt', 'r')
    lucene.initVM()
    print 'Inside Function'
    indexDir = "/tmp/luceneindex"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
    writer = IndexWriter(dir, analyzer, True, IndexWriter.MaxFieldLength(512))
    print >> sys.stderr, "Currently there are %d documents in the index..." % writer.numDocs()
    print >> sys.stderr, "Reading lines from clique.txt..."
    for line in f:
        # drop tabs, carriage returns, newlines and carets before indexing
        line = line.replace('\t', '').replace('\r', '').replace('\n', '').replace('^', '').strip()
        doc = Document()
        doc.add(Field("text", line, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print >> sys.stderr, "Indexed lines from clique.txt (%d documents in index)" % writer.numDocs()
    print >> sys.stderr, "About to optimize index of %d documents..." % writer.numDocs()
    writer.optimize()
    print >> sys.stderr, "...done optimizing index of %d documents" % writer.numDocs()
    print >> sys.stderr, "Closing index of %d documents..." % writer.numDocs()
    writer.close()
Example 4: setUp
def setUp(self):
    animals = ["aardvark", "beaver", "coati",
               "dog", "elephant", "frog", "gila monster",
               "horse", "iguana", "javelina", "kangaroo",
               "lemur", "moose", "nematode", "orca",
               "python", "quokka", "rat", "scorpion",
               "tarantula", "uromastyx", "vicuna",
               "walrus", "xiphias", "yak", "zebra"]
    analyzer = WhitespaceAnalyzer()
    aTOmDirectory = RAMDirectory()
    nTOzDirectory = RAMDirectory()
    aTOmWriter = IndexWriter(aTOmDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    nTOzWriter = IndexWriter(nTOzDirectory, analyzer, True,
                             IndexWriter.MaxFieldLength.UNLIMITED)
    # split the animals across two indexes: a-m and n-z
    for animal in animals:
        doc = Document()
        doc.add(Field("animal", animal,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        if animal[0].lower() < "n":
            aTOmWriter.addDocument(doc)
        else:
            nTOzWriter.addDocument(doc)
    aTOmWriter.close()
    nTOzWriter.close()
    self.searchers = [IndexSearcher(aTOmDirectory),
                      IndexSearcher(nTOzDirectory)]
Example 5: main
def main(cls, argv):
    if len(argv) < 5:
        print "Usage: python IndexTuningDemo.py <numDocs> <mergeFactor> <maxMergeDocs> <maxBufferedDocs>"
        return
    docsInIndex = int(argv[1])
    # create an index called 'index-dir' in a temp directory
    indexDir = os.path.join(tempfile.gettempdir(), 'index-dir')
    dir = FSDirectory.open(File(indexDir))
    analyzer = SimpleAnalyzer()
    writer = IndexWriter(dir, analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # set variables that affect the speed of indexing
    writer.setMergeFactor(int(argv[2]))
    writer.setMaxMergeDocs(int(argv[3]))
    writer.setMaxBufferedDocs(int(argv[4]))
    print "Merge factor: ", writer.getMergeFactor()
    print "Max merge docs:", writer.getMaxMergeDocs()
    print "Max buffered docs:", writer.getMaxBufferedDocs()
    start = time()
    for i in xrange(docsInIndex):
        doc = Document()
        doc.add(Field("fieldname", "Bibamus",
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.close()
    print "Time: ", timedelta(seconds=time() - start)
Example 6: run
def run(self):
    env.attachCurrentThread()
    stream = tweetstream.SampleStream("username", "password")
    for tweet in stream:
        try:
            contents = unicode(tweet['text'])
            user_name = tweet['user']['screen_name']
            doc = Document()
            doc.add(Field("user_name", user_name,
                          Field.Store.YES, Field.Index.NOT_ANALYZED))
            # tweets with empty text are indexed with the user_name field only
            if len(contents) > 0:
                doc.add(Field("contents", contents,
                              Field.Store.YES, Field.Index.ANALYZED))
            self.writer.addDocument(doc)
            # optimize for fast search and commit the changes
            self.writer.optimize()
            self.writer.commit()
        except Exception:
            # skip tweets that cannot be decoded or indexed
            pass
Example 7: indexfeeds
def indexfeeds(self, writer):
    """Index the title and summary of every entry in a list of RSS feeds."""
    feedlist = ['http://today.reuters.com/rss/topNews',
                'http://today.reuters.com/rss/domesticNews',
                'http://today.reuters.com/rss/wordNews',
                'http://rss.cnn.com/rss/edition.rss',
                'http://rss.cnn.com/rss/edition_word.rss',
                'http://rss.cnn.com/rss/edition_us.rss']
    articletitles = []
    for feed in feedlist:
        f = feedparser.parse(feed)
        for e in f.entries:
            # skip articles we have already indexed
            if e.title in articletitles:
                continue
            articletitles.append(e.title)
            contents = e.title.encode('utf8') + self.strphtml(e.description.encode('utf8'))
            try:
                doc = Document()
                doc.add(Field("name", e.title,
                              Field.Store.YES,
                              Field.Index.NOT_ANALYZED))
                if len(contents) > 0:
                    doc.add(Field("contents", contents,
                                  Field.Store.YES,
                                  Field.Index.ANALYZED,
                                  Field.TermVector.YES))
                writer.addDocument(doc)
            except Exception:
                print 'Unable to index'
def reindex_all(reader, writer, analyzer):
for i in xrange(reader.maxDoc()):
if reader.isDeleted(i): continue
doc = reader.document(i)
p = doc.get("path")
pkid = doc.get('txtorg_id')
if p is None:
# No filepath specified, just use original document
writer.updateDocument(Term("txtorg_id",pkid),doc,analyzer)
else:
# if a path field is found, try to read the file it points to and add a contents field
edited_doc = Document()
for f in doc.getFields():
edited_doc.add(Field.cast_(f))
try:
inf = open(p)
contents = unicode(inf.read(), 'UTF-8')
inf.close()
if len(contents) > 0:
edited_doc.add(Field("contents", contents,
Field.Store.NO,
Field.Index.ANALYZED,
Field.TermVector.YES))
else:
print "warning: no content in %s" % filename
except:
print "Could not read file; skipping"
writer.updateDocument(Term("txtorg_id",pkid),edited_doc,analyzer)
Example 9: indexDocs
def indexDocs(self):
    # walk the directory of user_location_map files
    for root, _, filenames in os.walk(self.directory):
        for filename in filenames:
            try:
                path = os.path.join(root, filename)
                print path
                f = open(path, 'r')
                # every line in the file is a user_location_map entry
                for line in f:
                    data = cjson.decode(line)
                    doc = Document()
                    tw_texts = []
                    num_tweets = {}
                    total_num_tweets = 0.0
                    for i in data['locations']:
                        tweets = [x['tx'] for x in i['tweets']]
                        num_tweets[i['name']] = len(i['tweets'])
                        tw_texts.extend(tweets)
                        total_num_tweets += len(tweets)
                    # tweets are indexed as part of the UserMapIndexer
                    doc.add(Field("text", " ".join(tw_texts), Field.Store.NO,
                                  Field.Index.ANALYZED))
                    doc.add(Field("num_tweets", cjson.encode(num_tweets),
                                  Field.Store.YES,
                                  Field.Index.NO))
                    doc.add(Field("user", data['user'], Field.Store.YES,
                                  Field.Index.NO))
                    # boost each document by its total tweet count
                    doc.setBoost(total_num_tweets)
                    self.writer.addDocument(doc)
                f.close()
            except Exception, e:
                print "Failed in indexLocations:", e
                print sys.exc_info()[0]
Example 10: addDocuments
def addDocuments(self, dir, isCompound):
    writer = IndexWriter(dir, SimpleAnalyzer(), True,
                         IndexWriter.MaxFieldLength.LIMITED)
    writer.setUseCompoundFile(isCompound)
    # change to adjust performance of indexing with FSDirectory
    # writer.mergeFactor = writer.mergeFactor
    # writer.maxMergeDocs = writer.maxMergeDocs
    # writer.minMergeDocs = writer.minMergeDocs
    for word in self.docs:
        doc = Document()
        # the same value indexed four ways: keyword, unindexed, unstored, text
        doc.add(Field("keyword", word,
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field("unindexed", word,
                      Field.Store.YES, Field.Index.NO))
        doc.add(Field("unstored", word,
                      Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("text", word,
                      Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    writer.optimize()
    writer.close()
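A hypothetical driver for the helper above, comparing the compound and multifile index formats in memory and on disk (the directory locations are illustrative; self.docs is the word list the method already assumes):

ramDir = RAMDirectory()
fsDir = FSDirectory.open(File(tempfile.mkdtemp()))
self.addDocuments(ramDir, False)  # multifile format in memory
self.addDocuments(fsDir, True)    # compound (.cfs) format on disk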
Example 11: do_index
def do_index():
    initVM()
    indexDir = "/home/william/woyaoo/luceneindex"
    version = Version.LUCENE_CURRENT
    standardAnalyzer = StandardAnalyzer(version)
    # chineseAnalyzer = CJKAnalyzer(version)
    engine = data.engine_from_config("indexdb.config")
    db = data.init_datafactory(engine)
    docs = dbfactory.Session().query(doc_model.Doc).filter(doc_model.Doc.dateCreated > "20121220").all()
    print len(docs)
    idxDir = SimpleFSDirectory(File(indexDir))
    perIndexCount = 5000
    writer = IndexWriter(idxDir, standardAnalyzer, True, IndexWriter.MaxFieldLength(512))
    # build one Lucene document per database row
    for doc in docs:
        lucenedoc = Document()
        descriptionValue = doc.description.strip("\r\n").encode("UTF-8")
        lucenedoc.add(Field("url", doc.url, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("intent", doc.intent, Field.Store.YES, Field.Index.NOT_ANALYZED))
        lucenedoc.add(Field("description", descriptionValue, Field.Store.YES, Field.Index.ANALYZED))
        lucenedoc.add(Field("title", doc.title, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(lucenedoc)
    writer.optimize()
    writer.close()
    print "index finished"
Example 12: index
def index(self, path_to_index, path_files):
    """Index anchor texts from a given folder."""
    # lucene.initVM()
    indexDir = path_to_index
    directory_index = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_35)
    writer = IndexWriter(directory_index, analyzer, True, IndexWriter.MaxFieldLength(512))
    listOfPathes = []
    listOfPathes.extend(glob.glob(path_files + "*.txt"))
    counter = 0
    for path_to_file in listOfPathes:
        print path_to_file
        f = open(path_to_file, "r")
        # each line holds tab-separated fields: anchor, anchor_uri, dbpedia_uri, number
        for line in f:
            entry = line.split("\t")
            counter += 1
            # optimize the index after every 500000 added documents
            if counter % 500000 == 0:
                print counter
                writer.optimize()
            doc = Document()
            doc.add(Field("anchor", entry[0], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("anchor_uri", entry[1], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("dbpedia_uri", entry[2], Field.Store.YES, Field.Index.ANALYZED))
            doc.add(Field("number", entry[3].replace("\n", ""), Field.Store.YES, Field.Index.ANALYZED))
            writer.addDocument(doc)
        writer.optimize()
        f.close()
    writer.close()
    print counter
    print "done"
Example 13: addContents
def addContents(self, contents):
    try:
        writer = IndexWriter(self.ramIndex,
                             SimpleAnalyzer(Version.LUCENE_CURRENT),
                             True, IndexWriter.MaxFieldLength.LIMITED)
        for content in contents:
            doc = Document()
            # content[1] holds the text to index; term vectors are kept for it
            doc.add(Field("contents", content[1],
                          Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.YES))
            writer.addDocument(doc)
        writer.close()
    except Exception:
        print 'Unable to add content to RAM index'
Example 14: setUp
def setUp(self):
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, self.porterAnalyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("contents",
                  "The quick brown fox jumps over the lazy dogs",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()
Example 15: indexSingleFieldDocs
def indexSingleFieldDocs(self, fields):
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    # one document per field
    for field in fields:
        doc = Document()
        doc.add(field)
        writer.addDocument(doc)
    writer.commit()
    writer.close()
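A hypothetical call site for the helper above, building one single-field document per Field object (the field names and values are illustrative):

fields = [Field("partnum", "Q36",
                Field.Store.YES, Field.Index.NOT_ANALYZED),
          Field("description", "Illidium Space Modulator",
                Field.Store.YES, Field.Index.ANALYZED)]
self.indexSingleFieldDocs(fields)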