

Python Scheduler.feed Method Code Examples

This article collects typical usage examples of the scheduler.Scheduler.feed method in Python. If you have been wondering how Scheduler.feed is used in practice, or what real calls to it look like, the hand-picked code examples below may help. You can also explore further usage examples of its containing class, scheduler.Scheduler.


Two code examples of the Scheduler.feed method are shown below, sorted by popularity by default.

Example 1: CrawlJob

# Required import: from scheduler import Scheduler [as alias]
# Alternatively: from scheduler.Scheduler import feed [as alias]

#......... part of the code omitted here .........
        
    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r
        
    def workset_activating(self, *args):
        self.init_dispatcher()
        self.dispatcher.workset_activating(*args)

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        self.init_dispatcher()
        return self.dispatcher.processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's got fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']:
                del o['a']
        return o

    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush
        if not r:
            # but do not flush too frequently.
            if self.inq.addedcount > self.last_inq_count + 1000:
                self.inq.flush()
                self.last_inq_count = self.inq.addedcount
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
Author ID: travisfw, Project: crawlhq, Lines of code: 104, Source: hq.py
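For context, here is a minimal, self-contained sketch of the feed/finished loop a crawler client would drive against this API. StubScheduler is a hypothetical toy stand-in for scheduler.Scheduler (whose implementation is not shown in the excerpt), and the CURI shape as a plain dict with a 'u' key is an assumption based on makecuri() above.

# Hypothetical sketch: StubScheduler stands in for scheduler.Scheduler,
# which is not shown in this excerpt.
class StubScheduler:
    def __init__(self):
        # CURIs are modeled as plain dicts with a 'u' (URL) key.
        self.queue = [{'u': 'http://example.com/page%d' % i} for i in range(5)]
        self.done = []

    def feed(self, client, n):
        # hand out up to n pending CURIs for this client
        batch, self.queue = self.queue[:n], self.queue[n:]
        return batch

    def finished(self, curi):
        self.done.append(curi)

sch = StubScheduler()
client = 'worker-1'              # hypothetical client identifier
while True:
    curis = sch.feed(client, 2)  # request up to 2 URIs per round
    if not curis:
        break                    # an empty feed means nothing is pending
    for curi in curis:
        # ... fetch curi['u'] here ...
        sch.finished(curi)       # report completion back to the scheduler
print('crawled %d URIs' % len(sch.done))

Note how an empty feed() result is the client's signal to back off; that is exactly the branch in which CrawlJob.feed above asks the incoming queue to flush.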

Example 2: CrawlJob

# Required import: from scheduler import Scheduler [as alias]
# Alternatively: from scheduler.Scheduler import feed [as alias]

#......... part of the code omitted here .........
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di
        
    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)
        
    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; start sending CURIs to Scheduler
        and enqueue diverted CURIs back into incoming queue so that
        processinq will process them (again). called by Scheduler,
        through CrawlMapper, when client starts feeding.
        note, unlike workset_deactivating, this method shall not be
        called from inside processinq method below, because processinq
        executes it only when at least one CURI is available for processing.
        if inq is empty, CURIs in divert queues would never be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move files back into inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.rqfile.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates working set wsid; start sending CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush Workset queues. we don't move qfiles to diverter yet.
        # it will be done when other HQ server becomes active on the
        # workset, and this HQ server starts forwarding CURIs.
        self.scheduler.flush_workset(wsid)

    def processinq(self, maxn):
        '''process incoming queue. maxn parameter advises an
        upper limit on the number of URIs processed in this single call.
        the actual number of URIs processed may exceed it if the incoming
        queue stores URIs in chunks.'''

        # lazy initialization of seen db
        if not self.seen:
            try:
#......... remaining code omitted in the original excerpt .........
Author ID: travisfw, Project: hq, Lines of code: 70, Source: hq.py
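As a side note, the get_domaininfo fragment at the top of this example strips an optional :port from the URL host before the domain-info lookup. Here is a standalone sketch of that host extraction; the uc variable in the excerpt is presumably a urlparse result, which the omitted lines would show.

from urllib.parse import urlparse  # urlparse.urlparse in the Python 2 original

def host_of(url):
    # return the hostname portion of a URL with any :port suffix stripped
    host = urlparse(url).netloc
    p = host.find(':')
    if p > 0:
        host = host[:p]
    return host

print(host_of('http://example.com:8080/crawl'))  # example.com
print(host_of('http://example.com/crawl'))       # example.com

In modern Python, urlparse(url).hostname already returns the port-less host, so the manual find(':') is only needed when working from netloc.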

