

Python Scheduler.flush_clients Method Code Examples

This article collects typical usage examples of the Python scheduler.Scheduler.flush_clients method. If you are looking for how Scheduler.flush_clients is actually used in practice, the curated examples below may help. You can also browse further usage examples of the containing class, scheduler.Scheduler.


Two code examples of the Scheduler.flush_clients method are shown below, sorted by popularity by default.
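Both examples call flush_clients from a CrawlJob.flush method, right after flushing and closing the job's incoming queue. As a rough orientation, here is a minimal sketch of that pattern; it is only an illustration, since the Scheduler constructor and the incoming-queue class are not shown in the excerpts, so both objects are assumed to already exist:

    from scheduler import Scheduler

    def flush_job(scheduler, inq):
        # 'scheduler' is assumed to be a scheduler.Scheduler instance and 'inq'
        # the job's incoming queue object, both created elsewhere (see CrawlJob below)
        inq.flush()
        inq.close()
        # flush_clients() flushes the per-client feed queues; the examples below
        # simply return its result to the caller
        return scheduler.flush_clients()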

Example 1: CrawlJob

# Required import: from scheduler import Scheduler [as alias]
# Or: from scheduler.Scheduler import flush_clients [as alias]

#......... part of the code omitted here .........
        
    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r
        
    def workset_activating(self, *args):
        self.init_dispatcher()
        self.dispatcher.workset_activating(*args)

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        self.init_dispatcher()
        return self.dispatcher.processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's got fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']:
                del o['a']
        return o

    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush
        if not r:
            # but do not flush too frequently.
            if self.inq.addedcount > self.last_inq_count + 1000:
                self.inq.flush()
                self.last_inq_count = self.inq.addedcount
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
Developer: travisfw, Project: crawlhq, Lines of code: 104, Source file: hq.py
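The makecuri helper in Example 1 is easy to misread: it hoists the 'p', 'v', and 'x' entries out of the nested 'a' dictionary onto the top-level CURI and then drops 'a' once it is empty. A small hedged illustration (the input values are invented for this example):

    curi = {'u': 'http://example.com/', 'a': {'p': 'L', 'v': 'http://example.com/seed', 'x': 1}}
    # after makecuri(curi), the entries are hoisted out of 'a' and the now-empty 'a' is removed:
    # {'u': 'http://example.com/', 'p': 'L', 'v': 'http://example.com/seed', 'x': 1}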

Example 2: CrawlJob

# Required import: from scheduler import Scheduler [as alias]
# Or: from scheduler.Scheduler import flush_clients [as alias]

#......... part of the code omitted here .........
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.'''

        # lazy initialization of seen db
        if not self.seen:
            try:
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except:
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)

        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        a = dict()
                        for k in ('p','v','x'):
                            m = furi.get(k)
                            if m is not None:
                                a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def makecuri(self, o):
        if 'a' not in o:
            if 'w' in o:
                o['a'] = o['w']
                del o['w']
            else:
                a = dict()
                for k in 'pxv':
                    if k in o:
                        a[k] = o[k]
                        del o[k]
                if a: o['a'] = a
        return o

    def feed(self, client, n):
        logging.debug('feed %s begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.crawlinfodb:
            t0 = time.time()
            self.crawlinfodb.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.crawlinfodb.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.crawlinfodb:
            for curi in curis:
                self.crawlinfodb.save_result(curi)
            # XXX - until I come up with better design
            self.crawlinfodb.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
Developer: travisfw, Project: hq, Lines of code: 104, Source file: hq.py
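Taken together, the two examples suggest the rough order in which a front end would drive a CrawlJob; the following is a speculative sketch only, since the HTTP layer that actually calls these methods is not part of the excerpts, and the batch sizes are invented:

    # 'job' is a CrawlJob instance and 'client' an opaque crawler-client id,
    # both assumed to be provided by the surrounding HQ server
    job.discovered(curis)            # enqueue newly discovered URIs
    job.processinq(500)              # seen-check and schedule up to 500 queued URIs
    batch = job.feed(client, 100)    # hand out up to 100 CURIs to a crawler client
    job.finished(batch)              # record crawl results for the handed-out CURIs
    job.flush()                      # flush the incoming queue and all client queues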

