当前位置: 首页>>代码示例>>Python>>正文


Python Scheduler.get_workset_status方法代码示例

本文整理汇总了Python中scheduler.Scheduler.get_workset_status方法的典型用法代码示例。如果您正苦于以下问题:Python Scheduler.get_workset_status方法的具体用法?Python Scheduler.get_workset_status怎么用?Python Scheduler.get_workset_status使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scheduler.Scheduler的用法示例。


在下文中一共展示了Scheduler.get_workset_status方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: CrawlJob

# 需要导入模块: from scheduler import Scheduler [as 别名]
# 或者: from scheduler.Scheduler import get_workset_status [as 别名]
class CrawlJob(object):

    def __init__(self, hq, jobname):
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')
                                            
        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool),
              ('dispatcher_mode', str)]

    @property
    def dispatcher_mode(self):
        return self._dispatcher_mode
    @dispatcher_mode.setter
    def dispatcher_mode(self, value):
        self._dispatcher_mode = value
        if value == 'external':
            self.shutdown_dispatcher()

    def init_dispatcher(self):
        if self.dispatcher: return
        if self.dispatcher_mode == 'external':
            raise RuntimeError, 'dispatcher mode is %s' % self.dispatcher_mode
        self.dispatcher = Dispatcher(self.hq.get_domaininfo(),
                                     self.jobname,
                                     mapper=self.mapper,
                                     scheduler=self.scheduler,
                                     inq=self.inq.rqfile)
        
    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r
        
    def workset_activating(self, *args):
        self.init_dispatcher()
        self.dispatcher.workset_activating(*args)

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        self.init_dispatcher()
        return self.dispatcher.processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's got fixed.
        a = o.get('a')
#.........这里部分代码省略.........
开发者ID:travisfw,项目名称:crawlhq,代码行数:103,代码来源:hq.py

示例2: CrawlJob

# 需要导入模块: from scheduler import Scheduler [as 别名]
# 或者: from scheduler.Scheduler import get_workset_status [as 别名]
class CrawlJob(object):
    NWORKSETS_BITS = 8

    def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
        self.workset_state = [0 for i in range(self.mapper.nworksets)]

        # seen-db initialization is delayed until it's actually needed
        self.seen = None
        #self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
        self.crawlinfodb = crawlinfo
        self.domaininfo = domaininfo
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)
        # self.inq = HashSplitIncomingQueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffsize=500)
        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self.diverter = Diverter(self.jobname, self.mapper)

        #self.discovered_executor = ThreadPoolExecutor(poolsize=1)

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

    PARAMS = [('use_crawlinfo', bool),
              ('save_crawlinfo', bool)]

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("shutting down diverter")
        self.diverter.shutdown()
        if self.seen:
            logging.info("closing seen db")
            self.seen.close()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        logging.info("shutting down crawlinfo")
        self.crawlinfodb.shutdown()
        logging.info("done.")
        #self.discovered_executor.shutdown()

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['seen'] = self.seen and self.seen.get_status()
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r
        
    #def discovered_async(self, curis):
    #    return self.inq.add(curis)

    def get_domaininfo(self, url):
        uc = urlsplit(url)
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di
        
    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
           new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)
        
    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)

    def workset_activating(self, wsid):
        """activates working set wsid; start sending CURIs to Scheduler
#.........这里部分代码省略.........
开发者ID:travisfw,项目名称:hq,代码行数:103,代码来源:hq.py


注:本文中的scheduler.Scheduler.get_workset_status方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。