This article collects typical usage examples of the Scheduler.get_workset_status method from the Python module scheduler. If you are wondering how exactly to use Scheduler.get_workset_status, the curated code examples below may help. You can also explore further usage of its containing class, scheduler.Scheduler.
Two code examples of Scheduler.get_workset_status are shown below, sorted by popularity by default.
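Before the full examples, here is a minimal sketch of the call pattern both examples share. It assumes a CrawlJob instance constructed as in Example 1 below; the hq object and the job name 'wide' are illustrative assumptions, and the commented result shape is inferred from the get_workset_status bodies shown in the examples.

job = CrawlJob(hq, 'wide')         # hq and 'wide' are hypothetical
status = job.get_workset_status()
# status is a dict of the form:
# {'job': 'wide', 'crawljob': <id of job>, 'sch': <id of job.scheduler>,
#  'worksets': <whatever Scheduler.get_workset_status() returns>}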
Example 1: CrawlJob
# Required import: from scheduler import Scheduler [as alias]
# Or alternatively: from scheduler.Scheduler import get_workset_status [as alias]
class CrawlJob(object):
def __init__(self, hq, jobname):
self.hq = hq
self.jobconfigs = self.hq.jobconfigs
self.jobname = jobname
self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
self.mapper)
self.inq = PooledIncomingQueue(
qdir=hqconfig.inqdir(self.jobname),
buffsize=1000)
self._dispatcher_mode = hqconfig.get(
('jobs', self.jobname, 'dispatcher'), 'internal')
self.dispatcher = None
#self.init_dispatcher()
# currently disabled by default - too slow
self.use_crawlinfo = False
self.save_crawlinfo = False
self.last_inq_count = 0
PARAMS = [('use_crawlinfo', bool),
('save_crawlinfo', bool),
('dispatcher_mode', str)]
@property
def dispatcher_mode(self):
return self._dispatcher_mode
    @dispatcher_mode.setter
    def dispatcher_mode(self, value):
        # when dispatch is handled externally, the in-process Dispatcher
        # is no longer needed, so shut it down
        self._dispatcher_mode = value
        if value == 'external':
            self.shutdown_dispatcher()
    def init_dispatcher(self):
        if self.dispatcher: return
        if self.dispatcher_mode == 'external':
            # an external process owns dispatching; refuse to start one here
            raise RuntimeError('dispatcher mode is %s' % self.dispatcher_mode)
self.dispatcher = Dispatcher(self.hq.get_domaininfo(),
self.jobname,
mapper=self.mapper,
scheduler=self.scheduler,
inq=self.inq.rqfile)
def shutdown_dispatcher(self):
if not self.dispatcher: return
logging.info("shutting down dispatcher")
self.dispatcher.shutdown()
self.dispatcher = None
def shutdown(self):
logging.info("shutting down scheduler")
self.scheduler.shutdown()
logging.info("closing incoming queues")
self.inq.flush()
self.inq.close()
self.shutdown_dispatcher()
logging.info("done.")
def get_status(self):
r = dict(job=self.jobname, oid=id(self))
r['sch'] = self.scheduler and self.scheduler.get_status()
r['inq'] = self.inq and self.inq.get_status()
return r
def get_workset_status(self):
r = dict(job=self.jobname, crawljob=id(self))
if self.scheduler:
r['sch'] = id(self.scheduler)
r['worksets'] = self.scheduler.get_workset_status()
return r
def workset_activating(self, *args):
self.init_dispatcher()
self.dispatcher.workset_activating(*args)
def schedule(self, curis):
'''schedule curis bypassing seen-check. typically used for starting
new crawl cycle.'''
scheduled = 0
for curi in curis:
self.scheduler.schedule(curi)
scheduled += 1
return dict(processed=scheduled, scheduled=scheduled)
def discovered(self, curis):
return self.inq.add(curis)
def processinq(self, maxn):
self.init_dispatcher()
return self.dispatcher.processinq(maxn)
def makecuri(self, o):
        # temporary rescue measure; delete once everything has been fixed.
a = o.get('a')
#......... part of the code omitted here .........
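In this example, get_workset_status wraps Scheduler.get_workset_status with identifying fields: the job name plus the ids of the CrawlJob and its Scheduler, so a status consumer can tell which objects produced the report. A hedged sketch of driving the class end to end, using only the methods shown above (the hq object and the curi dict shape are assumptions, not part of the example):

job = CrawlJob(hq, 'wide')                      # hypothetical hq and job name
job.discovered([{'u': 'http://example.com/'}])  # curi dict shape is assumed
job.processinq(500)    # lazily creates the Dispatcher, then processes up to 500
print(job.get_workset_status())
job.shutdown()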
Example 2: CrawlJob
# Required import: from scheduler import Scheduler [as alias]
# Or alternatively: from scheduler.Scheduler import get_workset_status [as alias]
class CrawlJob(object):
NWORKSETS_BITS = 8
def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
self.jobconfigs = jobconfigs
self.jobname = jobname
self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
self.workset_state = [0 for i in range(self.mapper.nworksets)]
# seen-db initialization is delayed until it's actually needed
self.seen = None
#self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
self.crawlinfodb = crawlinfo
self.domaininfo = domaininfo
self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
self.mapper)
# self.inq = HashSplitIncomingQueue(
# qdir=hqconfig.inqdir(self.jobname),
# buffsize=500)
self.inq = PooledIncomingQueue(
qdir=hqconfig.inqdir(self.jobname),
buffsize=1000)
self.diverter = Diverter(self.jobname, self.mapper)
#self.discovered_executor = ThreadPoolExecutor(poolsize=1)
# currently disabled by default - too slow
self.use_crawlinfo = False
self.save_crawlinfo = False
PARAMS = [('use_crawlinfo', bool),
('save_crawlinfo', bool)]
def shutdown(self):
logging.info("shutting down scheduler")
self.scheduler.shutdown()
logging.info("shutting down diverter")
self.diverter.shutdown()
if self.seen:
logging.info("closing seen db")
self.seen.close()
logging.info("closing incoming queues")
self.inq.flush()
self.inq.close()
logging.info("shutting down crawlinfo")
self.crawlinfodb.shutdown()
logging.info("done.")
#self.discovered_executor.shutdown()
def get_status(self):
r = dict(job=self.jobname, oid=id(self))
r['seen'] = self.seen and self.seen.get_status()
r['sch'] = self.scheduler and self.scheduler.get_status()
r['inq'] = self.inq and self.inq.get_status()
return r
def get_workset_status(self):
r = dict(job=self.jobname, crawljob=id(self))
if self.scheduler:
r['sch'] = id(self.scheduler)
r['worksets'] = self.scheduler.get_workset_status()
return r
#def discovered_async(self, curis):
# return self.inq.add(curis)
def get_domaininfo(self, url):
uc = urlsplit(url)
host = uc.netloc
p = host.find(':')
if p > 0: host = host[:p]
di = self.domaininfo.get(host)
return di
def schedule(self, curis):
'''schedule curis bypassing seen-check. typically used for starting
new crawl cycle.'''
scheduled = 0
for curi in curis:
self.scheduler.schedule(curi)
scheduled += 1
return dict(processed=scheduled, scheduled=scheduled)
def discovered(self, curis):
return self.inq.add(curis)
def is_client_active(self, clid):
"""is client clid active?"""
# TODO: update ZooKeeper when active status changes
#t = self.client_last_active.get(str(clid))
return self.scheduler.is_active(clid)
def is_workset_active(self, wsid):
"""is workset wsid assigned to any active client?"""
clid = self.mapper.worksetclient[wsid]
return self.is_client_active(clid)
def workset_activating(self, wsid):
"""activates working set wsid; start sending CURIs to Scheduler
#......... part of the code omitted here .........
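The get_domaininfo helper in Example 2 strips an explicit port from the network location before looking the host up in domaininfo. A standalone sketch of just that parsing step (the URL is illustrative):

from urllib.parse import urlsplit   # on Python 2: from urlparse import urlsplit

uc = urlsplit('http://example.com:8080/path')
host = uc.netloc                    # 'example.com:8080'
p = host.find(':')
if p > 0:
    host = host[:p]                 # 'example.com'

Note that urlsplit(...).hostname already returns the host without the port (and lowercased), which would replace the manual find/slice.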