This article collects typical usage examples of the Python method scheduler.Scheduler.feed. If you are asking how Scheduler.feed works, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also look at other usage examples from the containing class, scheduler.Scheduler.
Two code examples of the Scheduler.feed method are shown below, sorted by popularity by default.
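Before the full examples, here is a minimal sketch of the call pattern both examples build on. Only the feed(client, n) signature comes from the examples themselves; the constructor call, client id, and batch size are placeholders invented for illustration.

from scheduler import Scheduler

# 'sch' stands in for a fully configured Scheduler instance; how it is
# actually constructed is an assumption and depends on the surrounding
# HQ (headquarters) setup, which is not shown in the examples.
sch = Scheduler()

# feed(client, n) hands out up to n scheduled CURIs (crawl URI records)
# to the crawler client identified by 'client'.
curis = sch.feed('crawler-01', 50)
for curi in curis:
    print(curi)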
Example 1: CrawlJob
# Required import: from scheduler import Scheduler [as alias]
# Or: from scheduler.Scheduler import feed [as alias]
#......... part of the code omitted here .........
    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    def workset_activating(self, *args):
        self.init_dispatcher()
        self.dispatcher.workset_activating(*args)
    def schedule(self, curis):
        '''schedule curis, bypassing the seen-check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        self.init_dispatcher()
        return self.dispatcher.processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's been fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']:
                del o['a']
        return o
    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if the client queue is empty, ask the incoming queue to flush,
        # but do not flush too frequently.
        if not r:
            if self.inq.addedcount > self.last_inq_count + 1000:
                self.inq.flush()
                self.last_inq_count = self.inq.addedcount
        return r
    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
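In Example 1, feed() wraps Scheduler.feed with two extras: it enriches the returned CURIs with recrawl info when use_crawlinfo is set, and it nudges the incoming queue to flush when a client's queue runs dry. Here is a hedged sketch of how a crawler client might drive this method; the CrawlJob instance, client id, batch size, and back-off interval are all placeholders, not part of the examples.

import time

def poll_job(job, client='crawler-01', batch=100):
    # 'job' stands in for a configured CrawlJob; this loop is illustrative.
    while True:
        curis = job.feed(client, batch)
        if not curis:
            # feed() already asked the incoming queue to flush when it
            # returned nothing; back off before polling again.
            time.sleep(5)
            continue
        # fetch each URI here, then report the results back so the
        # scheduler can mark them finished.
        job.finished(curis)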
Example 2: CrawlJob
# Required import: from scheduler import Scheduler [as alias]
# Or: from scheduler.Scheduler import feed [as alias]
#......... part of the code omitted here .........
        host = uc.netloc
        p = host.find(':')
        if p > 0: host = host[:p]
        di = self.domaininfo.get(host)
        return di
    def schedule(self, curis):
        '''schedule curis, bypassing the seen-check. typically used for
        starting a new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def is_client_active(self, clid):
        """is client clid active?"""
        # TODO: update ZooKeeper when active status changes
        #t = self.client_last_active.get(str(clid))
        return self.scheduler.is_active(clid)

    def is_workset_active(self, wsid):
        """is workset wsid assigned to any active client?"""
        clid = self.mapper.worksetclient[wsid]
        return self.is_client_active(clid)
    def workset_activating(self, wsid):
        """activates workset wsid: starts sending its CURIs to the
        Scheduler and enqueues diverted CURIs back into the incoming
        queue so that processinq will process them (again). called by
        Scheduler, through CrawlMapper, when a client starts feeding.
        note: unlike workset_deactivating, this method must not be
        called from inside the processinq method below, because
        processinq runs only when at least one CURI is available for
        processing; if inq were empty, CURIs in the divert queues would
        never be enqueued back.
        """
        # this could be executed asynchronously
        logging.info('workset %s activated', wsid)
        self.workset_state[wsid] = 1
        # is it better to move the files back into the inq directory?
        qfiles = self.diverter.listqfiles(wsid)
        logging.info('re-scheduling %s to inq', str(qfiles))
        self.inq.rqfile.qfiles_available(qfiles)

    def workset_deactivating(self, wsid):
        """deactivates workset wsid: starts sending its CURIs into
        divert queues."""
        logging.info('workset %s deactivated', wsid)
        self.workset_state[wsid] = 0
        # flush the Workset queues. we don't move qfiles to the diverter
        # yet; that happens when another HQ server becomes active on the
        # workset and this HQ server starts forwarding CURIs to it.
        self.scheduler.flush_workset(wsid)
    def processinq(self, maxn):
        '''process the incoming queue. the maxn parameter advises an
        upper limit on the number of URIs processed in this single
        call. the actual number of URIs processed may exceed it if the
        incoming queue stores URIs in chunks.'''
        # lazy initialization of the seen db
        if not self.seen:
            try:
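Example 2 is cut off inside processinq, right where the seen database is lazily created on first use. As a generic, hedged illustration of that lazy-initialization pattern (the SeenDB class, its constructor argument, and the logging text are assumptions, not this project's actual code):

import logging

def processinq(self, maxn):
    # lazy initialization: create the expensive seen db on first use
    # rather than at startup.
    if not self.seen:
        try:
            self.seen = SeenDB(self.jobname)  # hypothetical constructor
        except Exception:
            logging.critical('seen db initialization failed', exc_info=1)
            raise
    # ... then dequeue and seen-check up to roughly maxn URIs ...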