This page collects typical usage examples of the Python method scheduler.Scheduler.finished. If you are wondering what Scheduler.finished does, how to call it, or where it fits, the hand-picked code examples below should help; you can also explore the containing class, scheduler.Scheduler, for more context.
Two code examples of Scheduler.finished are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
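Before the full examples, here is a minimal sketch of the call pattern both of them share. It assumes you already have a Scheduler instance and a list of finished CURIs (crawl-URI dicts with at least a 'u' key, as in the examples below); the function and variable names are illustrative only:

    from scheduler import Scheduler

    def report_finished(scheduler, curis):
        # Mark each crawled URI as finished, one at a time,
        # mirroring the loop in both CrawlJob.finished() examples below.
        processed = 0
        for curi in curis:
            scheduler.finished(curi)
            processed += 1
        return dict(processed=processed)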
Example 1: CrawlJob
# required import: from scheduler import Scheduler [as alias]
# or: from scheduler.Scheduler import finished [as alias]
#......... some code omitted here .........
    def shutdown_dispatcher(self):
        if not self.dispatcher: return
        logging.info("shutting down dispatcher")
        self.dispatcher.shutdown()
        self.dispatcher = None

    def shutdown(self):
        logging.info("shutting down scheduler")
        self.scheduler.shutdown()
        logging.info("closing incoming queues")
        self.inq.flush()
        self.inq.close()
        self.shutdown_dispatcher()
        logging.info("done.")

    def get_status(self):
        r = dict(job=self.jobname, oid=id(self))
        r['sch'] = self.scheduler and self.scheduler.get_status()
        r['inq'] = self.inq and self.inq.get_status()
        return r

    def get_workset_status(self):
        r = dict(job=self.jobname, crawljob=id(self))
        if self.scheduler:
            r['sch'] = id(self.scheduler)
            r['worksets'] = self.scheduler.get_workset_status()
        return r

    def workset_activating(self, *args):
        self.init_dispatcher()
        self.dispatcher.workset_activating(*args)

    def schedule(self, curis):
        '''schedule curis bypassing seen-check. typically used for starting
        new crawl cycle.'''
        scheduled = 0
        for curi in curis:
            self.scheduler.schedule(curi)
            scheduled += 1
        return dict(processed=scheduled, scheduled=scheduled)

    def discovered(self, curis):
        return self.inq.add(curis)

    def processinq(self, maxn):
        self.init_dispatcher()
        return self.dispatcher.processinq(maxn)

    def makecuri(self, o):
        # temporary rescue measure. delete after everything's got fixed.
        a = o.get('a')
        if isinstance(a, dict):
            for k in 'pvx':
                m = a.pop(k, None)
                if m is not None: o[k] = m
            if not o['a']:
                del o['a']
        return o
    def feed(self, client, n):
        logging.debug('feed "%s" begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.hq.crawlinfo:
            t0 = time.time()
            self.hq.crawlinfo.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.hq.crawlinfo.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        # if client queue is empty, request incoming queue to flush
        if not r:
            # but do not flush too frequently.
            if self.inq.addedcount > self.last_inq_count + 1000:
                self.inq.flush()
                self.last_inq_count = self.inq.addedcount
        return r
    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.hq.crawlinfo:
            for curi in curis:
                self.hq.crawlinfo.save_result(curi)
            # XXX - until I come up with better design
            self.hq.crawlinfo.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
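To make Example 1 concrete, here is a hedged sketch of the feed/finished round trip a crawler client would perform against this CrawlJob. The job setup, client identifier, and fetch step are hypothetical placeholders, not part of the original code:

    job = ...                       # a configured CrawlJob instance (setup omitted)
    client = 'client-01'            # hypothetical client identifier

    batch = job.feed(client, 100)   # pull up to 100 CURIs; may be [] when queues are empty
    for curi in batch:
        fetch(curi['u'])            # hypothetical fetch of the URI
    result = job.finished(batch)    # report back; returns dict(processed=<len(batch)>)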
Example 2: CrawlJob
# required import: from scheduler import Scheduler [as alias]
# or: from scheduler.Scheduler import finished [as alias]
#......... some code omitted here .........
        actual number of URIs processed may exceed it if incoming queue
        stores URIs in chunks.'''
        # lazy initialization of seen db
        if not self.seen:
            try:
                cachesize = hqconfig.get('seencache')
                if cachesize: cachesize = int(cachesize)*(1024**2)
            except:
                cachesize = None
            self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                             block_cache_size=cachesize)
        result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                      td=0.0, ts=0.0)
        for count in xrange(maxn):
            t0 = time.time()
            furi = self.inq.get(0.01)
            result['td'] += (time.time() - t0)
            if furi is None: break
            result['processed'] += 1
            ws = self.mapper.workset(furi)
            if self.is_workset_active(ws):
                # no need to call self.workset_activating(). it's already
                # done by Scheduler.
                di = self.get_domaininfo(furi['u'])
                if di and di['exclude']:
                    result['excluded'] += 1
                    continue
                t0 = time.time()
                suri = self.seen.already_seen(furi)
                if suri['e'] < int(time.time()):
                    if 'w' in furi:
                        a = furi['w']
                    else:
                        a = dict()
                        for k in ('p','v','x'):
                            m = furi.get(k)
                            if m is not None:
                                a[k] = m
                    curi = dict(u=furi['u'], id=suri['_id'], a=a)
                    self.scheduler.schedule(curi, ws)
                    result['scheduled'] += 1
                result['ts'] += (time.time() - t0)
            else:
                if self.workset_state[ws]:
                    self.workset_deactivating(ws)
                # client is not active
                self.diverter.divert(str(ws), furi)
                result['saved'] += 1
        return result

    def makecuri(self, o):
        if 'a' not in o:
            if 'w' in o:
                o['a'] = o['w']
                del o['w']
            else:
                a = dict()
                for k in 'pxv':
                    if k in o:
                        a[k] = o[k]
                        del o[k]
                if a: o['a'] = a
        return o

    def feed(self, client, n):
        logging.debug('feed %s begin', client)
        curis = self.scheduler.feed(client, n)
        # add recrawl info if enabled
        if self.use_crawlinfo and len(curis) > 0 and self.crawlinfodb:
            t0 = time.time()
            self.crawlinfodb.update_crawlinfo(curis)
            t = time.time() - t0
            if t / len(curis) > 0.5:
                logging.warn("SLOW update_crawlinfo: %s %.3fs/%d",
                             client, t, len(curis))
            self.crawlinfodb.mongo.end_request()
        r = [self.makecuri(u) for u in curis]
        return r

    def finished(self, curis):
        result = dict(processed=0)
        for curi in curis:
            self.scheduler.finished(curi)
            result['processed'] += 1
        if self.save_crawlinfo and self.crawlinfodb:
            for curi in curis:
                self.crawlinfodb.save_result(curi)
            # XXX - until I come up with better design
            self.crawlinfodb.mongo.end_request()
        return result

    def reset(self, client):
        return self.scheduler.reset(client)

    def flush(self):
        self.inq.flush()
        self.inq.close()
        return self.scheduler.flush_clients()
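The key difference in Example 2 is that processinq() applies a seen-check before scheduling: a URI is scheduled only when its seen record has expired, counted as excluded when its domain is marked exclude, and diverted when its workset's client is inactive. Below is a simplified, behavior-only sketch of the expiry gate, assuming a Seen-style store whose records carry an '_id' and an expiry timestamp 'e' as in the example; the helper name is hypothetical:

    import time

    def due_for_recrawl(seen, furi):
        # Distilled from processinq() above: look up the URI's seen
        # record and build a curi for scheduling only if the expiry
        # time has passed.
        suri = seen.already_seen(furi)
        if suri['e'] < int(time.time()):
            return dict(u=furi['u'], id=suri['_id'])  # curi to schedule
        return None  # still fresh; skip scheduling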