This article collects typical usage examples of the Python method scrapy.core.scraper.Scraper.close_spider. If you have been wondering what Scraper.close_spider does, how to use it, or what real-world calls look like, the curated examples below should help. You can also explore further usage examples for the class the method belongs to, scrapy.core.scraper.Scraper.
The following presents 5 code examples of Scraper.close_spider, sorted by popularity by default.
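All five examples below share the same shape: the engine's close_spider builds a Twisted Deferred chain in which scraper.close_spider(spider) is one cleanup step among several. A minimal sketch of that shape, under the assumption that slot.close() and the other cleanup calls return Deferreds (the helper name close_chain is ours, not Scrapy's):

def close_chain(slot, scraper, spider, reason='cancelled'):
    # slot.close() is assumed to return a Deferred that fires once
    # in-flight requests have drained
    dfd = slot.close()
    # addBoth runs each cleanup step regardless of the previous outcome
    dfd.addBoth(lambda _: scraper.close_spider(spider))
    dfd.addBoth(lambda _: slot.scheduler.close(reason))
    return dfd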
Example 1: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Alternatively: from scrapy.core.scraper.Scraper import close_spider [as alias]
#......... (part of the code omitted here) .........
def _download(self, request, spider):
slot = self.slots[spider]
slot.add_request(request)
def _on_success(response):
assert isinstance(response, (Response, Request))
if isinstance(response, Response):
response.request = request # tie request to response received
log.msg(log.formatter.crawled(request, response, spider), \
level=log.DEBUG, spider=spider)
send_catch_log(signal=signals.response_received, \
response=response, request=request, spider=spider)
return response
def _on_error(failure):
failure.request = request
return failure
def _on_complete(_):
slot.nextcall.schedule()
return _
dwld = self.downloader.fetch(request, spider)
dwld.addCallbacks(_on_success, _on_error)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider, start_requests=None, close_if_idle=True):
assert self.has_capacity(), "No free spider slots when opening %r" % \
spider.name
log.msg("Spider opened", spider=spider)
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_settings(self.settings)
slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
self.slots[spider] = slot
yield scheduler.open(spider)
yield self.scraper.open_spider(spider)
stats.open_spider(spider)
yield send_catch_log_deferred(signals.spider_opened, spider=spider)
slot.nextcall.schedule()
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = send_catch_log(signal=signals.spider_idle, \
spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
for _, x in res):
self.slots[spider].nextcall.schedule(5)
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
slot = self.slots[spider]
if slot.closing:
return slot.closing
log.msg("Closing spider (%s)" % reason, spider=spider)
dfd = slot.close()
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: slot.scheduler.close(reason))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
spider=spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
dfd.addBoth(lambda _: self.slots.pop(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dlist = defer.DeferredList(dfds)
return dlist
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield send_catch_log_deferred(signal=signals.engine_stopped)
yield stats.engine_stopped()
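The addBoth/addErrback pairs in close_spider above are a deliberate idiom: addBoth runs the next cleanup step whether or not the previous step failed, and the paired errback logs and swallows any Failure so that one broken step cannot abort the rest of the shutdown. A standalone illustration of just that idiom (the names are ours, not Scrapy's):

from twisted.internet import defer

def demo_cleanup_chain():
    d = defer.succeed(None)
    d.addBoth(lambda _: 1 / 0)  # a cleanup step that blows up
    # the errback consumes the Failure, returning the chain to the callback path
    d.addErrback(lambda f: print("step failed:", f.value))
    d.addBoth(lambda _: print("later steps still run"))
    return d

demo_cleanup_chain()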
Example 2: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Alternatively: from scrapy.core.scraper.Scraper import close_spider [as alias]
#......... (part of the code omitted here) .........
dwld = self.downloader.fetch(request, spider)
dwld.addCallbacks(_on_success)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider, start_requests=(), close_if_idle=True):
assert self.has_capacity(), "No free spider slot when opening %r" % \
spider.name
logger.info("Spider opened", extra={'spider': spider})
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler)
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
self.slot = slot
self.spider = spider
yield scheduler.open(spider)
yield self.scraper.open_spider(spider)
self.crawler.stats.open_spider(spider)
yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
slot.nextcall.schedule()
slot.heartbeat.start(5)
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = self.signals.send_catch_log(signal=signals.spider_idle, \
spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
for _, x in res):
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
slot = self.slot
if slot.closing:
return slot.closing
logger.info("Closing spider (%(reason)s)",
{'reason': reason},
extra={'spider': spider})
dfd = slot.close()
def log_failure(msg):
def errback(failure):
logger.error(
msg,
exc_info=failure_to_exc_info(failure),
extra={'spider': spider}
)
return errback
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log_failure('Downloader close failure'))
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log_failure('Scraper close failure'))
dfd.addBoth(lambda _: slot.scheduler.close(reason))
dfd.addErrback(log_failure('Scheduler close failure'))
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(
signal=signals.spider_closed, spider=spider, reason=reason))
dfd.addErrback(log_failure('Error while sending spider_close signal'))
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
dfd.addErrback(log_failure('Stats close failure'))
dfd.addBoth(lambda _: logger.info("Spider closed (%(reason)s)",
{'reason': reason},
extra={'spider': spider}))
dfd.addBoth(lambda _: setattr(self, 'slot', None))
dfd.addErrback(log_failure('Error while unassigning slot'))
dfd.addBoth(lambda _: setattr(self, 'spider', None))
dfd.addErrback(log_failure('Error while unassigning spider'))
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dlist = defer.DeferredList(dfds)
return dlist
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
self._closewait.callback(None)
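Example 2 replaces the bare log.err errbacks of Example 1 with a small errback factory, log_failure, giving each cleanup step a descriptive message while preserving the Failure's traceback via failure_to_exc_info. A sketch of the same pattern against the stdlib logging module; the failure_to_exc_info stand-in below mimics the helper from scrapy.utils.log, and passing spider as an argument (rather than closing over it) is our adaptation:

import logging

logger = logging.getLogger(__name__)

def failure_to_exc_info(failure):
    # convert a Twisted Failure into the (type, value, traceback)
    # tuple expected by logging's exc_info argument
    return (failure.type, failure.value, failure.getTracebackObject())

def log_failure(msg, spider):
    def errback(failure):
        logger.error(msg, exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return errback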
Example 3: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Alternatively: from scrapy.core.scraper.Scraper import close_spider [as alias]
#......... (part of the code omitted here) .........
def _on_error(_failure):
"""handle an error processing a page"""
exc = _failure.value
if isinstance(exc, IgnoreRequest):
errmsg = _failure.getErrorMessage()
level = exc.level
else:
errmsg = str(_failure)
level = log.ERROR
if errmsg:
log.msg("Error downloading <%s>: %s" % (request.url, errmsg), \
level=level, spider=spider)
return Failure(IgnoreRequest(str(exc)))
def _on_complete(_):
self.next_request(spider)
return _
if spider not in self.downloader.sites:
return defer.fail(Failure(IgnoreRequest())).addBoth(_on_complete)
dwld = mustbe_deferred(self.downloader.fetch, request, spider)
dwld.addCallbacks(_on_success, _on_error)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider):
assert self.has_capacity(), "No free spider slots when opening %r" % \
spider.name
log.msg("Spider opened", spider=spider)
yield self.scheduler.open_spider(spider)
self.downloader.open_spider(spider)
yield self.scraper.open_spider(spider)
stats.open_spider(spider)
yield send_catch_log_deferred(signals.spider_opened, spider=spider)
self.next_request(spider)
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = send_catch_log(signal=signals.spider_idle, \
spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
for _, x in res):
reactor.callLater(5, self.next_request, spider)
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
if spider in self.closing:
return defer.succeed(None)
log.msg("Closing spider (%s)" % reason, spider=spider)
self.closing[spider] = reason
self.scheduler.clear_pending_requests(spider)
dfd = self.downloader.close_spider(spider)
self.closing_dfds[spider] = dfd
dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
dfd.addErrback(log.err, "Unhandled error in scheduler.close_spider()", \
spider=spider)
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log.err, "Unhandled error in scraper.close_spider()", \
spider=spider)
dfd.addBoth(lambda _: self._finish_closing_spider(spider))
return dfd
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dfds += self.closing_dfds.values()
dlist = defer.DeferredList(dfds)
return dlist
def _finish_closing_spider(self, spider):
"""This function is called after the spider has been closed"""
reason = self.closing.pop(spider, 'finished')
call = self._next_request_calls.pop(spider, None)
if call and call.active():
call.cancel()
dfd = send_catch_log_deferred(signal=signals.spider_closed, \
spider=spider, reason=reason)
dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
dfd.addErrback(log.err, "Unhandled error in stats.close_spider()",
spider=spider)
dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
dfd.addBoth(lambda _: self.closing_dfds.pop(spider).callback(spider))
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield send_catch_log_deferred(signal=signals.engine_stopped)
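Example 3 predates the Slot abstraction used by the other examples: instead of checking slot.closing, it tracks per-spider state in plain dicts (self.closing, self.closing_dfds) so a second close_spider call during shutdown is a cheap no-op. A stripped-down sketch of that guard (the class and attribute names here are illustrative):

from twisted.internet import defer

class CloseGuard:
    def __init__(self):
        self.closing = {}  # spider -> close reason

    def close_spider(self, spider, reason='cancelled'):
        if spider in self.closing:
            # already closing: succeed immediately instead of closing twice
            return defer.succeed(None)
        self.closing[spider] = reason
        return self._do_close(spider)

    def _do_close(self, spider):
        # placeholder for the real cleanup chain
        return defer.succeed(spider)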
Example 4: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Alternatively: from scrapy.core.scraper.Scraper import close_spider [as alias]
#......... (part of the code omitted here) .........
response.request = request # tie request to response received
logkws = self.logformatter.crawled(request, response, spider)
log.msg(spider=spider, **logkws)
self.signals.send_catch_log(signal=signals.response_received, \
response=response, request=request, spider=spider)
return response
def _on_complete(_):
slot.nextcall.schedule()
return _
dwld = self.downloader.fetch(request, spider)
dwld.addCallbacks(_on_success)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider, start_requests=(), close_if_idle=True):
assert self.has_capacity(), "No free spider slot when opening %r" % \
spider.name
log.msg("Spider opened", spider=spider)
nextcall = CallLaterOnce(self._next_request, spider)
scheduler = self.scheduler_cls.from_crawler(self.crawler)
start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
self.slot = slot
self.spider = spider
yield scheduler.open(spider)
yield self.scraper.open_spider(spider)
self.crawler.stats.open_spider(spider)
yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
slot.nextcall.schedule()
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = self.signals.send_catch_log(signal=signals.spider_idle, \
spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
for _, x in res):
self.slot.nextcall.schedule(5)
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
slot = self.slot
if slot.closing:
return slot.closing
log.msg(format="Closing spider (%(reason)s)", reason=reason, spider=spider)
dfd = slot.close()
dfd.addBoth(lambda _: self.downloader.close())
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: slot.scheduler.close(reason))
dfd.addErrback(log.err, spider=spider)
# XXX: spider_stats argument was added for backwards compatibility with
# stats collection refactoring added in 0.15. it should be removed in 0.17.
dfd.addBoth(lambda _: self.signals.send_catch_log_deferred(signal=signals.spider_closed, \
spider=spider, reason=reason, spider_stats=self.crawler.stats.get_stats()))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self.crawler.stats.close_spider(spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: log.msg(format="Spider closed (%(reason)s)", reason=reason, spider=spider))
dfd.addBoth(lambda _: setattr(self, 'slot', None))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: setattr(self, 'spider', None))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dlist = defer.DeferredList(dfds)
return dlist
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
self._closewait.callback(None)
Example 5: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Alternatively: from scrapy.core.scraper.Scraper import close_spider [as alias]
#......... (part of the code omitted here) .........
log.msg(log.formatter.crawled(request, response, spider), \
level=log.DEBUG, spider=spider)
send_catch_log(signal=signals.response_received, \
response=response, request=request, spider=spider)
return response
def _on_complete(_):
self.next_request(spider)
return _
dwld = mustbe_deferred(self.downloader.fetch, request, spider)
dwld.addCallback(_on_success)
dwld.addBoth(_on_complete)
return dwld
@defer.inlineCallbacks
def open_spider(self, spider):
assert self.has_capacity(), "No free spider slots when opening %r" % \
spider.name
log.msg("Spider opened", spider=spider)
self.slots[spider] = Slot()
yield self.scheduler.open_spider(spider)
self.downloader.open_spider(spider)
yield self.scraper.open_spider(spider)
stats.open_spider(spider)
yield send_catch_log_deferred(signals.spider_opened, spider=spider)
self.next_request(spider)
def _spider_idle(self, spider):
"""Called when a spider gets idle. This function is called when there
are no remaining pages to download or schedule. It can be called
multiple times. If some extension raises a DontCloseSpider exception
(in the spider_idle signal handler) the spider is not closed until the
next loop and this function is guaranteed to be called (at least) once
again for this spider.
"""
res = send_catch_log(signal=signals.spider_idle, \
spider=spider, dont_log=DontCloseSpider)
if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider) \
for _, x in res):
reactor.callLater(5, self.next_request, spider)
return
if self.spider_is_idle(spider):
self.close_spider(spider, reason='finished')
def close_spider(self, spider, reason='cancelled'):
"""Close (cancel) spider and clear all its outstanding requests"""
slot = self.slots[spider]
if slot.closing:
return slot.closing
log.msg("Closing spider (%s)" % reason, spider=spider)
self.scheduler.clear_pending_requests(spider)
dfd = slot.close()
dfd.addBoth(lambda _: self.downloader.close_spider(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self.scraper.close_spider(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self.scheduler.close_spider(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self._cancel_next_call(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: send_catch_log_deferred(signal=signals.spider_closed, \
spider=spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: stats.close_spider(spider, reason=reason))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: log.msg("Spider closed (%s)" % reason, spider=spider))
dfd.addBoth(lambda _: self.slots.pop(spider))
dfd.addErrback(log.err, spider=spider)
dfd.addBoth(lambda _: self._spider_closed_callback(spider))
return dfd
def _cancel_next_call(self, spider):
call = self._next_request_calls.pop(spider, None)
        if call and call.active():
call.cancel()
def _close_all_spiders(self):
dfds = [self.close_spider(s, reason='shutdown') for s in self.open_spiders]
dlist = defer.DeferredList(dfds)
return dlist
@defer.inlineCallbacks
def _finish_stopping_engine(self):
yield send_catch_log_deferred(signal=signals.engine_stopped)
yield stats.engine_stopped()
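Across all five variants the contract is the same: close_spider returns a Deferred that fires only after the scraper, the scheduler, and the spider_closed signal handlers have finished their cleanup. A hypothetical caller (engine and spider are assumed to already exist) should therefore wait on that Deferred rather than treat the close as synchronous:

d = engine.close_spider(spider, reason='shutdown')
d.addBoth(lambda _: print("spider fully closed"))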