This article collects typical usage examples of the Python method scrapy.core.scraper.Scraper.open_spider. If you are wondering how exactly to use Scraper.open_spider, or looking for real-world examples of it, the curated code samples below may help. You can also explore further usage examples of its containing class, scrapy.core.scraper.Scraper.
Six code examples of the Scraper.open_spider method are shown below, sorted by popularity by default.
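As a quick orientation before the engine excerpts below, here is a minimal, self-contained sketch of calling the method directly. It assumes a Scrapy 1.x-era API (matching the listings that follow), where Crawler(spidercls), Scraper(crawler), open_spider(spider) and close_spider(spider) all exist and return or accept Twisted Deferreds; DemoSpider and main are hypothetical names introduced only for this sketch.

from twisted.internet import defer, reactor
from scrapy.crawler import Crawler
from scrapy.spiders import Spider
from scrapy.core.scraper import Scraper

class DemoSpider(Spider):
    name = "demo"

@defer.inlineCallbacks
def main():
    crawler = Crawler(DemoSpider)          # builds settings, signals, stats, ...
    spider = DemoSpider()
    scraper = Scraper(crawler)
    yield scraper.open_spider(spider)      # allocate the scraper slot, open item pipelines
    print("scraper slot idle:", scraper.slot.is_idle())
    yield scraper.close_spider(spider)     # drain pending items and release the slot
    reactor.stop()

reactor.callWhenRunning(main)
reactor.run()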
Example 1: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Or: from scrapy.core.scraper.Scraper import open_spider [as alias]
# ......... part of the code omitted here .........
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return self.slots.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.slots) < self._concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slots[spider].nextcall.schedule()

    def schedule(self, request, spider):
        return self.slots[spider].scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)

        def _on_success(response):
            assert isinstance(response, (Response, Request))
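The alternating addBoth/addErrback chain in _next_request_from_scheduler above is a common Twisted idiom: each addBoth step runs whether the download succeeded or failed, and the addErrback placed directly after it only traps errors raised by that step. Below is a standalone sketch of the pattern (plain Twisted, hypothetical function names, no Scrapy involved):

from twisted.internet import defer

def remove_request(result):
    # runs on success and on failure, like d.addBoth(lambda _: slot.remove_request(request))
    print("request removed from slot")
    return result            # pass the result (or Failure) through unchanged

def log_error(failure):
    # only fires if an earlier step failed, like d.addErrback(log.msg, spider=spider)
    print("error while handling downloader output:", failure.value)

d = defer.Deferred()
d.addBoth(remove_request)
d.addErrback(log_error)
d.errback(RuntimeError("download failed"))   # remove_request still runs, then log_error traps the failure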
Example 2: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Or: from scrapy.core.scraper.Scraper import open_spider [as alias]
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)
        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def close(self):
        """Close the execution engine gracefully.

        If it has already been started, stop it. In all cases, close all spiders
        and the downloader.
        """
        if self.running:
            # Will also close spiders and downloader
            return self.stop()
        elif self.open_spiders:
            # Will also close downloader
            return self._close_all_spiders()
        else:
            return defer.succeed(self.downloader.close())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        slot = self.slot
        if not slot:
            return

        if self.paused:
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
# ......... part of the code omitted here .........
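Example 2's start() is worth dwelling on: it parks an inlineCallbacks generator on a _closewait Deferred that nothing fires until the engine is stopped, which is how "the engine is running" is represented. A self-contained toy showing the same pattern (MiniEngine is a hypothetical class for illustration, not Scrapy code):

from twisted.internet import defer, reactor

class MiniEngine(object):
    def __init__(self):
        self.running = False

    @defer.inlineCallbacks
    def start(self):
        assert not self.running, "Engine already running"
        self.running = True
        self._closewait = defer.Deferred()
        yield self._closewait            # suspends here until stop() fires the Deferred
        print("engine finished")

    def stop(self):
        assert self.running, "Engine not running"
        self.running = False
        self._closewait.callback(None)   # wakes the start() generator back up

engine = MiniEngine()
engine.start()
reactor.callLater(1, engine.stop)
reactor.callLater(2, reactor.stop)
reactor.run()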
Example 3: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Or: from scrapy.core.scraper.Scraper import open_spider [as alias]
# ......... part of the code omitted here .........
    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                                    request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
        d = self._download(request, spider)
        d.addBoth(self._downloaded, slot, request, spider)
        return d

    def _downloaded(self, response, slot, request, spider):
        slot.remove_request(request)
        return self.download(response, spider) \
            if isinstance(response, Request) else response

    def _download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
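Example 3's schedule() differs from example 1 by firing the request_scheduled signal before enqueuing the request. A short sketch of how an extension can observe those notifications (MyExtension is a hypothetical name; the crawler.signals.connect API is standard Scrapy):

from scrapy import signals

class MyExtension(object):
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # fired by the engine's schedule() via signals.send_catch_log(...)
        crawler.signals.connect(ext.request_scheduled, signal=signals.request_scheduled)
        return ext

    def request_scheduled(self, request, spider):
        spider.logger.debug("scheduled %s", request.url)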
Example 4: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Or: from scrapy.core.scraper.Scraper import open_spider [as alias]
# ......... part of the code omitted here .........
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err, "Unhandled error on engine._next_request()",
                            spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        if spider in self.closing:  # ignore requests for spiders being closed
            return
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
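Examples 4 and 5 wrap engine calls in mustbe_deferred from scrapy.utils.defer. Like Twisted's defer.maybeDeferred, it invokes a function and always hands back a Deferred, converting plain return values and raised exceptions into callbacks and errbacks; results are delivered through the reactor, so one must be running. A small standalone sketch (maybe_sync is a hypothetical function):

from twisted.internet import reactor
from scrapy.utils.defer import mustbe_deferred

def maybe_sync(x):
    if x < 0:
        raise ValueError("negative input")
    return x * 2                      # a plain value, not a Deferred

d = mustbe_deferred(maybe_sync, 21)
d.addCallback(lambda v: print("result:", v))            # result: 42
d_err = mustbe_deferred(maybe_sync, -1)
d_err.addErrback(lambda f: print("trapped:", f.value))  # trapped: negative input

reactor.callLater(0.5, reactor.stop)
reactor.run()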
Example 5: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Or: from scrapy.core.scraper.Scraper import open_spider [as alias]
# ......... part of the code omitted here .........
            or self.spider_is_closed(spider) \
            or self.downloader.sites[spider].needs_backout() \
            or self.scraper.sites[spider].needs_backout()

    def _next_request(self, spider):
        # Next pending request from scheduler
        request, deferred = self.scheduler.next_request(spider)
        if request:
            dwld = mustbe_deferred(self.download, request, spider)
            dwld.chainDeferred(deferred).addBoth(lambda _: deferred)
            dwld.addErrback(log.err, "Unhandled error on engine._next_request()",
                            spider=spider)
            return dwld

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    def spider_is_open(self, spider):
        """Return True if the spider is fully opened (ie. not in closing
        stage)"""
        return spider in self.downloader.sites and spider not in self.closing

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        if spider in self.closing:  # ignore requests for spiders being closed
            return
        schd = mustbe_deferred(self.schedule, request, spider)
        # FIXME: we can't log errors because we would be preventing them from
        # propagating to the request errback. This should be fixed after the
        # next core refactoring.
        #schd.addErrback(log.err, "Error on engine.crawl()")
        schd.addBoth(self.scraper.enqueue_scrape, request, spider)
        schd.addErrback(log.err, "Unhandled error on engine.crawl()", spider=spider)
        schd.addBoth(lambda _: self.next_request(spider))

    def schedule(self, request, spider):
        if spider in self.closing:
            raise IgnoreRequest()
        self.next_request(spider)
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        def _on_success(response):
            """handle the result of a page download"""
            assert isinstance(response, (Response, Request))
            if isinstance(response, Response):
                response.request = request  # tie request to response received
Example 6: ExecutionEngine
# Required module: from scrapy.core.scraper import Scraper [as alias]
# Or: from scrapy.core.scraper.Scraper import open_spider [as alias]
# ......... part of the code omitted here .........
        d.addBoth(lambda _: self.next_request(spider))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = defer.Deferred()
        d.addBoth(self.scraper.enqueue_scrape, request, spider)
        d.addErrback(log.err, spider=spider)
        if isinstance(response, Failure):
            d.errback(response)
        else:
            d.callback(response)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.sites \
            and self.scraper.sites[spider].is_idle()
        pending = self.scheduler.spider_has_pending_requests(spider)
        downloading = spider in self.downloader.sites \
            and self.downloader.sites[spider].active
        return scraper_idle and not (pending or downloading)

    def spider_is_closed(self, spider):
        """Return True if the spider is fully closed (ie. not even in the
        closing stage)"""
        return spider not in self.downloader.sites

    @property
    def open_spiders(self):
        return self.downloader.sites.keys()

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return len(self.downloader.sites) < self.downloader.concurrent_spiders

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.next_request(spider)

    def schedule(self, request, spider):
        return self.scheduler.enqueue_request(spider, request)

    def download(self, request, spider):
        slot = self.slots[request]
        slot.add_request(request)
        if isinstance(request, Response):
            return request
        d = self._download(request, spider)
        d.addCallback(self.download, spider)
        d.addBoth(self._remove_request, slot, request)
        return d

    def _remove_request(self, _, slot, request):
        slot.remove_request(request)
        return _

    def _download(self, request, spider):
        slot = self.slots[spider]
        slot.add_request(request)
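All six listings are truncated before the point where open_spider itself appears. For orientation, here is a compressed, hypothetical fragment, loosely modelled on the single-slot engines of examples 2 and 3; it is not runnable on its own, and the _make_slot helper plus the scraper/crawler/signals attributes are assumed to exist. It only shows where Scraper.open_spider typically fits into the engine's own open_spider:

from twisted.internet import defer
from scrapy import signals

class ExecutionEngineSketch(object):
    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot: %s" % spider
        self.spider = spider
        self.slot = self._make_slot(start_requests, close_if_idle)   # hypothetical helper
        yield self.scraper.open_spider(spider)        # allocate the scraper-side slot
        self.crawler.stats.open_spider(spider)        # open the stats collector
        yield self.signals.send_catch_log_deferred(
            signal=signals.spider_opened, spider=spider)
        self.slot.nextcall.schedule()                 # kick off the first _next_request()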