当前位置: 首页>>代码示例>>Python>>正文


Python Scraper.enqueue_scrape方法代码示例

本文整理汇总了Python中scrapy.core.scraper.Scraper.enqueue_scrape方法的典型用法代码示例。如果您正苦于以下问题:Python Scraper.enqueue_scrape方法的具体用法?Python Scraper.enqueue_scrape怎么用?Python Scraper.enqueue_scrape使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在scrapy.core.scraper.Scraper的用法示例。


在下文中一共展示了Scraper.enqueue_scrape方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: ExecutionEngine

# 需要导入模块: from scrapy.core.scraper import Scraper [as 别名]
# 或者: from scrapy.core.scraper.Scraper import enqueue_scrape [as 别名]

#.........这里部分代码省略.........
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                            exc_info=failure_to_exc_info(f),
                                            extra={'spider': spider}))
        return d

    def spider_is_idle(self, spider):
        if not self.scraper.slot.is_idle():
            # scraper is not idle
            return False

        if self.downloader.active:
            # downloader has pending requests
            return False

        if self.slot.start_requests is not None:
            # not all start requests are handled
            return False

        if self.slot.scheduler.has_pending_requests():
            # scheduler has pending requests
            return False

        return True

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)
开发者ID:01-,项目名称:scrapy,代码行数:69,代码来源:engine.py

示例2: ExecutionEngine

# 需要导入模块: from scrapy.core.scraper import Scraper [as 别名]
# 或者: from scrapy.core.scraper.Scraper import enqueue_scrape [as 别名]
class ExecutionEngine(object):

    def __init__(self, crawler, spider_closed_callback):
        self.settings = crawler.settings
        self.slots = {}
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        self.downloader = Downloader(crawler)
        self.scraper = Scraper(crawler)
        self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS')
        self._spider_closed_callback = spider_closed_callback

    @defer.inlineCallbacks
    def start(self):
        """Start the execution engine"""
        assert not self.running, "Engine already running"
        self.start_time = time()
        yield send_catch_log_deferred(signal=signals.engine_started)
        self.running = True

    def stop(self):
        """Stop the execution engine gracefully"""
        assert self.running, "Engine not running"
        self.running = False
        dfd = self._close_all_spiders()
        return dfd.addBoth(lambda _: self._finish_stopping_engine())

    def pause(self):
        """Pause the execution engine"""
        self.paused = True

    def unpause(self):
        """Resume the execution engine"""
        self.paused = False

    def _next_request(self, spider):
        try:
            slot = self.slots[spider]
        except KeyError:
            return

        if self.paused:
            slot.nextcall.schedule(5)
            return

        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break

        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = slot.start_requests.next()
                self.crawl(request, spider)
            except StopIteration:
                slot.start_requests = None

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slots[spider]
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slots[spider].needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slots[spider]
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = spider in self.scraper.slots \
            and self.scraper.slots[spider].is_idle()
        pending = self.slots[spider].scheduler.has_pending_requests()
        downloading = bool(self.downloader.slots)
        idle = scraper_idle and not (pending or downloading)
        return idle

#.........这里部分代码省略.........
开发者ID:tml,项目名称:scrapy,代码行数:103,代码来源:engine.py

示例3: ExecutionEngine

# 需要导入模块: from scrapy.core.scraper import Scraper [as 别名]
# 或者: from scrapy.core.scraper.Scraper import enqueue_scrape [as 别名]

#.........这里部分代码省略.........
                self.crawl(request, spider)

        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        slot = self.slot
        return not self.running \
            or slot.closing \
            or self.downloader.needs_backout() \
            or self.scraper.slot.needs_backout()

    def _next_request_from_scheduler(self, spider):
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(log.msg, spider=spider)
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(log.msg, spider=spider)
        return d

    def _handle_downloader_output(self, response, request, spider):
        assert isinstance(response, (Request, Response, Failure)), response
        # downloader middleware can return requests (for example, redirects)
        if isinstance(response, Request):
            self.crawl(response, spider)
            return
        # response is a Response or Failure
        d = self.scraper.enqueue_scrape(response, request, spider)
        d.addErrback(log.err, spider=spider)
        return d

    def spider_is_idle(self, spider):
        scraper_idle = self.scraper.slot.is_idle()
        pending = self.slot.scheduler.has_pending_requests()
        downloading = bool(self.downloader.active)
        idle = scraper_idle and not (pending or downloading)
        return idle

    @property
    def open_spiders(self):
        return [self.spider] if self.spider else []

    def has_capacity(self):
        """Does the engine have capacity to handle more spiders"""
        return not bool(self.slot)

    def crawl(self, request, spider):
        assert spider in self.open_spiders, \
            "Spider %r not opened when crawling: %s" % (spider.name, request)
        self.schedule(request, spider)
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        self.signals.send_catch_log(signal=signals.request_scheduled,
                request=request, spider=spider)
        return self.slot.scheduler.enqueue_request(request)

    def download(self, request, spider):
        slot = self.slot
        slot.add_request(request)
开发者ID:382859285,项目名称:scrapy,代码行数:70,代码来源:engine.py


注:本文中的scrapy.core.scraper.Scraper.enqueue_scrape方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。