本文整理汇总了Python中scrapy.signals.spider_idle方法的典型用法代码示例。如果您正苦于以下问题:Python signals.spider_idle方法的具体用法?Python signals.spider_idle怎么用?Python signals.spider_idle使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块scrapy.signals的用法示例。
在下文中一共展示了signals.spider_idle方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _spider_idle
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def _spider_idle(self, spider):
    """Handle an idle spider: notify listeners, then close it if allowed.

    The ``spider_idle`` signal is sent to every connected handler. When any
    handler answered with a ``DontCloseSpider`` failure the spider is left
    open and nothing else happens in this call. Otherwise the spider is
    closed with reason ``'finished'``, provided it is still idle.

    This may run several times for the same spider.
    """
    responses = self.signals.send_catch_log(
        signal=signals.spider_idle,
        spider=spider,
        dont_log=DontCloseSpider,
    )

    # A single handler vetoing the close is enough to keep the spider alive.
    for _, outcome in responses:
        if isinstance(outcome, Failure) and isinstance(outcome.value, DontCloseSpider):
            return

    if self.spider_is_idle(spider):
        self.close_spider(spider, reason='finished')
示例2: setup_kafka
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def setup_kafka(self, settings):
    """Set up the Kafka consumer and connect the idle/item signals.

    This should be called after the spider has set its crawler object,
    because the signal handlers are registered through ``self.crawler``.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    # Fall back to a per-spider default topic when none was configured.
    if not getattr(self, 'topic', None):
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    # Report the topic the consumer was actually created with; the previous
    # code logged ``self.kafka_topic``, which this method never sets.
    self.log("Reading URLs from kafka topic '%s'" % self.topic)
示例3: setup_kafka
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def setup_kafka(self, settings):
    """Create the Kafka consumer and register the spider's signal handlers.

    Call this only once the spider has its crawler object: the handlers
    are connected via ``self.crawler.signals``.

    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    # Use "<spider name>-starturls" when no topic is set on the spider.
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(
        _kafka,
        consumer_group,
        self.topic,
        auto_commit=True,
        iter_timeout=1.0,
    )

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    # Log the same attribute the consumer subscribes to (``self.topic``);
    # ``self.kafka_topic`` is not assigned anywhere in this method.
    self.log("Reading URLs from kafka topic '%s'" % self.topic)
示例4: setup_redis
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def setup_redis(self, crawler):
    """Initialise the Redis feed state and subscribe to the idle signal.

    Stores the batch size (``SPIDER_FEED_SIZE``) and a connection obtained
    from ``get_redis_conn()`` on the spider, then registers
    ``self.spider_idle`` as the handler for ``signals.spider_idle``.
    """
    self.redis_batch_size = SPIDER_FEED_SIZE
    self.redis_con = get_redis_conn()

    signal_registry = crawler.signals
    signal_registry.connect(self.spider_idle, signal=signals.spider_idle)
示例5: spider_idle
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def spider_idle(self):
    """Schedule the next requests and veto closing the spider.

    Always raises ``DontCloseSpider`` after scheduling.
    """
    self.schedule_next_requests()
    raise DontCloseSpider()
示例6: spider_idle
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def spider_idle(self):
    """Try to schedule one more request, then keep the spider open.

    Always raises ``DontCloseSpider`` once ``schedule_next_request()``
    has returned.
    """
    self.schedule_next_request()
    raise DontCloseSpider()
示例7: _set_crawler
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def _set_crawler(self, crawler):
    """Attach the crawler, then listen for the ``spider_idle`` signal.

    The parent implementation runs first; afterwards ``self.spider_idle``
    is connected through ``self.crawler.signals``.
    """
    super(RedisSpider, self)._set_crawler(crawler)

    signal_registry = self.crawler.signals
    signal_registry.connect(self.spider_idle, signal=signals.spider_idle)
示例8: spider_idle
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def spider_idle(self):
    """Idle handler that unconditionally raises ``DontCloseSpider``."""
    raise DontCloseSpider()
示例9: spider_idle
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def spider_idle(self):
    """Schedule further requests, then refuse to close the spider.

    Always raises ``DontCloseSpider`` after ``schedule_next_requests()``.
    """
    # XXX: Handle a sentinel to close the spider.
    self.schedule_next_requests()
    raise DontCloseSpider()
示例10: setup_rabbitmq
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def setup_rabbitmq(self):
    """Prepare the RabbitMQ connection and register signal handlers.

    Must be called after the spider has its crawler object, because the
    settings and the signal registry are both read from ``self.crawler``.

    :return: None
    """
    if not self.rabbitmq_key:
        # No key configured: default to "<spider name>:start_urls".
        self.rabbitmq_key = '{}:start_urls'.format(self.name)

    self.server = connection.from_settings(self.crawler.settings)

    signal_registry = self.crawler.signals
    signal_registry.connect(self.spider_idle, signal=signals.spider_idle)
    signal_registry.connect(self.item_scraped, signal=signals.item_scraped)
示例11: spider_idle
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def spider_idle(self):
    """Schedule the next request and keep the spider from closing.

    Always raises ``DontCloseSpider`` after ``schedule_next_request()``
    returns, so the method never returns normally.
    """
    self.schedule_next_request()
    raise DontCloseSpider()
示例12: __init__
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def __init__(self):
    """Create the spider and crawler, wire up signals and reset state.

    The reactor is stopped when the spider closes, and
    ``self._dont_close_me`` is connected to ``spider_idle``.
    """
    self.spider = HqSpider()

    crawler = Crawler(get_project_settings())
    self.crawler = crawler

    # Stop the reactor once the spider has been closed.
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)

    # Bookkeeping attributes, all initially unset.
    self.thread = None
    self._started = False
    self._stopped = False
示例13: start_requests
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def start_requests(self):
    """Register ``make_requests`` for ``spider_idle``; return no requests.

    :return: an empty list
    """
    signal_registry = self.crawler.signals
    signal_registry.connect(self.make_requests, signal=signals.spider_idle)
    return []
示例14: set_crawler
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def set_crawler(self, crawler):
    """Attach the crawler and subscribe to the ``spider_idle`` signal.

    Delegates to the parent ``set_crawler`` first, then connects
    ``self.spider_idle`` via ``self.crawler.signals``.
    """
    super(RedisSpider, self).set_crawler(crawler)

    signal_registry = self.crawler.signals
    signal_registry.connect(self.spider_idle, signal=signals.spider_idle)
示例15: setup_redis
# 需要导入模块: from scrapy import signals [as 别名]
# 或者: from scrapy.signals import spider_idle [as 别名]
def setup_redis(self):
    """Prepare the Redis connection and hook up the signal handlers.

    Must run after the spider has its crawler object, since the settings
    and the signal registry are both taken from ``self.crawler``.
    """
    if not self.redis_key:
        # No key configured: default to "<spider name>:start_urls".
        self.redis_key = '%s:start_urls' % self.name

    self.server = connection.from_settings(self.crawler.settings)

    # The idle signal is sent when the spider has no requests left; that is
    # when new requests get scheduled from the redis queue.
    signal_registry = self.crawler.signals
    signal_registry.connect(self.spider_idle, signal=signals.spider_idle)
    signal_registry.connect(self.item_scraped, signal=signals.item_scraped)

    self.log("Reading URLs from redis list '%s'" % self.redis_key)