This article collects typical usage examples of the scrapy.signals.item_scraped method in Python. If you are wondering exactly what signals.item_scraped does, how to call it, or what real-world usage looks like, the hand-picked code examples below may help. You can also explore further examples from the module it belongs to, scrapy.signals.
The 14 code examples of signals.item_scraped shown below are ordered by popularity.
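All of the examples share one basic pattern: obtain the crawler's SignalManager and register a callback for signals.item_scraped, which the engine sends every time an item passes through the item pipeline without being dropped. As a minimal, illustrative sketch (this toy extension is not taken from any of the examples below):

from scrapy import signals


class ItemCounter:
    """Toy extension that counts items as item_scraped fires."""

    def __init__(self):
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # item_scraped is sent with (item, response, spider); a receiver only
        # needs to declare the arguments it actually cares about.
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def item_scraped(self, item, spider):
        self.count += 1
        spider.logger.info("items scraped so far: %d", self.count)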
Example 1: setup_kafka
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def setup_kafka(self, settings):
"""Setup redis connection and idle signal.
This should be called after the spider has set its crawler object.
:param settings: The current Scrapy settings being used
:type settings: scrapy.settings.Settings
"""
if not hasattr(self, 'topic') or not self.topic:
self.topic = '%s-starturls' % self.name
hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
_kafka = KafkaClient(hosts)
# wait at most 1sec for more messages. Otherwise continue
self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
auto_commit=True, iter_timeout=1.0)
# idle signal is called when the spider has no requests left,
# that's when we will schedule new requests from kafka topic
self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example 2: __init__
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def __init__(self, crawler):
self.crawler = crawler
self.close_on = {
'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
}
if not any(self.close_on.values()):
raise NotConfigured
self.counter = defaultdict(int)
if self.close_on.get('errorcount'):
crawler.signals.connect(self.error_count, signal=signals.spider_error)
if self.close_on.get('pagecount'):
crawler.signals.connect(self.page_count, signal=signals.response_received)
if self.close_on.get('timeout'):
crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
if self.close_on.get('itemcount'):
crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
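The constructor above matches Scrapy's built-in CloseSpider extension: it only subscribes to the signals whose CLOSESPIDER_* threshold is configured, and raises NotConfigured otherwise. The thresholds would normally be set in the project's settings.py, for example (values purely illustrative):

# settings.py
CLOSESPIDER_ITEMCOUNT = 100   # close the spider after 100 scraped items
CLOSESPIDER_TIMEOUT = 3600    # or after one hour, whichever comes first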
Example 3: _itemproc_finished
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def _itemproc_finished(self, output, item, response, spider):
"""ItemProcessor finished for the given ``item`` and returned ``output``
"""
self.slot.itemproc_size -= 1
if isinstance(output, Failure):
ex = output.value
if isinstance(ex, DropItem):
logkws = self.logformatter.dropped(item, ex, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_dropped, item=item, response=response,
spider=spider, exception=output.value)
else:
logger.error('Error processing %(item)s', {'item': item},
exc_info=failure_to_exc_info(output),
extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_error, item=item, response=response,
spider=spider, failure=output)
else:
logkws = self.logformatter.scraped(output, response, spider)
logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
return self.signals.send_catch_log_deferred(
signal=signals.item_scraped, item=output, response=response,
spider=spider)
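This is the engine-side method that actually emits item_scraped, alongside its siblings item_dropped (when a pipeline raises DropItem) and item_error (when a pipeline raises any other exception). For comparison, a receiver for the item_dropped branch takes an extra exception argument; a minimal, illustrative handler (not part of the example):

def item_dropped(self, item, response, exception, spider):
    # `exception` is the DropItem raised by the pipeline
    spider.logger.debug("dropped item %r: %s", item, exception)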
Example 4: item_scraped
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, *args, **kwargs):
"""Avoids waiting for the spider to idle before scheduling the next request"""
self.schedule_next_request()
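schedule_next_request itself is not shown in this snippet; in the Kafka-backed spider it would presumably pop the next message from self.consumer and hand a fresh request to the engine. A speculative sketch, assuming the legacy kafka-python SimpleConsumer API and the older crawler.engine.crawl(request, spider) signature (none of it is taken from the example):

def schedule_next_request(self):
    # Speculative helper: read one URL from the Kafka consumer, if any.
    message = self.consumer.get_message(block=False)
    if message is None:
        return
    url = message.message.value.decode('utf-8')
    req = self.make_requests_from_url(url)       # deprecated, but matches old spiders
    self.crawler.engine.crawl(req, spider=self)  # older engine API took a spider argument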
Example 5: from_crawler
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(pixivSpider, cls).from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(cls.update_collection_set, signal=signals.item_scraped)
return spider
# allowed_domains = []
Example 6: from_crawler
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def from_crawler(cls, crawler):
o = cls(crawler.stats)
crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
crawler.signals.connect(o.response_received, signal=signals.response_received)
return o
Example 7: item_scraped
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, item, spider):
self.stats.inc_value('item_scraped_count', spider=spider)
Example 8: item_scraped
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, item, spider):
self.counter['itemcount'] += 1
if self.counter['itemcount'] == self.close_on['itemcount']:
self.crawler.engine.close_spider(spider, 'closespider_itemcount')
Example 9: from_crawler
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def from_crawler(cls, crawler):
o = cls(crawler.settings)
o.crawler = crawler
crawler.signals.connect(o.open_spider, signals.spider_opened)
crawler.signals.connect(o.close_spider, signals.spider_closed)
crawler.signals.connect(o.item_scraped, signals.item_scraped)
return o
Example 10: item_scraped
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, item, spider):
slot = self.slot
if not self._exporting:
slot.exporter.start_exporting()
self._exporting = True
slot.exporter.export_item(item)
slot.itemcount += 1
return item
Example 11: setup_rabbitmq
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def setup_rabbitmq(self):
""" Setup RabbitMQ connection.
Call this method after spider has set its crawler object.
:return: None
"""
if not self.rabbitmq_key:
self.rabbitmq_key = '{}:start_urls'.format(self.name)
self.server = connection.from_settings(self.crawler.settings)
self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
Example 12: item_scraped
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, *args, **kwargs):
""" Avoid waiting for spider.
:param args:
:param kwargs:
:return: None
"""
self.schedule_next_request()
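The spider_idle callback connected in Example 11 is the other half of this pattern but is not reproduced in the listing; by convention, queue-backed spiders use it to schedule more requests and raise DontCloseSpider so the crawl keeps polling the queue instead of shutting down. A sketch of that convention (not code from the example):

from scrapy.exceptions import DontCloseSpider

def spider_idle(self):
    # Keep the spider alive and try the queue again rather than closing.
    self.schedule_next_request()
    raise DontCloseSpider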
Example 13: __init__
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def __init__(self, crawler):
self.crawler = crawler
settings = crawler.settings
if not settings.getbool('KAFKA_EXPORT_ENABLED', False):
raise NotConfigured
logger.debug('Kafka export extension is enabled')
self.kafka_brokers = settings.getlist('KAFKA_BROKERS')
self.kafka_topic = settings.get('KAFKA_TOPIC')
self.batch_size = settings.getint('KAFKA_BATCH_SIZE', 100)
ssl_module_name = settings.get('KAFKA_SSL_CONFIG_MODULE')
if ssl_module_name:
def _load(key):
return resource_filename(ssl_module_name, settings.get(key))
self.ssl_config = get_ssl_config(
cafile=_load('KAFKA_SSL_CACERT_FILE'),
certfile=_load('KAFKA_SSL_CLIENTCERT_FILE'),
keyfile=_load('KAFKA_SSL_CLIENTKEY_FILE'),
)
else:
self.ssl_config = {}
self.item_writer = None
crawler.signals.connect(self.spider_opened, signals.spider_opened)
crawler.signals.connect(self.spider_closed, signals.spider_closed)
crawler.signals.connect(self.process_item_scraped,
signals.item_scraped)
Example 14: __init__
# Required import: from scrapy import signals [as alias]
# or: from scrapy.signals import item_scraped [as alias]
def __init__(self, crawl_d, crawler):
self.crawl_d = crawl_d
self.crawler = crawler
crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
crawler.signals.connect(self._on_error, signals.spider_error)
crawl_d.addCallback(self._on_finished)
crawl_d.addErrback(self._on_error)
self.closed = False
self._items_available = Deferred()
self._items = collections.deque()
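The _on_item_scraped handler registered in this constructor is not included in the snippet. Given the collections.deque and the _items_available Deferred it sets up, a plausible (purely speculative) implementation would append each item and fire the deferred so that a consumer waiting on it can resume:

def _on_item_scraped(self, item):
    # Speculative: signal receivers only get the arguments they declare,
    # so accepting just `item` is enough here.
    self._items.append(item)
    if not self._items_available.called:
        self._items_available.callback(None)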