

Python signals.item_scraped Method Code Examples

This article collects typical usage examples of the scrapy.signals.item_scraped method in Python. If you are wondering what signals.item_scraped does or how it is used in practice, the selected examples below should help; you can also explore further usage examples from the containing module, scrapy.signals.


The following presents 15 code examples of the signals.item_scraped method, sorted by popularity by default.
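Before the examples, here is a minimal, self-contained sketch of the common pattern (it is not taken from any of the projects cited below; the spider name, start URL, and stats key are placeholder assumptions): a handler is connected to signals.item_scraped in from_crawler, and Scrapy calls it with the item, response, and spider keyword arguments after an item has passed through the item pipelines.

import scrapy
from scrapy import signals


class ItemCountingSpider(scrapy.Spider):
    # Hypothetical names used only for illustration.
    name = 'item_counting_demo'
    start_urls = ['http://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # item_scraped fires once for every item accepted by the item pipelines.
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def item_scraped(self, item, response, spider):
        # Count scraped items in the crawler stats; 'demo/item_count' is an
        # arbitrary key chosen for this sketch.
        self.crawler.stats.inc_value('demo/item_count')

    def parse(self, response):
        yield {'url': response.url}

The 15 examples below follow this same connect-and-handle shape, differing mainly in what the handler does: counting items, exporting them, or scheduling the next request.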

Example 1: setup_kafka

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def setup_kafka(self, settings):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic) 
Author: dfdeshom, Project: scrapy-kafka, Lines: 24, Source file: spiders.py

Example 2: __init__

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def __init__(self, crawler):
        self.crawler = crawler

        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
            }

        if not any(self.close_on.values()):
            raise NotConfigured

        self.counter = defaultdict(int)

        if self.close_on.get('errorcount'):
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on.get('pagecount'):
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on.get('timeout'):
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on.get('itemcount'):
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) 
Author: wistbean, Project: learn_python3_spider, Lines: 26, Source file: closespider.py

Example 3: _itemproc_finished

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider) 
Author: wistbean, Project: learn_python3_spider, Lines: 27, Source file: scraper.py

Example 4: setup_kafka

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic) 
Author: openslack, Project: openslack-crawler, Lines: 22, Source file: spiders.py

Example 5: item_scraped

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to  idle before scheduling the next request"""
        self.schedule_next_request() 
Author: dfdeshom, Project: scrapy-kafka, Lines: 5, Source file: spiders.py

Example 6: from_crawler

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(pixivSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(cls.update_collection_set, signal=signals.item_scraped)
        return spider

    # allowed_domains = [] 
Author: vicety, Project: Pixiv-Crawler, Lines: 9, Source file: pixiv-beta.py

Example 7: from_crawler

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(o.response_received, signal=signals.response_received)
        return o 
Author: wistbean, Project: learn_python3_spider, Lines: 10, Source file: corestats.py

Example 8: item_scraped

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, item, spider):
        self.stats.inc_value('item_scraped_count', spider=spider) 
Author: wistbean, Project: learn_python3_spider, Lines: 4, Source file: corestats.py

Example 9: item_scraped

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, item, spider):
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] == self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount') 
Author: wistbean, Project: learn_python3_spider, Lines: 6, Source file: closespider.py

Example 10: from_crawler

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def from_crawler(cls, crawler):
        o = cls(crawler.settings)
        o.crawler = crawler
        crawler.signals.connect(o.open_spider, signals.spider_opened)
        crawler.signals.connect(o.close_spider, signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signals.item_scraped)
        return o 
Author: wistbean, Project: learn_python3_spider, Lines: 9, Source file: feedexport.py

Example 11: item_scraped

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, item, spider):
        slot = self.slot
        if not self._exporting:
            slot.exporter.start_exporting()
            self._exporting = True
        slot.exporter.export_item(item)
        slot.itemcount += 1
        return item 
Author: wistbean, Project: learn_python3_spider, Lines: 10, Source file: feedexport.py

Example 12: setup_rabbitmq

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def setup_rabbitmq(self):
        """ Setup RabbitMQ connection.

            Call this method after the spider has set its crawler object.
        :return: None
        """

        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 
Author: roycehaynes, Project: scrapy-rabbitmq, Lines: 15, Source file: spiders.py

Example 13: item_scraped

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def item_scraped(self, *args, **kwargs):
        """ Avoid waiting for spider.
        :param args:
        :param kwargs:
        :return: None
        """
        self.schedule_next_request() 
Author: roycehaynes, Project: scrapy-rabbitmq, Lines: 9, Source file: spiders.py

Example 14: __init__

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def __init__(self, crawler):
        self.crawler = crawler
        settings = crawler.settings
        if not settings.getbool('KAFKA_EXPORT_ENABLED', False):
            raise NotConfigured
        logger.debug('Kafka export extension is enabled')

        self.kafka_brokers = settings.getlist('KAFKA_BROKERS')
        self.kafka_topic = settings.get('KAFKA_TOPIC')
        self.batch_size = settings.getint('KAFKA_BATCH_SIZE', 100)
        ssl_module_name = settings.get('KAFKA_SSL_CONFIG_MODULE')
        if ssl_module_name:
            def _load(key):
                return resource_filename(ssl_module_name, settings.get(key))

            self.ssl_config = get_ssl_config(
                cafile=_load('KAFKA_SSL_CACERT_FILE'),
                certfile=_load('KAFKA_SSL_CLIENTCERT_FILE'),
                keyfile=_load('KAFKA_SSL_CLIENTKEY_FILE'),
            )
        else:
            self.ssl_config = {}

        self.item_writer = None
        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)
        crawler.signals.connect(self.process_item_scraped,
                                signals.item_scraped) 
Author: TeamHG-Memex, Project: scrapy-kafka-export, Lines: 30, Source file: extensions.py

Example 15: __init__

# Required import: from scrapy import signals [as alias]
# Or: from scrapy.signals import item_scraped [as alias]
def __init__(self, crawl_d, crawler):
        self.crawl_d = crawl_d
        self.crawler = crawler

        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        crawler.signals.connect(self._on_error, signals.spider_error)

        crawl_d.addCallback(self._on_finished)
        crawl_d.addErrback(self._on_error)

        self.closed = False
        self._items_available = Deferred()
        self._items = collections.deque() 
Author: TeamHG-Memex, Project: autologin, Lines: 15, Source file: scrapyutils.py


Note: The scrapy.signals.item_scraped examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are taken from open-source projects contributed by their respective authors; copyright of the source code remains with the original authors, and any redistribution or use should follow the corresponding project's license. Do not republish without permission.