

Python scrapy.crawler Code Examples

This article collects typical usage examples of scrapy.crawler in Python. If you are wondering how scrapy.crawler is used in practice, or are looking for concrete examples of it, the hand-picked code examples below may help. You can also explore further usage examples from the scrapy package it belongs to.


The following presents 13 code examples of scrapy.crawler, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
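As a point of reference before the individual excerpts, here is a minimal, self-contained sketch of the most common entry point in scrapy.crawler, CrawlerProcess. The spider class and the quotes.toscrape.com URL are illustrative placeholders, not part of any example below.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    """Hypothetical spider used only to demonstrate CrawlerProcess."""
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}

if __name__ == "__main__":
    process = CrawlerProcess(settings={"LOG_ENABLED": False})
    process.crawl(QuotesSpider)   # schedule the spider class
    process.start()               # blocks until all crawls are finished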

Example 1: populate_vars

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def populate_vars(self, response=None, request=None, spider=None):
        import scrapy

        self.vars['scrapy'] = scrapy
        self.vars['crawler'] = self.crawler
        self.vars['item'] = self.item_class()
        self.vars['settings'] = self.crawler.settings
        self.vars['spider'] = spider
        self.vars['request'] = request
        self.vars['response'] = response
        self.vars['sel'] = _SelectorProxy(response)
        if self.inthread:
            self.vars['fetch'] = self.fetch
        self.vars['view'] = open_in_browser
        self.vars['shelp'] = self.print_help
        self.update_vars(self.vars)
        if not self.code:
            self.vars['banner'] = self.get_help() 
Author: wistbean, Project: learn_python3_spider, Lines: 20, Source: shell.py

Example 2: main

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def main():
    """Main routine for the execution of the Spider"""
    # Note: dispatcher, signals, Settings and the EuropythonSpyder class come from
    # imports/definitions elsewhere in the source file; only this function is shown here.
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED") 
Author: PacktPublishing, Project: Learning-Python-Networking-Second-Edition, Lines: 25, Source: EuropythonSpyder.py
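A follow-up note on Example 2: item_passed is an older name for what current Scrapy versions expose as the item_scraped signal, and the same hookup can be done through the crawler's own signal manager instead of pydispatch. Below is a minimal, self-contained sketch under those assumptions; EventsSpider is a hypothetical stand-in for the book's EuropythonSpyder.

import scrapy
from scrapy import signals
from scrapy.crawler import CrawlerProcess

class EventsSpider(scrapy.Spider):
    """Hypothetical stand-in for EuropythonSpyder."""
    name = "events"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

def catch_item(item, response, spider):
    print("Item extracted:", item)

process = CrawlerProcess({"LOG_ENABLED": False})
crawler = process.create_crawler(EventsSpider)   # build a Crawler for the spider class
crawler.signals.connect(catch_item, signal=signals.item_scraped)
process.crawl(crawler)                           # crawl() also accepts a Crawler instance
process.start()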

Example 3: start_requests

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def start_requests(self):
        self._finish_init()
        settings = self.crawler.settings
        self.solver = None
        try:
            import decaptcha
        except ImportError:
            self.logger.warning('Decaptcha not installed')
        else:
            from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
            if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                    settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
                self.solver = DeathbycaptchaSolver(self.crawler)
            else:
                self.logger.warning('DeathByCaptcha account not provided')
        self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
        request_kwargs = {}
        if self.using_splash:
            request_kwargs['args'] = {'full_render': True}
        yield self.request(self.start_url, **request_kwargs) 
Author: TeamHG-Memex, Project: autologin, Lines: 22, Source: spiders.py

Example 4: tearDown

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def tearDown(self):
        keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
        keys = keys + self.redis_conn.keys('test-spider:*')
        for key in keys:
            self.redis_conn.delete(key)

        # if for some reason the tests fail, we end up falling behind on
        # the consumer
        for m in self.consumer:
            pass
        self.consumer.close() 
Author: istresearch, Project: scrapy-cluster, Lines: 13, Source: online.py

Example 5: from_crawler

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def from_crawler(cls, crawler):
        instance = cls.from_settings(crawler.settings)
        instance.stats = crawler.stats
        return instance 
Author: cilame, Project: vrequest, Lines: 6, Source: py_my_scrapy_redis_server.py
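The from_crawler classmethod shown here (and again later in Example 12) is how Scrapy hands a component the running Crawler so it can read settings, stats and signals. Below is a minimal sketch of a hypothetical extension built on that pattern; the ITEMCOUNT_ENABLED setting name is made up for illustration.

from scrapy import signals
from scrapy.exceptions import NotConfigured

class ItemCountExtension:
    """Hypothetical extension: logs how many items a spider scraped."""

    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # read settings from the crawler, bail out if the extension is disabled
        if not crawler.settings.getbool("ITEMCOUNT_ENABLED", True):
            raise NotConfigured
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        return ext

    def spider_closed(self, spider):
        spider.logger.info("items scraped: %s",
                           self.stats.get_value("item_scraped_count", 0))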

Example 6: __init__

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def __init__(self, crawler):
        self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
        self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
        self._pc_mac    = crawler.settings.get('PCMAC')
        self._dump      = crawler.settings.getbool('STATS_DUMP')
        self._debug_pc  = crawler.settings.getbool('DEBUG_PC')
        self._local_max = crawler.settings.get('DEPTH_MAX_FORMAT')
        self._stats     = {}
        self.server     = connection.from_settings(crawler.settings)
        self.encoding   = self.server.connection_pool.connection_kwargs.get('encoding') 
Author: cilame, Project: vrequest, Lines: 12, Source: py_my_scrapy_redis_server.py

Example 7: schedule_next_requests

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def schedule_next_requests(self):
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)
    # The section below mainly handles start_url; it stays open permanently until the program shuts down.
    # Stock scrapy-redis used this to receive a plain start URL string, but it has been changed to receive
    # a JSON payload carrying script data. This spot has been reworked into the place where the incoming
    # parameters are initialized: the sender generates an id and passes it here for processing.
    # A simple JSON payload can carry the script's code, which makes the script easy to pass around and instantiate. 
Author: cilame, Project: vrequest, Lines: 9, Source: py_my_scrapy_redis_server.py

Example 8: __init__

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def __init__(self, crawler, update_vars=None, code=None):
        self.crawler = crawler
        self.update_vars = update_vars or (lambda x: None)
        self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
        self.spider = None
        self.inthread = not threadable.isInIOThread()
        self.code = code
        self.vars = {} 
Author: wistbean, Project: learn_python3_spider, Lines: 10, Source: shell.py

Example 9: _schedule

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def _schedule(self, request, spider):
        spider = self._open_spider(request, spider)
        d = _request_deferred(request)
        d.addCallback(lambda x: (x, spider))
        self.crawler.engine.crawl(request, spider)
        return d 
Author: wistbean, Project: learn_python3_spider, Lines: 8, Source: shell.py

Example 10: _open_spider

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def _open_spider(self, request, spider):
        if self.spider:
            return self.spider

        if spider is None:
            spider = self.crawler.spider or self.crawler._create_spider()

        self.crawler.spider = spider
        self.crawler.engine.open_spider(spider, close_if_idle=False)
        self.spider = spider
        return spider 
Author: wistbean, Project: learn_python3_spider, Lines: 13, Source: shell.py

Example 11: inspect_response

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def inspect_response(response, spider):
    """Open a shell to inspect the given response"""
    Shell(spider.crawler).start(response=response, spider=spider) 
Author: wistbean, Project: learn_python3_spider, Lines: 5, Source: shell.py
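As a usage note for Example 11: inspect_response is typically called from inside a spider callback to drop into the interactive shell at a problematic response. A minimal sketch follows; the spider class and URL are placeholders.

import scrapy
from scrapy.shell import inspect_response

class DebugSpider(scrapy.Spider):
    """Hypothetical spider showing where inspect_response is usually called."""
    name = "debug"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        if not response.css("div.quote"):
            # opens the shell with response, spider, crawler etc. pre-bound
            inspect_response(response, self)
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}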

Example 12: from_crawler

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def from_crawler(cls, crawler):
        print('from_crawler stats:', crawler.stats)
        return cls(crawler.stats) 
Author: furas, Project: python-examples, Lines: 5, Source: main.py

Example 13: setup_redis

# Required import: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def setup_redis(self, crawler=None):
        if self.server is not None:
            return
        if crawler is None:
            crawler = getattr(self, 'crawler', None)
        if crawler is None:
            raise ValueError("crawler is required")
        settings = crawler.settings
        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )
        self.redis_key = self.redis_key % {'name': self.name}
        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")
        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )
        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")
        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                         self.__dict__)
        self.server = connection.from_settings(crawler.settings)
        # In the processing that follows, tasks are no longer assigned only when the spider is idle;
        # assignment runs continuously (to support multiple tasks) so that no task is left unstarted.
        # spider_idle is therefore no longer responsible for calling schedule_next_requests;
        # it only raises the DontCloseSpider exception.
        # A separate schedule_next_requests polling loop is added to pick up tasks to start,
        # and a separate _stop_clear polling loop is added to detect tasks that should stop.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        # Pull the log format templates onto this object; later functions need them.
        self._clear_debug_pc   = crawler.settings.getbool('CLEAR_DEBUG_PC')
        self._clear_dupefilter = crawler.settings.getbool('CLEAR_DUPEFILTER')
        self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
        self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
        self._spider_id_dupk_format = crawler.settings.get('SCHEDULER_DUPEFILTER_KEY')
        # Opening a task binds two periodically executed, never-ending functions:
        # 1) check tasks that have already stopped and clean up their task space;
        # 2) pick up new start_url entries, initialize the new task script and manage its task space.
        self.limit_check = 0  # stagger the check timing of different tasks so they do not all pile into one _stop_clear iteration
        self.limit_same  = 2  # how many consecutive identical log snapshots are required
        self.interval    = 5  # how often (seconds) to run the stop-task check
        # (in theory the average time to detect a stop is roughly (limit_check+1) * (limit_same+1) * interval)
        # these can be reduced during testing to make framework issues easier to observe
        self.interval_s  = 2  # how often (seconds) to run the start-task check
        self.limit_log   = 8  # extra setting: caps how many tasks the "check stopping" log prints, in case there are hundreds of tasks
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened) 
Author: cilame, Project: vrequest, Lines: 55, Source: py_my_scrapy_redis_server.py
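For context on Example 13: in stock scrapy-redis, setup_redis points the spider at a Redis key (by default '%(name)s:start_urls') from which start URLs are popped, while the customized version above expects a JSON task payload on that key instead. Below is a minimal sketch of feeding a stock scrapy-redis spider through redis-py; the spider name "myspider" and the local Redis address are assumptions.

import redis

r = redis.StrictRedis(host="localhost", port=6379)
# default key pattern is '%(name)s:start_urls' unless REDIS_START_URLS_KEY overrides it
r.lpush("myspider:start_urls", "http://quotes.toscrape.com/")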


Note: The scrapy.crawler examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors. Please refer to the corresponding project's license for distribution and use, and do not reproduce without permission.