This article collects typical usage examples of scrapy.crawler in Python. If you have been wondering what scrapy.crawler is for or how to use it, the curated code examples below may help. You can also explore further usage examples from the scrapy package it belongs to.
The following shows 13 code examples involving scrapy.crawler, sorted by popularity by default.
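Before the project-specific examples, here is a minimal, self-contained sketch of the most common entry point in scrapy.crawler, CrawlerProcess. The spider name, URL and CSS selectors are illustrative and not taken from any of the examples below:

from scrapy import Spider
from scrapy.crawler import CrawlerProcess

class QuotesSpider(Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]  # illustrative site

    def parse(self, response):
        # yield one item per quote text found on the page
        for text in response.css("div.quote span.text::text").getall():
            yield {"text": text}

process = CrawlerProcess(settings={"LOG_ENABLED": False})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl finishes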
Example 1: populate_vars
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()
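This method appears to be scrapy's own Shell.populate_vars (or a variant of it): the names it binds are exactly what becomes available at the interactive prompt after running `scrapy shell <url>`. A quick illustration of using them inside the shell (URL illustrative):

# scrapy shell "https://quotes.toscrape.com"
response.css("title::text").get()             # the response bound by populate_vars
settings.get("USER_AGENT")                    # the crawler's settings object
fetch("https://quotes.toscrape.com/page/2/")  # re-binds request and response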
Example 2: main
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def main():
    """Main routine for the execution of the Spider"""
    # NOTE: the source file's module-level imports (Settings, signals,
    # dispatcher, EuropythonSpyder) are not shown in this snippet.
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by calling the defined spider
    print("ENGINE STOPPED")
Author: PacktPublishing, Project: Learning-Python-Networking-Second-Edition, Lines: 25, Source file: EuropythonSpyder.py
Example 3: start_requests
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def start_requests(self):
    self._finish_init()
    settings = self.crawler.settings
    self.solver = None
    try:
        import decaptcha
    except ImportError:
        self.logger.warning('Decaptcha not installed')
    else:
        from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
        if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
            self.solver = DeathbycaptchaSolver(self.crawler)
        else:
            self.logger.warning('DeathByCaptcha account not provided')
    self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
    request_kwargs = {}
    if self.using_splash:
        request_kwargs['args'] = {'full_render': True}
    yield self.request(self.start_url, **request_kwargs)
Example 4: tearDown
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def tearDown(self):
    keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
    keys = keys + self.redis_conn.keys('test-spider:*')
    for key in keys:
        self.redis_conn.delete(key)

    # if for some reason the tests fail, we end up falling behind on
    # the consumer
    for m in self.consumer:
        pass
    self.consumer.close()
Example 5: from_crawler
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
@classmethod
def from_crawler(cls, crawler):
    instance = cls.from_settings(crawler.settings)
    instance.stats = crawler.stats
    return instance
Example 6: __init__
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def __init__(self, crawler):
    self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
    self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
    self._pc_mac = crawler.settings.get('PCMAC')
    self._dump = crawler.settings.getbool('STATS_DUMP')
    self._debug_pc = crawler.settings.getbool('DEBUG_PC')
    self._local_max = crawler.settings.get('DEPTH_MAX_FORMAT')
    self._stats = {}
    self.server = connection.from_settings(crawler.settings)
    self.encoding = self.server.connection_pool.connection_kwargs.get('encoding')
Example 7: schedule_next_requests
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def schedule_next_requests(self):
    for req in self.next_requests():
        self.crawler.engine.crawl(req, spider=self)

# The part below mainly handles start_url; this handling stays active permanently, until the program shuts down.
# The original scrapy-redis used this to receive a plain start-URL string, but it has been changed to receive a JSON payload carrying script data.
# This was reworked into the place where the incoming parameters are initialised: the sender generates an id and passes it here for processing.
# A simple JSON payload can carry the script's code, which makes it easy to hand scripts around and instantiate them (a sketch of such a JSON-based handler follows below).
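As a rough illustration of the JSON-based start data described in the comments above, a scrapy-redis spider can override make_request_from_data, which receives the raw bytes popped from the redis start-URLs key. This is only a minimal sketch; the payload fields (url, task_id) are hypothetical and not taken from this project:

import json

from scrapy import Request

def make_request_from_data(self, data):
    # Treat the redis payload as a JSON task description rather than a bare URL.
    task = json.loads(data.decode(self.redis_encoding))
    return Request(task['url'], meta={'task_id': task.get('task_id')})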
Example 8: __init__
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def __init__(self, crawler, update_vars=None, code=None):
    self.crawler = crawler
    self.update_vars = update_vars or (lambda x: None)
    self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
    self.spider = None
    self.inthread = not threadable.isInIOThread()
    self.code = code
    self.vars = {}
Example 9: _schedule
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def _schedule(self, request, spider):
    spider = self._open_spider(request, spider)
    d = _request_deferred(request)
    d.addCallback(lambda x: (x, spider))
    self.crawler.engine.crawl(request, spider)
    return d
Example 10: _open_spider
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def _open_spider(self, request, spider):
    if self.spider:
        return self.spider

    if spider is None:
        spider = self.crawler.spider or self.crawler._create_spider()

    self.crawler.spider = spider
    self.crawler.engine.open_spider(spider, close_if_idle=False)
    self.spider = spider
    return spider
Example 11: inspect_response
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def inspect_response(response, spider):
    """Open a shell to inspect the given response"""
    Shell(spider.crawler).start(response=response, spider=spider)
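For context, the usual call site of inspect_response is inside a spider callback, where it pauses the crawl and opens a shell around the current response. The condition below is purely illustrative:

from scrapy.shell import inspect_response

def parse(self, response):
    # drop into an interactive shell when the page looks unexpected
    if not response.css("div.product"):  # illustrative check
        inspect_response(response, self)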
Example 12: from_crawler
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
@classmethod
def from_crawler(cls, crawler):
    print('from_crawler stats:', crawler.stats)
    return cls(crawler.stats)
Example 13: setup_redis
# Required module: import scrapy [as alias]
# Or: from scrapy import crawler [as alias]
def setup_redis(self, crawler=None):
    if self.server is not None:
        return

    if crawler is None:
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                     self.__dict__)

    self.server = connection.from_settings(crawler.settings)

    # In the reworked flow, tasks are no longer scheduled only when the spider goes idle;
    # scheduling runs continuously (to support multiple tasks), so no task is left waiting to start.
    # spider_idle is therefore no longer responsible for calling schedule_next_requests:
    # it only raises the DontCloseSpider exception (a sketch of this pattern follows after this example).
    # A separate schedule_next_requests polling task is started to pick up and launch new tasks,
    # and a separate _stop_clear polling task is started to detect tasks that should be stopped.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    # Pull the log-key templates onto this object; later functions need them.
    self._clear_debug_pc = crawler.settings.getbool('CLEAR_DEBUG_PC')
    self._clear_dupefilter = crawler.settings.getbool('CLEAR_DUPEFILTER')
    self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
    self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
    self._spider_id_dupk_format = crawler.settings.get('SCHEDULER_DUPEFILTER_KEY')

    # The settings below bind this spider to two periodic, never-ending functions:
    # 1. checking for tasks that have already stopped and cleaning up their space;
    # 2. picking up new start_url payloads, initialising new task scripts and managing their space.
    self.limit_check = 0  # staggers the check timing of different tasks so they do not all pile into one _stop_clear iteration
    self.limit_same = 2   # number of consecutive identical log snapshots required
    self.interval = 5     # how often (seconds) the stop-detection check runs
    # (in theory the average time to detect a stopped task is roughly (limit_check+1) * (limit_same+1) * interval);
    # these can be lowered during testing to make framework issues easier to observe
    self.interval_s = 2   # how often (seconds) the start-detection check runs
    self.limit_log = 8    # extra setting: caps how many tasks the "check stoping" log prints, so hundreds of tasks are not all listed every time
    crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
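The comments above describe spider_idle as only keeping the spider alive, while two separate polling tasks do the actual scheduling and cleanup. The project's own schedule_next_requests and _stop_clear implementations are not shown here; the following is a minimal sketch of that pattern, assuming Twisted LoopingCalls started from the spider_opened handler that setup_redis connects:

from scrapy.exceptions import DontCloseSpider
from twisted.internet import task

def spider_opened(self):
    # Hypothetical sketch: two LoopingCalls replace the idle-driven scheduling.
    # interval_s / interval are the attributes assigned in setup_redis above.
    self._start_loop = task.LoopingCall(self.schedule_next_requests)
    self._start_loop.start(self.interval_s)
    self._stop_loop = task.LoopingCall(self._stop_clear)
    self._stop_loop.start(self.interval)

def spider_idle(self):
    # Keep the spider alive; scheduling is handled by the polling tasks above.
    raise DontCloseSpider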