本文整理汇总了 Python 中 scrapy.spiders.Spider 类的典型用法代码示例。如果您正苦于以下问题：Python spiders.Spider 的具体用法？Python spiders.Spider 怎么用？Python spiders.Spider 使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 scrapy.spiders 的用法示例。
在下文中一共展示了 spiders.Spider 类的 12 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: retry_middleware_response
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def retry_middleware_response(request):
    """
    Fixture that pushes one response through the retry middleware.

    ``request.param`` is unpacked into ``(settings, status)``. A crawler is
    built from the settings, the middleware is created from that crawler,
    and the value returned by its ``process_response()`` call is yielded.
    """
    crawler_settings, response_status = request.param

    test_crawler = get_crawler(Spider, settings_dict=crawler_settings)
    test_spider = test_crawler._create_spider('foo')
    middleware = RetryUserAgentMiddleware.from_crawler(test_crawler)

    outgoing = Request('http://www.scrapytest.org/')
    incoming = Response(outgoing.url, body=b'', status=response_status)

    outcome = middleware.process_response(outgoing, incoming, test_spider)
    yield outcome
示例2: retry_middleware_exception
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def retry_middleware_exception(request):
    """
    Fixture that pushes one exception through the retry middleware.

    ``request.param`` is unpacked into ``(settings, exception)``. A crawler
    is built from the settings, the middleware is created from that crawler,
    and the value returned by its ``process_exception()`` call is yielded.
    """
    crawler_settings, raised = request.param

    test_crawler = get_crawler(Spider, settings_dict=crawler_settings)
    test_spider = test_crawler._create_spider('foo')
    middleware = RetryUserAgentMiddleware.from_crawler(test_crawler)

    outgoing = Request('http://www.scrapytest.org/')

    outcome = middleware.process_exception(outgoing, raised, test_spider)
    yield outcome
示例3: spidercls_for_request
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def spidercls_for_request(spider_loader, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Pick the spider class responsible for ``request``.

    The spider loader is asked which spiders can handle the request. When
    exactly one name comes back, that spider class is loaded and returned.
    In every other case (no match, or several matches) ``default_spidercls``
    is returned instead.

    ``log_none`` enables an error log entry when nothing matches, and
    ``log_multiple`` enables one when more than one spider matches.
    """
    candidates = spider_loader.find_by_request(request)
    match_count = len(candidates)

    # Unambiguous match: hand back the loaded class right away.
    if match_count == 1:
        return spider_loader.load(candidates[0])

    if match_count > 1:
        if log_multiple:
            logger.error('More than one spider can handle: %(request)s - %(snames)s',
                         {'request': request, 'snames': ', '.join(candidates)})
    elif log_none:
        # match_count is 0 here: nothing can handle the request.
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})

    return default_spidercls
示例4: middleware_request
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def middleware_request(request):
    """
    Fixture yielding a request after the user-agent middleware processed it.

    A crawler is built with ``request.param`` as its settings dict, the
    middleware is created from that crawler, and ``process_request()`` is
    called on a fresh request before the request is yielded.
    """
    test_crawler = get_crawler(Spider, settings_dict=request.param)
    test_spider = test_crawler._create_spider('foo')
    middleware = RandomUserAgentMiddleware.from_crawler(test_crawler)

    outgoing = Request('http://www.scrapytest.org/')
    # Return value is ignored, as in the original; only the request is yielded.
    middleware.process_request(outgoing, test_spider)
    yield outgoing
示例5: open_spider
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def open_spider(self, spider):
    """Log through the spider's logger that the stats collector was opened.

    The message includes the value of ``self._pc_mac``.
    """
    message = 'Spider RedisStatsCollector opened. curr pcmac:{}.'.format(self._pc_mac)
    spider.logger.info(message)
示例6: spider_opened
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def spider_opened(self, spider):
    """Log through the spider's logger that RedisCoreStats was opened."""
    spider_logger = spider.logger
    spider_logger.info('Spider RedisCoreStats opened.')
示例7: iter_spider_classes
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def iter_spider_classes(module):
    """Yield the instantiable spider classes that ``module`` itself defines.

    A value from the module namespace is yielded only when it is a class,
    subclasses ``Spider``, was defined in ``module`` (its ``__module__``
    equals the module's ``__name__``) and has a truthy ``name`` attribute.
    """
    # this needs to be imported here until get rid of the spider manager
    # singleton in scrapy.spider.spiders
    from scrapy.spiders import Spider

    for candidate in six.itervalues(vars(module)):
        if not inspect.isclass(candidate):
            continue
        if not issubclass(candidate, Spider):
            continue
        # Skip classes merely imported into the module from elsewhere.
        if candidate.__module__ != module.__name__:
            continue
        if getattr(candidate, 'name', None):
            yield candidate
示例8: get_crawler
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def get_crawler(spidercls=None, settings_dict=None):
    """Build and return an unconfigured Crawler object.

    ``settings_dict``, when given, is handed to the ``CrawlerRunner`` and so
    populates the crawler settings with a project level priority. When
    ``spidercls`` is falsy, the base ``Spider`` class is used.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    crawler_runner = CrawlerRunner(settings_dict)
    chosen_cls = spidercls if spidercls else Spider
    return crawler_runner.create_crawler(chosen_cls)
示例9: setup_module
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def setup_module(module):
    """Create the module-level ``spider`` global, a ``Spider`` named 'spidr'."""
    global spider
    fresh_spider = Spider('spidr')
    spider = fresh_spider
示例10: setUp
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def setUp(self):
    """Prepare the environment copy, spider, response and item for a test."""
    def _log(x):
        # Stand-in for the spider's log method: just print the message.
        print(x)

    self.environ = os.environ.copy()

    self.spider = Spider(
        'myspider',
        arg1='val1',
        start_urls=["http://example.com"],
    )
    self.spider.log = _log

    self.response = HtmlResponse(
        body=b"<html></html>",
        url="http://www.example.com/product/8798732",
    )
    self.item = TestItem({
        'nom': 'myitem',
        'prix': "56.70 euros",
        "url": "http://www.example.com/product.html?item_no=345",
    })
示例11: test_spidername_time
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def test_spidername_time(self):
    """Check that ``$spider:name`` and ``$time`` are expanded by ``_format``.

    The formatted text must contain the spider name followed by a timestamp
    shaped like ``YYYY-MM-DD HH:MM:SS`` at the end of the string.
    """
    formatted = _format("Spider: $spider:name. Item scraped at $time",
                        self.spider, self.response, self.item, {})
    # assertRegexpMatches is a deprecated alias that was removed in
    # Python 3.12; use assertRegex when the TestCase provides it and fall
    # back to the old name on interpreters that only have the alias.
    assert_regex = getattr(self, 'assertRegex', None) or self.assertRegexpMatches
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    assert_regex(formatted,
                 r'Spider: myspider. Item scraped at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
示例12: _stop_clear
# 需要导入模块: from scrapy import spiders [as 别名]
# 或者: from scrapy.spiders import Spider [as 别名]
def _stop_clear(self):
    """Check every tracked task and finalize those whose stats stopped changing.

    For each task id in ``self.spider_tids`` a ``check_times`` counter is
    advanced; each time it reaches ``self.limit_check`` it is reset and the
    task's stats snapshot is compared (by digest) with the previous one.
    Once the snapshot has stayed identical, with enqueue == dequeue, for
    ``self.limit_same`` consecutive comparisons, the task is treated as
    finished: optional redis keys are deleted, the task is dropped from
    ``self.spider_tids``, its finish time is logged and a final snapshot is
    written to the log. Finally a summary of the still-tracked task ids is
    logged.

    NOTE(review): the nesting below was reconstructed from a listing whose
    indentation had been stripped -- confirm it against the upstream source.
    """
    taskids = []
    # Iterate over a copy because entries are deleted from self.spider_tids
    # inside the loop.
    spider_tids_shot = self.spider_tids.copy()
    for taskid in spider_tids_shot:
        taskids.append(taskid)
        # After a certain amount of time, process the stats snapshot; if the
        # snapshot is unchanged, count it.
        # Once the number of identical snapshots exceeds N, the task is no
        # longer collecting data, so stop it and record the task stop time
        # (the longer the configured time the more accurate; within ten
        # minutes is about right).
        if self.spider_tids[taskid]['check_times'] != self.limit_check:
            self.spider_tids[taskid]['check_times'] += 1
        else:
            self.spider_tids[taskid]['check_times'] = 0
            stat_key = self._spider_id_task_format.format(taskid) % {'spider': self.name}
            snapshot, enqueue, dequeue = self._get_snapshot(stat_key)
            snapshot_e2d = enqueue == dequeue
            # Digest of the snapshot's string form, used only to detect change.
            snapshot_md5 = hmac.new(b'',str(snapshot).encode(),'md5').hexdigest()
            if snapshot_md5 != self.spider_tids[taskid]['stat_snapshot'] or not snapshot_e2d:
                # Something changed (or the queue is not drained): restart the count.
                self.spider_tids[taskid]['stat_snapshot'] = snapshot_md5
                self.spider_tids[taskid]['same_snapshot_times'] = 0
            else:
                self.spider_tids[taskid]['same_snapshot_times'] += 1
                if self.spider_tids[taskid]['same_snapshot_times'] >= self.limit_same:
                    # This is where the wrap-up for a finished task happens.
                    # The keys in redis that are no longer needed have to be
                    # deleted afterwards to free space.
                    # The stop-detection tag generated at program start-up
                    # is cleaned up as well.
                    if self._clear_debug_pc:
                        stat_pckey = self._spider_id_debg_format % {'spider': self.name}
                        self.server.delete(stat_pckey)
                    if self._clear_dupefilter:
                        dupefilter = self._spider_id_dupk_format.format(taskid) % {'spider': self.name}
                        self.server.delete(dupefilter)
                    # NOTE(review): module_name is assigned but not used in this block.
                    module_name = self.spider_tids[taskid]['module_name']
                    # The only thing that must stay resident in redis is the
                    # task script.
                    # Task scripts are hashed and the hash of the name is
                    # used as the redis key for storage,
                    # so even a large number of duplicate tasks stores only
                    # one copy of the task script.
                    # The spider object is likewise kept in a dict of the
                    # running program, keyed by the script hash.
                    # Because tasks may be duplicated, deleting an object
                    # [that another task may still be using] at task end is
                    # risky and hard to get right,
                    # and such objects consume few resources anyway, so they
                    # are kept resident in memory as well.
                    # After a restart, a script from an earlier task is not
                    # instantiated unless it is actually needed, which saves
                    # overhead.
                    # One more bad case: the program is closed unexpectedly
                    # before the task stop has been detected.
                    # Possible effects: the dupefilter pool is not cleaned,
                    # finish_time is not written, a few in-flight tasks are
                    # lost;
                    # other running tasks are essentially unaffected, so
                    # this case is not handled.
                    # NOTE(review): the entry is deleted before log_stat() is
                    # called -- confirm log_stat does not read spider_tids[taskid].
                    del self.spider_tids[taskid]
                    self.log_stat(taskid, 'finish_time')
                    snapshot,_,_ = self._get_snapshot(stat_key)
                    self.logger.info('Task {} is Stoped.\n'.format(taskid) + pprint.pformat(snapshot))
                    # A finished task is left out of the summary below.
                    taskids.remove(taskid)
    if len(taskids) == 0:
        self.logger.info("Spider Task is Empty.")
    else:
        if len(taskids) > self.limit_log:
            # Truncate the printed list to limit_log ids and append the total count.
            fmt_log = '{}'.format(taskids[:self.limit_log]).replace(']',', ...][num:{}]'.format(len(taskids)))
        else:
            fmt_log = '{}'.format(taskids)
        self.logger.info("Check Task Stoping {}.".format(fmt_log))