本文整理匯總了Python中scrapy.spiders.Spider方法的典型用法代碼示例。如果您正苦於以下問題:Python spiders.Spider方法的具體用法?Python spiders.Spider怎麽用?Python spiders.Spider使用的例子?那麽, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類scrapy.spiders
的用法示例。
在下文中一共展示了spiders.Spider方法的12個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: retry_middleware_response
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def retry_middleware_response(request):
    """
    Fixture that builds a crawler with the retry-user-agent middleware
    active and drives one request/response pair through it.

    Yields whatever ``process_response()`` returns (either the response
    itself or a retried request, depending on the status/settings).
    """
    middleware_settings, response_status = request.param
    crawler = get_crawler(Spider, settings_dict=middleware_settings)
    test_spider = crawler._create_spider('foo')
    middleware = RetryUserAgentMiddleware.from_crawler(crawler)
    outgoing = Request('http://www.scrapytest.org/')
    incoming = Response(outgoing.url, body=b'', status=response_status)
    yield middleware.process_response(outgoing, incoming, test_spider)
示例2: retry_middleware_exception
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def retry_middleware_exception(request):
    """
    Fixture that builds a crawler with the retry-user-agent middleware
    active and feeds a downloader exception into it.

    Yields whatever ``process_exception()`` returns (a retried request
    or ``None``).
    """
    middleware_settings, raised_exception = request.param
    crawler = get_crawler(Spider, settings_dict=middleware_settings)
    test_spider = crawler._create_spider('foo')
    middleware = RetryUserAgentMiddleware.from_crawler(crawler)
    outgoing = Request('http://www.scrapytest.org/')
    yield middleware.process_exception(outgoing, raised_exception, test_spider)
示例3: spidercls_for_request
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def spidercls_for_request(spider_loader, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Return the spider class that handles the given Request.

    Asks *spider_loader* for the spiders able to handle *request* and
    loads the class only when exactly one matches.  With zero or several
    matches, *default_spidercls* is returned instead; the ambiguous and
    empty cases can optionally be logged via *log_multiple* / *log_none*.
    """
    matching = spider_loader.find_by_request(request)
    if len(matching) == 1:
        return spider_loader.load(matching[0])
    if log_multiple and len(matching) > 1:
        logger.error('More than one spider can handle: %(request)s - %(snames)s',
                     {'request': request, 'snames': ', '.join(matching)})
    if log_none and not matching:
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})
    return default_spidercls
示例4: middleware_request
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def middleware_request(request):
    """
    Fixture that runs a single request through RandomUserAgentMiddleware
    (so the middleware can set its User-Agent header) and yields the
    mutated request for inspection.
    """
    crawler = get_crawler(Spider, settings_dict=request.param)
    test_spider = crawler._create_spider('foo')
    middleware = RandomUserAgentMiddleware.from_crawler(crawler)
    outgoing = Request('http://www.scrapytest.org/')
    middleware.process_request(outgoing, test_spider)
    yield outgoing
示例5: open_spider
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def open_spider(self, spider):
    """Announce on the spider's logger that this stats collector opened,
    including the machine identifier stored in ``self._pc_mac``."""
    message = 'Spider RedisStatsCollector opened. curr pcmac:{}.'.format(self._pc_mac)
    spider.logger.info(message)
示例6: spider_opened
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def spider_opened(self, spider):
    """Announce on the spider's logger that RedisCoreStats has opened."""
    log = spider.logger.info
    log('Spider RedisCoreStats opened.')
示例7: iter_spider_classes
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def iter_spider_classes(module):
    """Return an iterator over all spider classes defined in *module*
    that can be instantiated (i.e. which have a non-empty ``name``).

    Only classes defined in the module itself are yielded — classes
    merely imported into it are filtered out by the ``__module__`` check.
    """
    # this needs to be imported here until get rid of the spider manager
    # singleton in scrapy.spider.spiders
    from scrapy.spiders import Spider
    # vars(module).values() iterates the same objects as
    # six.itervalues(vars(module)) on both Python 2 and 3, without
    # needing the six compatibility shim.
    for obj in vars(module).values():
        if (inspect.isclass(obj)
                and issubclass(obj, Spider)
                and obj.__module__ == module.__name__
                and getattr(obj, 'name', None)):
            yield obj
示例8: get_crawler
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object.

    When *settings_dict* is given it populates the crawler settings at
    project-level priority.  *spidercls* defaults to the base Spider.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider
    runner = CrawlerRunner(settings_dict)
    chosen_cls = spidercls or Spider
    return runner.create_crawler(chosen_cls)
示例9: setup_module
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def setup_module(module):
    """Create the module-level ``spider`` fixture shared by the tests."""
    global spider
    spider = Spider(name='spidr')
示例10: setUp
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def setUp(self):
    """Snapshot os.environ and build the spider/response/item fixtures
    used by the log-format tests."""
    self.environ = os.environ.copy()
    self.spider = Spider('myspider', arg1='val1',
                         start_urls=["http://example.com"])

    def _capture(message):
        print(message)

    self.spider.log = _capture
    self.response = HtmlResponse(
        body=b"<html></html>",
        url="http://www.example.com/product/8798732",
    )
    self.item = TestItem({
        'nom': 'myitem',
        'prix': "56.70 euros",
        "url": "http://www.example.com/product.html?item_no=345",
    })
示例11: test_spidername_time
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def test_spidername_time(self):
    """The $spider:name and $time placeholders are substituted when
    formatting a log message template."""
    formatted = _format("Spider: $spider:name. Item scraped at $time",
                        self.spider, self.response, self.item, {})
    # assertRegexpMatches was deprecated in Python 3.2 and removed in
    # 3.12; assertRegex is the supported name.  The raw string keeps the
    # same pattern value while avoiding invalid-escape-sequence warnings.
    self.assertRegex(
        formatted,
        r'Spider: myspider. Item scraped at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
示例12: _stop_clear
# 需要導入模塊: from scrapy import spiders [as 別名]
# 或者: from scrapy.spiders import Spider [as 別名]
def _stop_clear(self):
    """Detect finished tasks by comparing periodic stat snapshots and
    clean up their Redis state.

    A task is considered finished once its stats snapshot (hashed with
    MD5) stays identical — and its enqueue count equals its dequeue
    count — for ``self.limit_same`` consecutive checks.  Finished tasks
    get their Redis keys deleted (debug/PC marker, dupefilter), a
    ``finish_time`` stat logged, and are removed from
    ``self.spider_tids``.  Finally a summary of still-running task ids
    is logged.
    """
    taskids = []
    # Iterate over a copy so entries can be deleted from
    # self.spider_tids while looping.
    spider_tids_shot = self.spider_tids.copy()
    for taskid in spider_tids_shot:
        taskids.append(taskid)
        # After a configured number of ticks, take a snapshot of the
        # task's stats.  If consecutive snapshots are identical, a
        # counter is incremented; once it exceeds the limit the task is
        # assumed to be collecting no more data, so it is stopped and
        # its stop time recorded.  (The longer the check interval, the
        # more accurate the detection — around ten minutes is usually
        # enough.)
        if self.spider_tids[taskid]['check_times'] != self.limit_check:
            self.spider_tids[taskid]['check_times'] += 1
        else:
            self.spider_tids[taskid]['check_times'] = 0
            stat_key = self._spider_id_task_format.format(taskid) % {'spider': self.name}
            snapshot, enqueue, dequeue = self._get_snapshot(stat_key)
            # True when everything enqueued has been dequeued — a
            # necessary condition for the task being idle.
            snapshot_e2d = enqueue == dequeue
            # MD5 is used as a cheap fingerprint of the snapshot, not
            # for security.
            snapshot_md5 = hmac.new(b'',str(snapshot).encode(),'md5').hexdigest()
            if snapshot_md5 != self.spider_tids[taskid]['stat_snapshot'] or not snapshot_e2d:
                # Snapshot changed (or queue not drained): remember the
                # new fingerprint and reset the idle counter.
                self.spider_tids[taskid]['stat_snapshot'] = snapshot_md5
                self.spider_tids[taskid]['same_snapshot_times'] = 0
            else:
                self.spider_tids[taskid]['same_snapshot_times'] += 1
                if self.spider_tids[taskid]['same_snapshot_times'] >= self.limit_same:
                    # Final cleanup for a finished task: delete the
                    # Redis keys that are no longer needed, including
                    # the stop-detection marker created at start-up.
                    if self._clear_debug_pc:
                        stat_pckey = self._spider_id_debg_format % {'spider': self.name}
                        self.server.delete(stat_pckey)
                    if self._clear_dupefilter:
                        dupefilter = self._spider_id_dupk_format.format(taskid) % {'spider': self.name}
                        self.server.delete(dupefilter)
                    module_name = self.spider_tids[taskid]['module_name']
                    # Task scripts stay resident in Redis: each script
                    # is stored under the hash of its name, so any
                    # number of duplicate tasks share one stored script.
                    # The spider object is likewise cached under the
                    # script hash in a process-local dict.  Deleting an
                    # object that other tasks may still be using would
                    # be risky and complex, and the objects are cheap,
                    # so they stay in memory; after a restart they are
                    # only instantiated again when actually needed.
                    # Worst case: the process dies before the stop is
                    # detected — the dedup pool is not cleared, no
                    # finish_time is written, and a few in-flight items
                    # are lost, but other running tasks are unaffected,
                    # so this is accepted.
                    del self.spider_tids[taskid]
                    self.log_stat(taskid, 'finish_time')
                    snapshot,_,_ = self._get_snapshot(stat_key)
                    self.logger.info('Task {} is Stoped.\n'.format(taskid) + pprint.pformat(snapshot))
                    taskids.remove(taskid)
    if len(taskids) == 0:
        self.logger.info("Spider Task is Empty.")
    else:
        # Truncate the task-id list in the log when it gets long,
        # appending the total count instead.
        if len(taskids) > self.limit_log:
            fmt_log = '{}'.format(taskids[:self.limit_log]).replace(']',', ...][num:{}]'.format(len(taskids)))
        else:
            fmt_log = '{}'.format(taskids)
        self.logger.info("Check Task Stoping {}.".format(fmt_log))