本文整理汇总了Python中scrapy.Spider类的典型用法代码示例。如果您正苦于以下问题：Python scrapy.Spider的具体用法？Python scrapy.Spider怎么用？Python scrapy.Spider使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块scrapy的用法示例。
在下文中一共展示了scrapy.Spider类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_spider_has_two_last_stats_history_when_opened_third_time
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def test_spider_has_two_last_stats_history_when_opened_third_time(
    test_settings, stats_temporary_location
):
    """Check that a third run sees the stats of the two runs before it.

    Two crawls are executed first, each storing its own marker key in the
    crawler stats. On the third crawl ``spider.stats_history`` must hold
    exactly two entries, with the most recent run at index 0.

    ``stats_temporary_location`` is requested as a fixture only; the body
    never references it.
    """
    # Earlier executions, oldest first; each one leaves a distinct marker.
    for marker in ("first_execution", "second_execution"):
        earlier_run = get_crawler(Spider, test_settings)
        earlier_run.crawl("foo_spider")
        earlier_run.stats.set_value(marker, "value")
        earlier_run.stop()

    third_run = get_crawler(Spider, test_settings)
    third_run.crawl("foo_spider")
    history = third_run.spider.stats_history
    assert len(history) == 2
    assert "second_execution" in history[0].keys()
    assert "first_execution" in history[1].keys()
    third_run.stop()
示例2: test_spider_limit_number_of_stored_stats
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def test_spider_limit_number_of_stored_stats(test_settings, stats_temporary_location):
    """Check that the stats history is capped by ``SPIDERMON_MAX_STORED_STATS``.

    With the setting at 2, three crawls are executed, each storing its own
    marker key. On the fourth crawl only the two most recent runs may remain
    in ``spider.stats_history``, newest first.

    ``stats_temporary_location`` is requested as a fixture only; the body
    never references it.
    """
    test_settings["SPIDERMON_MAX_STORED_STATS"] = 2

    # Three earlier executions, oldest first; the oldest should be evicted.
    for marker in ("first_execution", "second_execution", "third_execution"):
        earlier_run = get_crawler(Spider, test_settings)
        earlier_run.crawl("foo_spider")
        earlier_run.stats.set_value(marker, "value")
        earlier_run.stop()

    fourth_run = get_crawler(Spider, test_settings)
    fourth_run.crawl("foo_spider")
    history = fourth_run.spider.stats_history
    assert len(history) == 2
    assert "third_execution" in history[0].keys()
    assert "second_execution" in history[1].keys()
    fourth_run.stop()
示例3: process_request
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def process_request(self, request: Request, spider: Spider):
    """Short-circuit the download when nobody will read the ``Response``.

    ``utils.is_response_going_to_be_used`` decides whether the callback or
    a Page Input actually consumes the ``Response``. When it does, ``None``
    is returned and the request is downloaded as usual. Otherwise a
    ``utils.DummyResponse`` bound to the original ``Request`` is returned
    in place of a real response.

    Skipping the download saves work for spiders whose callbacks get their
    data from another source, for example an external API such as
    Scrapinghub's Auto Extract.
    """
    response_is_needed = utils.is_response_going_to_be_used(request, spider)
    if not response_is_needed:
        spider.logger.debug(f'Skipping download of {request}')
        return utils.DummyResponse(url=request.url, request=request)
    return None
示例4: create_crawler
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def create_crawler(self, crawler_or_spidercls):
    """
    Resolve the argument to a :class:`~scrapy.crawler.Crawler` object.

    * A Crawler instance is handed back unchanged.
    * A Spider subclass gets a new Crawler built for it.
    * A string is treated as the name of a spider in a Scrapy project;
      the spider is looked up (via the spider loader) and a Crawler is
      created for it.

    Raises ``ValueError`` when given a spider *instance* rather than a
    spider class.
    """
    candidate = crawler_or_spidercls
    if isinstance(candidate, Spider):
        message = (
            'The crawler_or_spidercls argument cannot be a spider object, '
            'it must be a spider class (or a Crawler object)')
        raise ValueError(message)
    return (
        candidate
        if isinstance(candidate, Crawler)
        else self._create_crawler(candidate)
    )
示例5: test_save_response_with_trim
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def test_save_response_with_trim(self):
    """An oversized page is refused, then written once trimming is enabled.

    With the writer limit set to 26, saving the padded response must only
    log a "body too large" warning. After ``trim_html`` is switched on, the
    same response must be written with its surrounding whitespace and NUL
    padding stripped from the body.
    """
    instance = self.instance
    instance._writer.maxitemsize = 26
    instance.hsref.job.key = '123/45/67'
    padded_body = '\r\n\r\n<html><body></body></html>\r\n \0\0\0\0\0'
    resp = TextResponse(
        'http://resp', request=Request('http://req'), encoding='cp1251',
        body=padded_body)

    # The logger is patched on the Spider class, so every Spider instance
    # used inside the block reports to the same mock.
    with mock.patch.object(Spider, 'logger') as log:
        spider = Spider('default')
        instance.save_response(resp, self.spider)
    log.warning.assert_called_with(
        "Page not saved, body too large: <http://resp>")

    instance.trim_html = True
    instance.save_response(resp, spider)
    expected_item = {
        u'body': u'<html><body></body></html>', u'_encoding': u'cp1251',
        u'_type': u'_pageitem',
        u'_key': u'9b4bed7e56103ddf63455ed39145f61f53b3c702',
        u'url': u'http://resp', '_jobid': '123/45/67'}
    instance._writer.write.assert_called_with(expected_item)
示例6: parse
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def parse(self, response):
    """
    Default callback function with response for the crawled url
    https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse

    ``<br>`` tags in the body are replaced by newlines before parsing. The
    property key is the text between the first and second ``=`` of the
    URL, with any ``&`` removed.

    Yields a single ``Property`` built from the parsed info dict, extended
    with the ``sales``, ``values`` and ``property_key`` entries.

    Raises ``DropItem`` (after logging a warning) when the page contains
    the text ``No Data at this time``. Because this is a generator, the
    exception surfaces when the result is iterated.
    Raises ``IndexError`` when the URL contains no ``=``.
    """
    # Decode with the canonical codec name, then turn <br> variants into
    # newlines.
    body_text = response.body.decode('utf-8')
    response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", body_text))
    property_key = response.url.split('=')[1].replace('&', '')

    # Guard clause: nothing to extract from an empty result page.
    if 'No Data at this time' in response.text:
        msg = "No data for " + response.url
        logging.warning(msg)
        raise DropItem(msg)

    property_info = self.parse_property_info(response)
    property_values = self.parse_property_values(response)
    property_sales = self.parse_property_sales(response)
    property_info['sales'] = property_sales
    property_info['values'] = property_values
    property_info['property_key'] = property_key
    yield Property(property_info)
示例7: __init__
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def __init__(self):
    """Set up the spider: base init, PhantomJS driver and crawl settings.

    The project settings are loaded once and kept on ``global_settings``.
    The ``PLATFORM`` setting selects how the PhantomJS driver is created,
    and the ``CRAWLER`` section supplies the type id list, the type id
    regular expression (compiled here) and the URL template.
    """
    scrapy.spiders.Spider.__init__(self)
    settings = get_project_settings()
    self.global_settings = settings

    platform = settings['PLATFORM']
    if platform in ('win', 'mac'):
        # These platforms need an explicit path to the PhantomJS binary.
        self.driver = webdriver.PhantomJS(executable_path=settings['PHANTOMJS_PATH'])
    elif platform == 'linux':
        self.driver = webdriver.PhantomJS()

    # NOTE(review): source indentation was lost; the two driver calls below
    # are assumed to run for every platform. With any other PLATFORM value
    # self.driver is never assigned and the next line fails.
    self.driver.set_page_load_timeout(30)
    self.driver.implicitly_wait(10)

    crawler_settings = settings['CRAWLER']
    self.type_id_list = crawler_settings['type_id_list']
    self.re_type_id = re.compile(crawler_settings['re_type_id'])
    self.url_template = crawler_settings['url_template']
示例8: __del__
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def __del__(self):
    """Quit the web driver and chain to a parent finalizer if one exists.

    Both steps are guarded so that finalisation does not raise.
    """
    # The finalizer can run on an instance whose __init__ failed before
    # ``driver`` was assigned, so look the attribute up defensively.
    driver = getattr(self, 'driver', None)
    if driver is not None:
        driver.quit()

    # The base Spider class is not known to define __del__; call it only
    # when it is actually present instead of assuming it exists.
    parent_finalizer = getattr(scrapy.spiders.Spider, '__del__', None)
    if parent_finalizer is not None:
        parent_finalizer(self)
示例9: get_crawler
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def get_crawler():
    """Return a factory that builds a Spidermon-enabled ``Crawler``.

    The returned callable accepts an optional mapping of extra settings.
    These are merged over the defaults, which enable Spidermon and
    register its Scrapy extension. The callable returns a ``Crawler`` for
    ``Spider`` whose ``spider`` attribute is preset to a ``Spider`` named
    ``"dummy"``.
    """
    def _crawler(extended_settings=None):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        # ``None`` sentinel instead of a shared mutable ``{}`` default.
        if extended_settings is not None:
            settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler
    return _crawler
示例10: make_data
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def make_data(request):
    """Return a factory producing the data dict handed to monitor runs.

    ``request`` is accepted but not used by the body. The returned callable
    takes optional crawler ``settings`` and returns a dict with the keys
    ``stats``, ``crawler``, ``spider``, ``runner`` and ``job`` (always
    ``None``).
    """
    def _make_data(settings=None):
        dummy_crawler = Crawler(Spider, settings=settings)
        dummy_spider = Spider("dummy")
        data = dict(
            stats=dummy_crawler.stats.get_stats(),
            crawler=dummy_crawler,
            spider=dummy_spider,
            runner=SpiderMonitorRunner(spider=dummy_spider),
            job=None,
        )
        return data
    return _make_data
示例11: run_test
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def run_test(self, **kwargs):
    """Run one spider-open expression monitor scenario.

    ``kwargs`` are forwarded to ``TestData``; the resulting object supplies
    the ``expression`` under test, extra ``settings``, the ``stats`` the
    crawler should report, and an optional ``expected_error``.

    Raises ``AssertionError`` carrying the trace of the first failure, of
    any error that does not contain ``expected_error``, or a message saying
    the expected error was not raised.

    NOTE(review): indentation was lost in this source; the nesting below
    (in particular the final ``if`` sitting inside the ``except`` block) is
    a reconstruction and should be confirmed against the original file.
    """
    dt = TestData(**kwargs)
    # Base settings: Spidermon on, with a single expression test registered
    # under the spider-open expression monitors.
    settings = {
        "SPIDERMON_ENABLED": True,
        "SPIDERMON_SPIDER_OPEN_EXPRESSION_MONITORS": [
            {"tests": [{"expression": dt.expression}]}
        ],
    }
    settings.update(dt.settings)
    crawler = get_crawler(settings_dict=settings)
    # Replace the stats getter so it returns the scenario's canned stats.
    crawler.stats.get_stats = lambda _: dt.stats
    spidermon = Spidermon.from_crawler(crawler)
    spider = Spider(name=self.spider_name)
    # mocking, to see test results via raising AssertionError exception
    # with failures and errors as results
    spidermon._run_suites = partial(_test_run_suites, spidermon)
    try:
        spidermon.spider_opened(spider)
    except AssertionError as e:
        # The first exception argument is unpacked as (failures, errors).
        failures, errors = e.args[0]
        for f in failures:
            # Each entry is a pair; only the trace is used. The first
            # failure is re-raised immediately.
            _, trace = f
            raise AssertionError(trace)
        for e in errors:
            _, trace = e
            # An error whose trace contains the expected text is consumed;
            # any other error is re-raised.
            if dt.expected_error and dt.expected_error in trace:
                dt.expected_error = None
            else:
                raise AssertionError(trace)
        # Still set here means no error trace matched the expectation.
        if dt.expected_error:
            raise AssertionError(
                "Expected error <{}> was not raised".format(dt.expected_error)
            )
示例12: test_spider_has_stats_history_attribute_when_opened_with_collector
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def test_spider_has_stats_history_attribute_when_opened_with_collector(
    test_settings, stats_temporary_location
):
    """Check that an opened spider has an empty ``stats_history`` deque.

    ``stats_temporary_location`` is requested as a fixture only; the body
    never references it.
    """
    running = get_crawler(Spider, test_settings)
    running.crawl("foo_spider")
    running.stats.set_value("garbage", "value")

    opened_spider = running.spider
    assert hasattr(opened_spider, "stats_history")
    assert opened_spider.stats_history == deque()
    running.stop()
示例13: test_spider_has_stats_history_queue_with_specified_max_size
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def test_spider_has_stats_history_queue_with_specified_max_size(
    test_settings, stats_temporary_location
):
    """Check that ``stats_history`` takes its ``maxlen`` from the settings.

    ``SPIDERMON_MAX_STORED_STATS`` is set to 2. The opened spider must then
    have an empty ``stats_history`` deque whose ``maxlen`` equals 2.
    """
    limit = 2
    test_settings["SPIDERMON_MAX_STORED_STATS"] = limit

    running = get_crawler(Spider, test_settings)
    running.crawl("foo_spider")
    history = running.spider.stats_history
    assert history == deque()
    assert history.maxlen == limit
    running.stop()
示例14: test_spider_has_last_stats_history_when_opened_again
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def test_spider_has_last_stats_history_when_opened_again(
    test_settings, stats_temporary_location
):
    """Check that a second run sees the stats stored by the first run.

    The first crawl stores ``first_execution`` in its stats. The second
    crawl must expose exactly one history entry carrying that value.
    """
    first_run = get_crawler(Spider, test_settings)
    first_run.crawl("foo_spider")
    first_run.stats.set_value("first_execution", "value")
    first_run.stop()

    second_run = get_crawler(Spider, test_settings)
    second_run.crawl("foo_spider")
    history = second_run.spider.stats_history
    assert len(history) == 1
    assert history[0]["first_execution"] == "value"
    second_run.stop()
示例15: spider_for
# 需要导入模块: import scrapy [as 别名]
# 或者: from scrapy import Spider [as 别名]
def spider_for(injectable: Type):
    """Build a spider class whose single request is handled via ``injectable``.

    The returned class has ``url`` set to ``None``. Its ``start_requests``
    yields one ``Request`` for ``self.url``, using
    ``callback_for(injectable)`` wrapped by ``capture_exceptions`` as the
    callback.
    """
    class InjectableSpider(scrapy.Spider):
        # Target URL; ``None`` until set on the class or an instance.
        url = None

        def start_requests(self):
            target_url = self.url
            callback = capture_exceptions(callback_for(injectable))
            yield Request(target_url, callback)

    return InjectableSpider