本文整理汇总了 Python 中 frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler.process_spider_output 方法的典型用法代码示例。如果您正苦于以下问题：Python FronteraScheduler.process_spider_output 方法的具体用法？Python FronteraScheduler.process_spider_output 怎么用？Python FronteraScheduler.process_spider_output 使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler 的用法示例。
在下文中一共展示了 FronteraScheduler.process_spider_output 方法的 4 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: test_process_spider_output
# 需要导入模块: from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler [as 别名]
# 或者: from frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler import process_spider_output [as 别名]
def test_process_spider_output(self):
    """Check FronteraScheduler.process_spider_output on mixed spider output.

    The spider result mixes three entries (r1, r2, r3) with two item dicts.
    The asserts require that only the two items come back out of the
    scheduler, that the response and the three links are recorded on the
    frontier manager, and that the crawl statistics are updated.

    NOTE(review): r1, r2, r3 and fr1 are fixtures defined outside this
    snippet -- presumably module-level requests; confirm against the
    enclosing test module.
    """
    i1 = {'name': 'item', 'item': 'i1'}
    i2 = {'name': 'item', 'item': 'i2'}
    result = [r1, r2, r3, i1, i2]
    resp = Response(fr1.url, request=Request(fr1.url, meta={'frontier_request': fr1}))
    crawler = FakeCrawler()
    fs = FronteraScheduler(crawler, manager=FakeFrontierManager)
    fs.open(Spider)

    # Dicts are not orderable on Python 3, so a bare sorted() on the items
    # raises TypeError; sort on the unique 'item' value instead.
    out_items = sorted(
        fs.process_spider_output(resp, result, Spider),
        key=lambda entry: entry['item'],
    )
    assert out_items == sorted([i1, i2], key=lambda entry: entry['item'])

    # The response must have been passed on to the frontier manager.
    assert isinstance(fs.frontier.manager.responses[0], FResponse)
    assert fs.frontier.manager.responses[0].url == resp.url

    # Every non-item entry must show up as a frontier link.
    assert {request.url for request in fs.frontier.manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(request, FRequest) for request in fs.frontier.manager.links)

    # One crawled page with status 200 and three extracted links.
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count') == 1
    assert fs.stats_manager.stats.get_value('frontera/crawled_pages_count/200') == 1
    assert fs.stats_manager.stats.get_value('frontera/links_extracted_count') == 3
示例2: test_process_spider_output
# 需要导入模块: from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler [as 别名]
# 或者: from frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler import process_spider_output [as 别名]
def test_process_spider_output(self):
    """Verify that process_spider_output returns only items and feeds the frontier.

    Three entries (r1, r2, r3) and two item dicts are passed through the
    scheduler; the items must come back, while the response, the links and
    the statistics must be recorded on the scheduler side.

    NOTE(review): r1, r2, r3 and fr1 come from outside this snippet --
    confirm their definitions in the enclosing test module.
    """

    def item_key(entry):
        # Same ordering key as before: the sorted characters of the 'item' value.
        return sorted(entry["item"])

    first_item = {"name": "item", "item": "i1"}
    second_item = {"name": "item", "item": "i2"}
    spider_output = [r1, r2, r3, first_item, second_item]
    source_request = Request(fr1.url, meta={b"frontier_request": fr1})
    response = Response(fr1.url, request=source_request)
    scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
    scheduler.open(Spider)

    returned = list(scheduler.process_spider_output(response, spider_output, Spider))
    assert sorted(returned, key=item_key) == sorted([first_item, second_item], key=item_key)

    # The response must have reached the frontier manager.
    first_response = scheduler.frontier.manager.responses[0]
    assert isinstance(first_response, FResponse)
    assert scheduler.frontier.manager.responses[0].url == response.url

    # The non-item entries must be registered as frontier links.
    link_urls = {link.url for link in scheduler.frontier.manager.links}
    assert link_urls == {r1.url, r2.url, r3.url}
    assert all(isinstance(link, FRequest) for link in scheduler.frontier.manager.links)

    # Statistics: one page crawled (status 200), three links extracted.
    expected_stats = (
        ("frontera/crawled_pages_count", 1),
        ("frontera/crawled_pages_count/200", 1),
        ("frontera/links_extracted_count", 3),
    )
    for stat_name, stat_value in expected_stats:
        assert scheduler.stats_manager.stats.get_value(stat_name) == stat_value
示例3: test_process_spider_output
# 需要导入模块: from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler [as 别名]
# 或者: from frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler import process_spider_output [as 别名]
def test_process_spider_output(self):
    """Verify process_spider_output with an instantiated spider.

    The scheduler is opened with a Spider instance named "testing". The
    number of returned entries must equal the number of items, and the
    entries r1, r2, r3 must be recorded as links on the frontier manager.

    NOTE(review): r1, r2, r3 and fr1 come from outside this snippet --
    confirm their definitions in the enclosing test module.
    """
    i1 = {'name': 'item', 'item': 'i1'}
    i2 = {'name': 'item', 'item': 'i2'}
    items = [i1, i2]
    requests = [r1, r2, r3]
    # Requests first, then items, as a fresh list.
    spider_output = requests + items
    response = Response(
        fr1.url,
        request=Request(fr1.url, meta={b'frontier_request': fr1}),
    )
    scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
    spider = Spider(name="testing")
    scheduler.open(spider)

    returned = list(scheduler.process_spider_output(response, spider_output, spider))
    assert len(returned) == len(items)

    # Link URLs recorded by the manager match the request fixtures.
    assert {link.url for link in scheduler.frontier.manager.links} == {req.url for req in requests}

    # The response must have reached the frontier manager.
    assert isinstance(scheduler.frontier.manager.responses[0], FResponse)
    assert scheduler.frontier.manager.responses[0].url == response.url

    # Same link check spelled out against the individual fixtures.
    assert {link.url for link in scheduler.frontier.manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(link, FRequest) for link in scheduler.frontier.manager.links)

    # Statistics: one page crawled (status 200), three links extracted.
    stats = scheduler.stats_manager.stats
    assert stats.get_value('frontera/crawled_pages_count') == 1
    assert stats.get_value('frontera/crawled_pages_count/200') == 1
    assert stats.get_value('frontera/links_extracted_count') == 3
示例4: test_process_spider_output
# 需要导入模块: from frontera.contrib.scrapy.schedulers.frontier import FronteraScheduler [as 别名]
# 或者: from frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler import process_spider_output [as 别名]
def test_process_spider_output(self):
    """Verify process_spider_output when it returns the requests as well as the items.

    The output must have the same length as the input: the first three
    entries are compared by URL with r1, r2, r3, and the remaining entries
    must be the two item dicts. The frontier manager and the statistics are
    checked afterwards.

    NOTE(review): r1, r2, r3 and fr1 come from outside this snippet --
    confirm their definitions in the enclosing test module.
    """

    def item_key(entry):
        # Same ordering key as before: the sorted characters of the 'item' value.
        return sorted(entry['item'])

    i1 = {'name': 'item', 'item': 'i1'}
    i2 = {'name': 'item', 'item': 'i2'}
    request_count = 3
    spider_output = [r1, r2, r3, i1, i2]
    source_request = Request(fr1.url, meta={b'frontier_request': fr1})
    response = Response(fr1.url, request=source_request)
    scheduler = FronteraScheduler(FakeCrawler(), manager=FakeFrontierManager)
    scheduler.open(Spider)

    returned = list(scheduler.process_spider_output(response, spider_output, Spider))
    assert len(returned) == len(spider_output)

    # Leading entries are compared by URL with the leading input entries.
    returned_requests = returned[:request_count]
    expected_urls = {entry.url for entry in spider_output[:request_count]}
    assert {entry.url for entry in returned_requests} == expected_urls

    # Trailing entries must be the two items.
    returned_items = returned[request_count:]
    assert sorted(returned_items, key=item_key) == sorted([i1, i2], key=item_key)

    # The response must have reached the frontier manager.
    assert isinstance(scheduler.frontier.manager.responses[0], FResponse)
    assert scheduler.frontier.manager.responses[0].url == response.url

    # The three entries must also be registered as frontier links.
    assert {link.url for link in scheduler.frontier.manager.links} == {r1.url, r2.url, r3.url}
    assert all(isinstance(link, FRequest) for link in scheduler.frontier.manager.links)

    # Statistics: one page crawled (status 200), three links extracted.
    stats = scheduler.stats_manager.stats
    assert stats.get_value('frontera/crawled_pages_count') == 1
    assert stats.get_value('frontera/crawled_pages_count/200') == 1
    assert stats.get_value('frontera/links_extracted_count') == 3