This article collects typical usage examples of the Python method scrapy.crawler.CrawlerRunner.create_crawler. If you are wondering what CrawlerRunner.create_crawler does, how to call it, or what real-world code using it looks like, the hand-picked examples below may help. You can also read more about the class it belongs to, scrapy.crawler.CrawlerRunner.
The following shows 8 code examples of CrawlerRunner.create_crawler, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
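Before the examples, here is a minimal sketch of the typical pattern (not taken from the examples below; the QuotesSpider spider and its start URL are made-up placeholders). create_crawler accepts either a spider class or a spider name registered via SPIDER_MODULES and returns a Crawler object whose crawl() method yields a Twisted Deferred:

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import reactor

class QuotesSpider(scrapy.Spider):
    # hypothetical spider used only for this sketch
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('span.text::text').getall():
            yield {'text': text}

configure_logging()
runner = CrawlerRunner({'LOG_LEVEL': 'INFO'})
crawler = runner.create_crawler(QuotesSpider)   # spider class (or registered name) -> Crawler
d = crawler.crawl()                             # returns a Deferred; the crawl runs inside the reactor
d.addBoth(lambda _: reactor.stop())
reactor.run()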
Example 1: test_crawler_runner_loading
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
def test_crawler_runner_loading(self):
    module = 'tests.test_spiderloader.test_spiders.spider1'
    runner = CrawlerRunner({'SPIDER_MODULES': [module]})

    self.assertRaisesRegexp(KeyError, 'Spider not found',
                            runner.create_crawler, 'spider2')

    crawler = runner.create_crawler('spider1')
    self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
    self.assertEqual(crawler.spidercls.name, 'spider1')
Example 2: get_crawler
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
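A hedged sketch of how a helper like this might be used in a test; the setting value checked here is only an illustration:

from scrapy.spiders import Spider

crawler = get_crawler(settings_dict={'DOWNLOAD_DELAY': 2})
assert issubclass(crawler.spidercls, Spider)
assert crawler.settings.getfloat('DOWNLOAD_DELAY') == 2.0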
Example 3: setUp
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
def setUp(self):
    settings = Settings()
    settings.setmodule(undercrawler.settings)
    settings['DOWNLOAD_DELAY'] = 0.1
    settings['ITEM_PIPELINES']['tests.utils.CollectorPipeline'] = 100
    splash_url = os.environ.get('SPLASH_URL')
    if splash_url:
        settings['SPLASH_URL'] = splash_url
    settings.update(self.settings)
    runner = CrawlerRunner(settings)
    self.crawler = runner.create_crawler(BaseSpider)
Example 4: scrapy_embedding
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
def scrapy_embedding(spidercls):
    settings = get_scrapy_settings()
    # we could build a Crawler manually, but CrawlerRunner does it in a more
    # sophisticated way and adds support for spider names given as strings
    runner = CrawlerRunner(settings)
    crawler = runner.create_crawler(spidercls)
    crawler.engine = crawler._create_engine()
    crawler.engine.start()
    # log.start(logstdout=False)
    return crawler
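Note that _create_engine() is a leading-underscore (i.e. private) Crawler method, so an embedding trick like this bypasses the normal crawl() lifecycle and may break across Scrapy versions.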
Example 5: CrawlTestCase
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl()
        assert crawler.stats.get_value('item_scraped_count') == 3
Example 6: FileDownloadCrawlTestCase
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
class FileDownloadCrawlTestCase(TestCase):
    pipeline_class = 'scrapy.pipelines.files.FilesPipeline'
    store_setting_key = 'FILES_STORE'
    media_key = 'files'
    media_urls_key = 'file_urls'
    expected_checksums = set([
        '5547178b89448faf0015a13f904c936e',
        'c2281c83670e31d8aaab7cb642b824db',
        'ed3f6538dc15d4d9179dae57319edc5f'])

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

        # prepare a directory for storing files
        self.tmpmediastore = self.mktemp()
        os.mkdir(self.tmpmediastore)
        self.settings = {
            'ITEM_PIPELINES': {self.pipeline_class: 1},
            self.store_setting_key: self.tmpmediastore,
        }
        self.runner = CrawlerRunner(self.settings)
        self.items = []

    def tearDown(self):
        shutil.rmtree(self.tmpmediastore)
        self.items = []
        self.mockserver.__exit__(None, None, None)

    def _on_item_scraped(self, item):
        self.items.append(item)

    def _create_crawler(self, spider_class, **kwargs):
        crawler = self.runner.create_crawler(spider_class, **kwargs)
        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        return crawler

    def _assert_files_downloaded(self, items, logs):
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])

        # check that logs show the expected number of successful file downloads
        file_dl_success = 'File (downloaded): Downloaded file from'
        self.assertEqual(logs.count(file_dl_success), 3)

        # check that the images/files checksums are what we know they should be
        if self.expected_checksums is not None:
            checksums = set(
                i['checksum']
                for item in items
                for i in item[self.media_key])
            self.assertEqual(checksums, self.expected_checksums)

        # check that the image files were actually written to the media store
        for item in items:
            for i in item[self.media_key]:
                self.assertTrue(
                    os.path.exists(
                        os.path.join(self.tmpmediastore, i['path'])))

    def _assert_files_download_failure(self, crawler, items, code, logs):
        # check that the item does NOT have the "images/files" field populated
        self.assertEqual(len(items), 1)
        self.assertIn(self.media_key, items[0])
        self.assertFalse(items[0][self.media_key])

        # check that there was 1 successful fetch and 3 other responses with non-200 code
        self.assertEqual(crawler.stats.get_value('downloader/request_method_count/GET'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_count'), 4)
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/200'), 1)
        self.assertEqual(crawler.stats.get_value('downloader/response_status_count/%d' % code), 3)

        # check that logs do show the failure on the file downloads
        file_dl_failure = 'File (code: %d): Error downloading file from' % code
        self.assertEqual(logs.count(file_dl_failure), 3)

        # check that no files were written to the media store
        self.assertEqual(os.listdir(self.tmpmediastore), [])

    @defer.inlineCallbacks
    def test_download_media(self):
        crawler = self._create_crawler(MediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:8998/files/images/",
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_downloaded(self.items, str(log))

    @defer.inlineCallbacks
    def test_download_media_wrong_urls(self):
        crawler = self._create_crawler(BrokenLinksMediaDownloadSpider)
        with LogCapture() as log:
            yield crawler.crawl("http://localhost:8998/files/images/",
                                media_key=self.media_key,
                                media_urls_key=self.media_urls_key)
        self._assert_files_download_failure(crawler, self.items, 404, str(log))

    @defer.inlineCallbacks
    def test_download_media_redirected_default_failure(self):
        # ......... some code omitted here .........
Example 7: CrawlTestCase
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self.runner = CrawlerRunner()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        crawler = self.runner.create_crawler(FollowAllSpider)
        yield crawler.crawl(mockserver=self.mockserver)
        self.assertEqual(len(crawler.spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        crawler = CrawlerRunner(settings).create_crawler(FollowAllSpider)
        yield crawler.crawl(maxlatency=delay * 2, mockserver=self.mockserver)
        t = crawler.spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        crawler = self.runner.create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 > 0)
        self.assertTrue(crawler.spider.t2 > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl(self.mockserver.url("/status?n=503"), mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            yield crawler.crawl("http://localhost:65432/status?n=503", mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as l:
            # try to fetch the homepage of a non-existent domain
            yield crawler.crawl("http://dns.resolution.invalid./", mockserver=self.mockserver)
        self._assert_retried(l)

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_before_yield=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        record = l.records[0]
        self.assertIsNotNone(record.exc_info)
        self.assertIs(record.exc_info[0], ZeroDivisionError)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        with LogCapture('scrapy', level=logging.ERROR) as l:
            crawler = self.runner.create_crawler(BrokenStartRequestsSpider)
            yield crawler.crawl(fail_yielding=1, mockserver=self.mockserver)

        self.assertEqual(len(l.records), 1)
        # ......... some code omitted here .........
Example 8: make_crawler
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import create_crawler [as alias]
def make_crawler(settings, **extra_settings):
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(BaseSpider)
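A hedged usage sketch for the helper above; base_settings, the local URL, the url keyword argument and the finish_reason check are assumptions made for illustration, and BaseSpider comes from the surrounding project:

from twisted.internet import defer

@defer.inlineCallbacks
def run_base_spider(base_settings):
    # build the crawler with an extra per-run setting and wait for the crawl to finish
    crawler = make_crawler(dict(base_settings), DOWNLOAD_DELAY=0.25)
    yield crawler.crawl(url='http://localhost:8000/')  # kwargs are passed to BaseSpider (assumed)
    defer.returnValue(crawler.stats.get_value('finish_reason'))  # normally 'finished'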