This article collects typical usage examples of Python's scrapy.crawler.CrawlerRunner class. If you have been asking yourself what exactly CrawlerRunner does, how to use it, or what working CrawlerRunner code looks like, the hand-picked class examples here should help.
Fifteen code examples of the CrawlerRunner class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
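All of the examples share the same basic pattern: build a CrawlerRunner from a Settings object (or nothing), schedule one or more spiders with crawl(), and drive the Twisted reactor yourself. As a reference point, here is a minimal self-contained sketch of that pattern; the QuotesSpider class and the quotes.toscrape.com URL are illustrative placeholders and do not come from any of the examples below.

from twisted.internet import reactor
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class QuotesSpider(scrapy.Spider):
    # placeholder spider, used only to illustrate the pattern
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}


configure_logging()                    # unlike CrawlerProcess, CrawlerRunner does not set up logging for you
runner = CrawlerRunner()               # optionally pass a Settings instance or a plain dict
d = runner.crawl(QuotesSpider)         # returns a Deferred that fires when the crawl finishes
d.addBoth(lambda _: reactor.stop())    # stop the reactor once the crawl is done, success or failure
reactor.run()                          # blocks until the crawl finishes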
Example 1: run_crawler_by_runner
def run_crawler_by_runner():
    # `spiders` is assumed to be an iterable of spider classes defined elsewhere in the project
    runner = CrawlerRunner(get_project_settings())
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 2: CrawlerRunnerTest
class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')

        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d

        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
Example 3: handle_lj
def handle_lj(self):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(crawler_setting)
    # d = runner.crawl(HouseSpider)
    d = runner.crawl(LianjiaHouseSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 4: run
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        # 'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    spider = EntertainmentcareersSpider()
    settings = get_project_settings()
    settings.update(options)
    runner = CrawlerRunner(settings)
    # note: recent Scrapy versions expect the spider class here, not an instance
    runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # the commented-out lines below appear to use the legacy pre-1.0 Scrapy Crawler/log API and are kept for reference
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example 5: crawl_articles
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Example 6: test_crawler_process
def test_crawler_process(self):
    runner = CrawlerRunner(self.settings)
    d = runner.crawl(CustomSpider)
    d.addBoth(lambda _: reactor.stop())

    # add crawl to redis
    key = "test-spider:istresearch.com:queue"
    self.redis_conn.zadd(key, self.example_feed, -99)

    # run the spider, give 20 seconds to see the url, crawl it,
    # and send to kafka. Then we kill the reactor
    def thread_func():
        time.sleep(20)
        reactor.stop()

    thread = threading.Thread(target=thread_func)
    thread.start()

    reactor.run()

    message_count = 0
    m = next(self.consumer)
    if m is None:
        pass
    else:
        the_dict = json.loads(m.value)
        if the_dict is not None and the_dict['appid'] == 'test' \
                and the_dict['crawlid'] == 'abc12345':
            message_count += 1

    self.assertEquals(message_count, 1)
Example 7: crawler_start
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.

    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
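Assuming the module-level maps and spider lists (CRAWLER_TASK_MAPS, TEMP_TASK_MAPS, DEFAULT_CRAWLERS, DEFAULT_VALIDATORS) are configured as in the surrounding project, a hypothetical invocation of this helper might look like the sketch below; the task names come from the docstring above, and any usage value other than 'crawler' selects the validator spiders.

# run only the ajax and gfw crawler spiders (task names as listed in the docstring)
crawler_start('crawler', ['ajax', 'gfw'])

# run every validator spider by passing an empty task list
crawler_start('validator', [])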
Example 8: run_spider
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    #runner = CrawlerRunner(settings)
    #runner.crawl(BookToscrapeSpider())

    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    # note: recent Scrapy versions expect the spider class here, not an instance
    runner.crawl(BookToscrapeSpider_crawl())

    # the commented-out lines below appear to use the legacy pre-1.0 Scrapy Crawler/log API and are kept for reference
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 9: test_same_url
def test_same_url(self):

    class TestSameUrlSpider(Spider):
        name = 'test_same_url'

        def __init__(self, *args, **kwargs):
            super(TestSameUrlSpider, self).__init__(*args, **kwargs)
            self.visited = 0

        def start_requests(s):
            # `s` is the spider instance; `self` refers to the enclosing test case via closure
            return self.conman.from_spider(s, self.results)

        def parse_first(self, response):
            self.visited += 1
            return TestItem()

        def parse_second(self, response):
            self.visited += 1
            return TestItem()

    with MockServer() as mockserver:
        contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))
        get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
        get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

        crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
        yield crawler.crawl()

    self.assertEqual(crawler.spider.visited, 2)
Example 10: test_start_requests_dupes
# assumption: the yield-based test needs to be driven by Twisted, e.g. with inlineCallbacks
@defer.inlineCallbacks
def test_start_requests_dupes(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)

    yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3,
                        mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 6)

    yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4,
                        mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 3)
Example 11: crawl
def crawl(self):
    spider = Scrapy_ModuleSpider()
    Runner = CrawlerRunner(self.Scrapy_Module_setting)
    # note: recent Scrapy versions expect the spider class here rather than an instance
    cra = Runner.crawl(spider)
    # stop reactor when spider closes
    cra.addBoth(lambda _: self.spider_closing(cra))
    self.logger.info("Run reactor")
    reactor.run()
Example 12: runSpider
def runSpider(self, spider):
    configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
    settings = Settings()
    settings.set('FEED_URI', 'output.json')
    settings.set('FEED_FORMAT', 'json')

    runner = CrawlerRunner(settings)
    dfd = runner.crawl(spider)
    dfd.addBoth(lambda _: reactor.stop())
    # note: reactor.run() is not called here; the caller is presumably expected to start the Twisted reactor
Example 13: runProcess
def runProcess(self):
    configure_logging()
    dbHandler.check_watches()
    runner = CrawlerRunner()
    runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Author: NemirovD; Project: Interval-Scheduling-Algorithm-with-Applied-Constraints; Lines: 10; Source file: spiderworker.py
Example 14: run_login_spider
def run_login_spider(seed_url, username, password, db_name, logfile="results.log"):
    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url=seed_url, username=username, password=password)
    d.addBoth(lambda _: reactor.stop())
    # note: log.start/log.msg below use the legacy scrapy.log API, which is no longer available in current Scrapy releases
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level=log.INFO)
    reactor.run()
Example 15: test_crawler_runner_loading
def test_crawler_runner_loading(self):
    module = 'tests.test_spiderloader.test_spiders.spider1'
    runner = CrawlerRunner({'SPIDER_MODULES': [module]})

    self.assertRaisesRegexp(KeyError, 'Spider not found',
                            runner.create_crawler, 'spider2')

    crawler = runner.create_crawler('spider1')
    self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
    self.assertEqual(crawler.spidercls.name, 'spider1')