This article collects typical usage examples of scrapy.crawler.CrawlerRunner in Python. If you have been wondering what crawler.CrawlerRunner does, how to use it, or what real code that uses it looks like, the curated examples below may help. You can also explore further usage examples from the scrapy.crawler module, where this class is defined.
Eleven code examples of crawler.CrawlerRunner are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
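Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them share: build a CrawlerRunner from the project settings, schedule one or more crawls (each returns a Twisted Deferred), and run the reactor until they finish. The spider class passed in is a placeholder, not part of any example below.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def run_single_spider(spider_cls):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl(spider_cls)           # schedule the crawl; returns a Deferred
    d.addBoth(lambda _: reactor.stop())    # stop the reactor when the crawl ends
    reactor.run()                          # block until reactor.stop() is called

# run_single_spider(MySpider)  # MySpider is a hypothetical spider class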
Example 1: run
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings


def run():
    configure_logging()
    # import the project settings for further use,
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # run the spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)   # project-specific spider classes
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block here until the last crawl has finished
Example 2: crawl_runner
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
            '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings)
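A hypothetical call to the helper above, illustrating its two branches: when a SPLASH_URL is supplied, the Splash-aware dupefilter and middlewares are enabled; otherwise the cookie-exposing autologin middleware is used. The URL is a placeholder, and base_settings must already exist in the surrounding module.

# Hypothetical usage of crawl_runner (placeholder Splash endpoint):
splash_runner = crawl_runner({'SPLASH_URL': 'http://localhost:8050'})
plain_runner = crawl_runner()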
Example 3: crawler_start
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass

    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 4: test_crawler_process
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def test_crawler_process(self):
    runner = CrawlerRunner(self.settings)
    d = runner.crawl(CustomSpider)
    d.addBoth(lambda _: reactor.stop())

    # add crawl to redis
    key = "test-spider:dmoztools.net:queue"
    self.redis_conn.zadd(key, self.example_feed, -99)

    # run the spider, give 20 seconds to see the url, crawl it,
    # and send to kafka. Then we kill the reactor
    def thread_func():
        time.sleep(20)
        reactor.stop()

    thread = threading.Thread(target=thread_func)
    thread.start()

    reactor.run()

    message_count = 0
    m = next(self.consumer)

    if m is None:
        pass
    else:
        the_dict = json.loads(m.value)
        if the_dict is not None and the_dict['appid'] == 'test' \
                and the_dict['crawlid'] == 'abc12345':
            message_count += 1

    self.assertEquals(message_count, 1)
Example 5: runspider
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def runspider(self):
    configure_logging(install_root_handler=False)
    s = get_project_settings()
    runner = CrawlerRunner(settings=s)

    @defer.inlineCallbacks
    def crawl(**spargs):
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    crawl(**self.spargs)
    reactor.run()  # the script will block here until the last crawl call is finished

# scheduling analysis
Example 6: get_crawler
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
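A hypothetical use of the helper above, for example in a test: the returned Crawler is not configured or started, but it already carries the merged settings, so they can be inspected directly.

crawler = get_crawler(settings_dict={'ROBOTSTXT_OBEY': False})
assert crawler.settings.getbool('ROBOTSTXT_OBEY') is False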
Example 7: return_spider_output
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def return_spider_output(output):
    """
    Turn scrapy output into a list of dictionaries.

    :param output: items scraped by CrawlerRunner
    :type output: list
    :return: JSON-serialisable list of item dicts
    """
    # this just turns the items into dictionaries
    return [dict(item) for item in output]
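A hypothetical illustration of return_spider_output: QuoteItem and the two items are invented for this snippet; any iterable of scraped items will do.

import scrapy

class QuoteItem(scrapy.Item):
    text = scrapy.Field()

items = [QuoteItem(text='hello'), QuoteItem(text='world')]
print(return_spider_output(items))  # -> [{'text': 'hello'}, {'text': 'world'}]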
Example 8: __init__
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def __init__(self, settings, spider, args):
    """
    Init the parser.
    :param settings: Scrapy settings object
    :param spider: name of the spider to load
    :param args: additional arguments passed to the parser
    """
    self.args = args
    self.spider = spider
    self.crawler_process = CrawlerRunner(settings)
    self.spider_loader = self.crawler_process.spider_loader
    self.spidercls = self.spider_loader.load(self.spider)
Example 9: get_start_requests
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def get_start_requests(project_path, spider_name):
    """
    Get the start requests of a spider.
    :param project_path: project path
    :param spider_name: spider name
    :return: dict with the processed start requests
    """
    work_cwd = os.getcwd()
    try:
        # change the working directory to the project
        os.chdir(project_path)
        # load the project settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add the crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get the crawler
        crawler = list(runner.crawlers)[0]
        # get the spider from the crawler
        spider = crawler.spider
        # collect the start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        # restore the original working directory
        os.chdir(work_cwd)
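A hypothetical call to the helper above; the project path and spider name are placeholders for a real Scrapy project on disk.

result = get_start_requests('/path/to/scrapy/project', 'example_spider')
if result['finished']:
    for request in result['requests']:
        print(request)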
Example 10: init_crawler_runner
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)

# TODO: move these to config file?
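A hedged sketch of how a runner initialised this way is typically driven: crochet.setup() runs the Twisted reactor in a background thread, so ordinary code (a web view, a CLI handler) can fire crawls without blocking on reactor.run(). schedule_crawl and MySpider are illustrative names, not part of the original project.

import crochet

@crochet.run_in_reactor
def schedule_crawl(spider_cls, **kwargs):
    # executed in the reactor thread created by crochet.setup()
    return CRAWLER_RUNNER.crawl(spider_cls, **kwargs)

# schedule_crawl(MySpider).wait(timeout=60)  # block the caller for up to 60 seconds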
Example 11: run_spider2
# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until all crawling jobs are finished