This article collects typical usage examples of the CrawlerRunner.join method from Python's scrapy.crawler module. If you are struggling with questions such as what exactly Python's CrawlerRunner.join does, how to use it, or where to find working examples of it, then the curated code samples below may help. You can also read further about the class this method belongs to, scrapy.crawler.CrawlerRunner.
The following shows 13 code examples of the CrawlerRunner.join method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
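Before the examples, here is a minimal, self-contained sketch of the pattern they all share (MySpider and its start URL are hypothetical placeholders, not taken from any example below): schedule one or more crawls on a CrawlerRunner, call join() to get a Deferred that fires once every scheduled crawl has finished, stop the Twisted reactor from that Deferred, and finally start the reactor.
# Minimal sketch of the CrawlerRunner.join pattern; MySpider is a hypothetical spider.
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class MySpider(scrapy.Spider):
    name = 'my_spider'
    start_urls = ['http://quotes.toscrape.com']

    def parse(self, response):
        # Yield a single item per page; a real spider would extract more.
        yield {'title': response.css('title::text').get()}


configure_logging()
runner = CrawlerRunner()
runner.crawl(MySpider)
d = runner.join()                      # Deferred that fires when all scheduled crawls finish
d.addBoth(lambda _: reactor.stop())    # stop the reactor once everything is done
reactor.run()                          # blocks here until reactor.stop() is called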
Example 1: crawler_start
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 2: crawl_articles
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Example 3: run_crawler_by_runner
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    [runner.crawl(spider) for spider in spiders]
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 4: run
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        # 'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    spider = EntertainmentcareersSpider()
    settings = get_project_settings()
    settings.update(options)
    runner = CrawlerRunner(settings)
    runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example 5: run_spider
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)
    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    # runner = CrawlerRunner(settings)
    # runner.crawl(BookToscrapeSpider())
    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    runner.crawl(BookToscrapeSpider_crawl())
    # crawler = Crawler(settings)
    # crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    # crawler.install()
    # crawler.configure()
    # crawler.crawl(spider)
    # crawler.start()
    # log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 6: runProcess
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def runProcess(self):
    configure_logging()
    dbHandler.check_watches()
    runner = CrawlerRunner()
    runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Author: NemirovD, Project: Interval-Scheduling-Algorithm-with-Applied-Constraints, Lines of code: 12, Source file: spiderworker.py
Example 7: runSpider
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 8: main
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    # settings.set('FEED_FORMAT', 'json')
    # settings.set('FEED_URI', 'result.json')
    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    result = reactor.run()  # the script will block here until the crawling is finished
    print(result)
Example 9: Runner
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
class Runner(object):
    def __init__(self, *args, **kwargs):
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings)

    def add(self, *a, **kw):
        crawler = Crawler(BroadSpider, self.settings)
        self.runner.crawl(crawler, *a, **kw)

    def start(self):
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
Example 10: CrawlTestCase
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
# ......... part of the code is omitted here .........
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """
        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
Example 11: get_project_settings
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
from time import sleep
import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from PTTRank.spiders.ptt import PttSpider
settings = get_project_settings()
runner = CrawlerRunner(settings)
runner.crawl(PttSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
Example 12: callSpiderWithCrawlerRunner
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def callSpiderWithCrawlerRunner(self):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(self.spider)
    dispatcher.connect(self.spider_closing, signal=signals.spider_closed)
    runner.join()
Example 13: TmallSpider
# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
# -*- coding: utf-8 -*-
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from tmall import TmallSpider
spider = TmallSpider(domain='tmall.com')
crawler = CrawlerRunner()
crawler.crawl(spider)
d = crawler.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()