This article collects typical usage examples of the Python method scrapy.crawler.CrawlerRunner.crawl. If you are wondering how CrawlerRunner.crawl is used in practice, the hand-picked code examples below may help. You can also explore further usage examples of the class it belongs to, scrapy.crawler.CrawlerRunner.
The following shows 15 code examples of the CrawlerRunner.crawl method, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
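Before the examples, here is a minimal, self-contained sketch of the pattern most of them share: CrawlerRunner.crawl() only schedules a crawl and returns a Deferred, so the caller must start the Twisted reactor itself and stop it once all crawls have finished. The spider classes in the usage note are placeholders, not classes from any of the projects below.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

def run_spiders(spider_classes):
    """Schedule every spider class, then block until all crawls are done."""
    configure_logging()  # install Scrapy's default logging handlers
    runner = CrawlerRunner(get_project_settings())
    for spider_cls in spider_classes:
        runner.crawl(spider_cls)  # schedules the crawl, does not block
    d = runner.join()  # Deferred that fires when every scheduled crawl finishes
    d.addBoth(lambda _: reactor.stop())  # stop the reactor on success or failure
    reactor.run()  # blocks here until reactor.stop() is called

# Usage (MySpiderA and MySpiderB are hypothetical spider classes):
# run_spiders([MySpiderA, MySpiderB])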
Example 1: run_spider
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)
    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    #runner = CrawlerRunner(settings)
    #runner.crawl(BookToscrapeSpider())
    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    # note: recent Scrapy versions reject spider instances here; pass the class instead,
    # i.e. runner.crawl(BookToscrapeSpider_crawl)
    runner.crawl(BookToscrapeSpider_crawl())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 2: run
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }
    spider = EntertainmentcareersSpider()
    settings = get_project_settings()
    settings.update(options)
    runner = CrawlerRunner(settings)
    runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Example 3: crawler_start
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        # crawler_logger.warning('no spider starts up, please check your task input')
        return
    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 4: crawl_articles
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Example 5: test_start_requests_dupes
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def test_start_requests_dupes(self):
    settings = {"CONCURRENT_REQUESTS": 1}
    crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
    yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 6)
    yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
    self.assertEqual(crawler.spider.visited, 3)
Example 6: runProcess
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def runProcess(self):
    configure_logging()
    dbHandler.check_watches()
    runner = CrawlerRunner()
    runner.crawl(spider.available_courses_spider)
    dbHandler.check_watches()
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: NemirovD, Project: Interval-Scheduling-Algorithm-with-Applied-Constraints, Lines of code: 12, Source file: spiderworker.py
Example 7: runSpider
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 8: test_crawler_runner_bootstrap_failed_for_several
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def test_crawler_runner_bootstrap_failed_for_several(self):
    runner = CrawlerRunner()
    try:
        yield runner.crawl(ExceptionSpider)
    except ValueError:
        pass
    else:
        self.fail('Exception should be raised from spider')
    yield runner.crawl(NoRequestsSpider)
    self.assertEqual(runner.bootstrap_failed, True)
Example 9: test_timeout_failure
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def test_timeout_failure(self):
    crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
    yield crawler.crawl(n=0.5, mockserver=self.mockserver)
    self.assertTrue(crawler.spider.t1 > 0)
    self.assertTrue(crawler.spider.t2 == 0)
    self.assertTrue(crawler.spider.t2_err > 0)
    self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
    # server hangs after receiving response headers
    yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
    self.assertTrue(crawler.spider.t1 > 0)
    self.assertTrue(crawler.spider.t2 == 0)
    self.assertTrue(crawler.spider.t2_err > 0)
    self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
Example 10: startprocess
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def startprocess(queue):
    runner = CrawlerRunner(get_project_settings())
    dfs = set()
    # in the callback, argument 1 identifies linkspider
    l = runner.crawl('linkspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
    dfs.add(l)
    # in the callback, argument 2 identifies srcspider
    s = runner.crawl('srcspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
    dfs.add(s)
    # in the callback, argument 3 identifies codespider
    c = runner.crawl('codespider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
    dfs.add(c)
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    # the script will block here until all crawling jobs are finished
    reactor.run()
Example 11: test_same_url
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def test_same_url(self):
    class TestSameUrlSpider(Spider):
        name = 'test_same_url'

        def __init__(self, *args, **kwargs):
            super(TestSameUrlSpider, self).__init__(*args, **kwargs)
            self.visited = 0

        def start_requests(s):
            return self.conman.from_spider(s, self.results)

        def parse_first(self, response):
            self.visited += 1
            return TestItem()

        def parse_second(self, response):
            self.visited += 1
            return TestItem()

    with MockServer() as mockserver:
        contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))
        get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
        get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc
        crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
        yield crawler.crawl()
    self.assertEqual(crawler.spider.visited, 2)
Example 12: run_crawler_by_runner
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 13: CrawlerRunnerTest
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')
        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
Example 14: handle_lj
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def handle_lj(self):
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    runner = CrawlerRunner(crawler_setting)
    #d = runner.crawl(HouseSpider)
    d = runner.crawl(LianjiaHouseSpider)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example 15: test_crawler_process
# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def test_crawler_process(self):
    runner = CrawlerRunner(self.settings)
    d = runner.crawl(CustomSpider)
    d.addBoth(lambda _: reactor.stop())
    # add crawl to redis
    key = "test-spider:istresearch.com:queue"
    self.redis_conn.zadd(key, self.example_feed, -99)
    # run the spider, give 20 seconds to see the url, crawl it,
    # and send to kafka. Then we kill the reactor
    def thread_func():
        time.sleep(20)
        reactor.stop()
    thread = threading.Thread(target=thread_func)
    thread.start()
    reactor.run()
    message_count = 0
    m = next(self.consumer)
    if m is None:
        pass
    else:
        the_dict = json.loads(m.value)
        if the_dict is not None and the_dict['appid'] == 'test' \
                and the_dict['crawlid'] == 'abc12345':
            message_count += 1
    self.assertEqual(message_count, 1)