

Python CrawlerRunner.crawl Method Code Examples

This article collects typical code examples of the scrapy.crawler.CrawlerRunner.crawl method in Python. If you are unsure what CrawlerRunner.crawl does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of the containing class, scrapy.crawler.CrawlerRunner.


The following presents 15 code examples of the CrawlerRunner.crawl method, ordered by popularity by default.
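All fifteen examples below follow the same basic pattern: create a CrawlerRunner, schedule one or more spiders with crawl(), and run the Twisted reactor until every scheduled crawl has finished. For orientation, here is a minimal, self-contained sketch of that pattern; the spider class MySpider, its start URL, and the settings dict are placeholders for illustration and are not taken from any of the projects below.

from twisted.internet import reactor
from scrapy import Spider
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

class MySpider(Spider):
    # Placeholder spider used only to illustrate the pattern.
    name = 'my_spider'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}

configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
runner = CrawlerRunner({'USER_AGENT': 'Mozilla/5.0 (example)'})
runner.crawl(MySpider)                # schedule the spider; returns a Deferred
d = runner.join()                     # Deferred that fires when all crawls finish
d.addBoth(lambda _: reactor.stop())   # stop the reactor on success or failure
reactor.run()                         # block here until crawling is done

Unlike CrawlerProcess, CrawlerRunner does not start or stop the reactor on its own, which is why every example ends with an explicit reactor.run() and a callback that stops the reactor once the crawls are done.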

Example 1: run_spider

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def run_spider():
	options = {
	    'CONCURRENT_ITEMS': 250,
	    'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
	    'CONCURRENT_REQUESTS': 30,
	    'DOWNLOAD_DELAY': 0.5,
	    'COOKIES_ENABLED': False,
	    }

	settings = get_project_settings()
	configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
	settings.update(options)

	#BookToscrapeSpider basic version
	from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
	#runner = CrawlerRunner(settings)
	#runner.crawl(BookToscrapeSpider())

	#BookToscrapeSpider crawl version
	from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
	runner = CrawlerRunner(settings)
	runner.crawl(BookToscrapeSpider_crawl)  # pass the spider class, not an instance

	#crawler = Crawler(settings)
	#crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
	#crawler.install()
	#crawler.configure()
	#crawler.crawl(spider)
	#crawler.start()
	#log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
	d = runner.join()
	d.addBoth(lambda _: reactor.stop())

	reactor.run()
Developer: Andy-wangke, Project: Front_end, Lines: 36, Source: run_spiders.py

Example 2: run

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        }

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(EntertainmentcareersSpider)  # pass the spider class, not an instance

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Developer: Andy-wangke, Project: Front_end, Lines: 29, Source: entertainmentcareers_basic.py

Example 3: crawler_start

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: yb123speed, Project: haipproxy, Lines: 35, Source: scheduler.py

Example 4: crawl_articles

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Developer: wartalker, Project: BlogSpider, Lines: 28, Source: task.py

Example 5: test_start_requests_dupes

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)
Developer: ArturGaspar, Project: scrapy, Lines: 10, Source: test_crawl.py

Example 6: runProcess

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Developer: NemirovD, Project: Interval-Scheduling-Algorithm-with-Applied-Constraints, Lines: 12, Source: spiderworker.py

Example 7: runSpider

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: jerryxing98, Project: knowledge_graph_demo, Lines: 13, Source: run_spider.py

Example 8: test_crawler_runner_bootstrap_failed_for_several

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def test_crawler_runner_bootstrap_failed_for_several(self):
        runner = CrawlerRunner()

        try:
            yield runner.crawl(ExceptionSpider)
        except ValueError:
            pass
        else:
            self.fail('Exception should be raised from spider')

        yield runner.crawl(NoRequestsSpider)

        self.assertEqual(runner.bootstrap_failed, True)
Developer: elacuesta, Project: scrapy, Lines: 15, Source: test_crawler.py

Example 9: test_timeout_failure

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def test_timeout_failure(self):
        crawler = CrawlerRunner({"DOWNLOAD_TIMEOUT": 0.35}).create_crawler(DelaySpider)
        yield crawler.crawl(n=0.5, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
        # server hangs after receiving response headers
        yield crawler.crawl(n=0.5, b=1, mockserver=self.mockserver)
        self.assertTrue(crawler.spider.t1 > 0)
        self.assertTrue(crawler.spider.t2 == 0)
        self.assertTrue(crawler.spider.t2_err > 0)
        self.assertTrue(crawler.spider.t2_err > crawler.spider.t1)
Developer: ArturGaspar, Project: scrapy, Lines: 15, Source: test_crawl.py

Example 10: startprocess

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def startprocess(queue):
	runner = CrawlerRunner(get_project_settings())
	dfs = set()
	
	# in the callback, argument 1 identifies linkspider
	l = runner.crawl('linkspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	dfs.add(l)

	# argument 2 identifies srcspider
	s = runner.crawl('srcspider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	dfs.add(s)
	# argument 3 identifies codespider
	c = runner.crawl('codespider', website='http://caffe.berkeleyvision.org/', domain='berkeleyvision.org').addCallback(test, queue)
	dfs.add(c)
	# the script will block here until all crawling jobs are finished
	defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
	reactor.run()
Developer: ashessqy126, Project: WEBANALYSIS, Lines: 17, Source: test.py

Example 11: test_same_url

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def test_same_url(self):

        class TestSameUrlSpider(Spider):
            name = 'test_same_url'

            def __init__(self, *args, **kwargs):
                super(TestSameUrlSpider, self).__init__(*args, **kwargs)
                self.visited = 0

            def start_requests(s):
                return self.conman.from_spider(s, self.results)

            def parse_first(self, response):
                self.visited += 1
                return TestItem()

            def parse_second(self, response):
                self.visited += 1
                return TestItem()

        with MockServer() as mockserver:
            contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))

            get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
            get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

            crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
            yield crawler.crawl()

        self.assertEqual(crawler.spider.visited, 2)
Developer: ArturGaspar, Project: scrapy, Lines: 32, Source: test_contracts.py

Example 12: run_crawler_by_runner

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: wutongye00703, Project: base_location, Lines: 9, Source: run.py

Example 13: CrawlerRunnerTest

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')

        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
Developer: 0326, Project: scrapy, Lines: 27, Source: test_crawler.py

Example 14: handle_lj

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def handle_lj(self):
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        runner = CrawlerRunner(crawler_setting)
        #d = runner.crawl(HouseSpider)
        d = runner.crawl(LianjiaHouseSpider)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Developer: zinking, Project: housedata, Lines: 9, Source: harvest.py

Example 15: test_crawler_process

# Required import: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import crawl [as alias]
    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEqual(message_count, 1)
Developer: cjzswust, Project: test, Lines: 32, Source: online.py


Note: The scrapy.crawler.CrawlerRunner.crawl method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please follow the corresponding project's license when redistributing or using the code; do not reproduce without permission.