Python crawler.CrawlerRunner Class Code Examples

This article collects typical usage examples of the scrapy.crawler.CrawlerRunner class in Python. If you are wondering how exactly to use the CrawlerRunner class, how to call it, or what real-world usage looks like, the hand-picked class code examples below may help.


A total of 15 code examples of the CrawlerRunner class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
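
Most of the examples below follow the same basic pattern: create a CrawlerRunner, schedule one or more crawls, and drive them with Twisted's reactor until they finish. The following minimal sketch shows that shared pattern; MySpider and the myproject import path are hypothetical placeholders used only for illustration and are not taken from the examples themselves.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from myproject.spiders.example import MySpider  # hypothetical spider class, for illustration only

configure_logging()
runner = CrawlerRunner(get_project_settings())

d = runner.crawl(MySpider)            # crawl() returns a Deferred that fires when the spider finishes
d.addBoth(lambda _: reactor.stop())   # stop the reactor on success or failure
reactor.run()                         # blocks here until the crawl is done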

Example 1: run_crawler_by_runner

def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())

    # `spiders` is an iterable of spider classes defined elsewhere in the original run.py
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer ID: wutongye00703, Project: base_location, Lines of code: 7, Source file: run.py

Example 2: CrawlerRunnerTest

class CrawlerRunnerTest(unittest.TestCase):

    def setUp(self):
        self.crawler_runner = CrawlerRunner(Settings())

    def tearDown(self):
        return self.crawler_runner.stop()

    @defer.inlineCallbacks
    def test_populate_spidercls_settings(self):
        spider_settings = {'TEST1': 'spider', 'TEST2': 'spider'}
        project_settings = {'TEST1': 'project', 'TEST3': 'project'}

        class CustomSettingsSpider(DefaultSpider):
            custom_settings = spider_settings

        self.crawler_runner.settings.setdict(project_settings,
                                             priority='project')

        d = self.crawler_runner.crawl(CustomSettingsSpider)
        crawler = list(self.crawler_runner.crawlers)[0]
        yield d
        self.assertEqual(crawler.settings.get('TEST1'), 'spider')
        self.assertEqual(crawler.settings.get('TEST2'), 'spider')
        self.assertEqual(crawler.settings.get('TEST3'), 'project')
Developer ID: 0326, Project: scrapy, Lines of code: 25, Source file: test_crawler.py

Example 3: handle_lj

    def handle_lj(self):
        configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
        runner = CrawlerRunner(crawler_setting)
        #d = runner.crawl(HouseSpider)
        d = runner.crawl(LianjiaHouseSpider)
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
Developer ID: zinking, Project: housedata, Lines of code: 7, Source file: harvest.py

Example 4: run

def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        }

    spider = EntertainmentcareersSpider()

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Developer ID: Andy-wangke, Project: Front_end, Lines of code: 27, Source file: entertainmentcareers_basic.py

Example 5: crawl_articles

def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Developer ID: wartalker, Project: BlogSpider, Lines of code: 26, Source file: task.py

Example 6: test_crawler_process

    def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:istresearch.com:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEqual(message_count, 1)
Developer ID: cjzswust, Project: test, Lines of code: 30, Source file: online.py

Example 7: crawler_start

def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer ID: yb123speed, Project: haipproxy, Lines of code: 33, Source file: scheduler.py

Example 8: run_spider

def run_spider():
    options = {
        'CONCURRENT_ITEMS': 250,
        'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
    }

    settings = get_project_settings()
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings.update(options)

    # BookToscrapeSpider basic version
    from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
    #runner = CrawlerRunner(settings)
    #runner.crawl(BookToscrapeSpider())

    # BookToscrapeSpider crawl version
    from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
    runner = CrawlerRunner(settings)
    runner.crawl(BookToscrapeSpider_crawl())

    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())

    reactor.run()
Developer ID: Andy-wangke, Project: Front_end, Lines of code: 34, Source file: run_spiders.py

Example 9: test_same_url

    @defer.inlineCallbacks
    def test_same_url(self):

        class TestSameUrlSpider(Spider):
            name = 'test_same_url'

            def __init__(self, *args, **kwargs):
                super(TestSameUrlSpider, self).__init__(*args, **kwargs)
                self.visited = 0

            def start_requests(s):
                return self.conman.from_spider(s, self.results)

            def parse_first(self, response):
                self.visited += 1
                return TestItem()

            def parse_second(self, response):
                self.visited += 1
                return TestItem()

        with MockServer() as mockserver:
            contract_doc = '@url {}'.format(mockserver.url('/status?n=200'))

            get_unbound_function(TestSameUrlSpider.parse_first).__doc__ = contract_doc
            get_unbound_function(TestSameUrlSpider.parse_second).__doc__ = contract_doc

            crawler = CrawlerRunner().create_crawler(TestSameUrlSpider)
            yield crawler.crawl()

        self.assertEqual(crawler.spider.visited, 2)
Developer ID: ArturGaspar, Project: scrapy, Lines of code: 30, Source file: test_contracts.py

Example 10: test_start_requests_dupes

    @defer.inlineCallbacks
    def test_start_requests_dupes(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        crawler = CrawlerRunner(settings).create_crawler(DuplicateStartRequestsSpider)
        yield crawler.crawl(dont_filter=True, distinct_urls=2, dupe_factor=3, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 6)

        yield crawler.crawl(dont_filter=False, distinct_urls=3, dupe_factor=4, mockserver=self.mockserver)
        self.assertEqual(crawler.spider.visited, 3)
Developer ID: ArturGaspar, Project: scrapy, Lines of code: 8, Source file: test_crawl.py

Example 11: crawl

    def crawl(self):
        spider = Scrapy_ModuleSpider()
        runner = CrawlerRunner(self.Scrapy_Module_setting)
        cra = runner.crawl(spider)
        # stop reactor when spider closes
        cra.addBoth(lambda _: self.spider_closing(cra))
        self.logger.info("Run reactor")
        reactor.run()
Developer ID: Higor777, Project: Search-Engine-System-For-NPU, Lines of code: 8, Source file: Scrapy_Module.py

Example 12: runSpider

    def runSpider(self, spider):
        configure_logging({'LOG_FORMAT': '%(asctime)s [%(name)s] %(levelname)s: %(message)s'})
        settings = Settings()
        settings.set('FEED_URI', 'output.json')
        settings.set('FEED_FORMAT', 'json')

        runner = CrawlerRunner(settings)
        dfd = runner.crawl(spider)
        dfd.addBoth(lambda _: reactor.stop())
Developer ID: Preetwinder, Project: ScrapyStreaming, Lines of code: 9, Source file: scrapystreaming.py

Example 13: runProcess

    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Developer ID: NemirovD, Project: Interval-Scheduling-Algorithm-with-Applied-Constraints, Lines of code: 10, Source file: spiderworker.py

Example 14: run_login_spider

def run_login_spider(seed_url, username, password, db_name, logfile = "results.log"):

    init_db(db_name)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    d = runner.crawl(LoginFinderSpider, seed_url = seed_url, username = username, password = password)
    d.addBoth(lambda _: reactor.stop())
    log.start(loglevel=log.DEBUG, logfile=logfile)
    log.msg("Item pipelines enabled: %s" % str(settings.get("ITEM_PIPELINES")), level = log.INFO)
    reactor.run()
Developer ID: CuriousG102, Project: autologin, Lines of code: 10, Source file: autologin.py

Example 15: test_crawler_runner_loading

    def test_crawler_runner_loading(self):
        module = 'tests.test_spiderloader.test_spiders.spider1'
        runner = CrawlerRunner({'SPIDER_MODULES': [module]})

        self.assertRaisesRegexp(KeyError, 'Spider not found',
                                runner.create_crawler, 'spider2')

        crawler = runner.create_crawler('spider1')
        self.assertTrue(issubclass(crawler.spidercls, scrapy.Spider))
        self.assertEqual(crawler.spidercls.name, 'spider1')
Developer ID: 01-, Project: scrapy, Lines of code: 10, Source file: __init__.py


Note: The scrapy.crawler.CrawlerRunner class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and you should refer to the corresponding project's license before distributing or using it. Do not reproduce without permission.