

Python crawler.CrawlerRunner Code Examples

This article collects typical usage examples of the scrapy.crawler.CrawlerRunner class in Python. If you are wondering what crawler.CrawlerRunner is for, how to use it, or what real code that calls it looks like, the curated examples below may help. You can also explore further usage examples from scrapy.crawler, the module it belongs to.


Below are 11 code examples of crawler.CrawlerRunner, sorted by popularity by default.
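
Before diving into the project-specific examples, here is a minimal, self-contained sketch of the typical CrawlerRunner pattern (written for this article, not taken from any of the projects below): build a runner from the project settings, schedule one or more spiders, and stop the Twisted reactor once every crawl has finished.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

def run_spiders(*spider_classes):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    for spider_cls in spider_classes:
        runner.crawl(spider_cls)  # schedule a crawl; returns a Deferred
    # join() fires once all scheduled crawls have finished; then stop the reactor
    runner.join().addBoth(lambda _: reactor.stop())
    reactor.run()  # blocks here until reactor.stop() is called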

Example 1: run

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block here until the last crawl has finished
Developer: matejbasic | Project: PythonScrapyBasicSetup | Lines: 18 | Source: run.py

Example 2: crawl_runner

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
                '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings) 
Developer: TeamHG-Memex | Project: autologin | Lines: 21 | Source: spiders.py

Example 3: crawler_start

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run() 
Developer: SpiderClub | Project: haipproxy | Lines: 40 | Source: scheduler.py

Example 4: test_crawler_process

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:dmoztools.net:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEquals(message_count, 1) 
Developer: istresearch | Project: scrapy-cluster | Lines: 32 | Source: online.py

Example 5: runspider

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def runspider(self):
        configure_logging(install_root_handler = False)
        s = get_project_settings()
        runner = CrawlerRunner(settings = s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # schedule the analysis
Developer: awolfly9 | Project: jd_analysis | Lines: 17 | Source: full_analysis.py

Example 6: get_crawler

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider) 
Developer: wistbean | Project: learn_python3_spider | Lines: 12 | Source: test.py

Example 7: return_spider_output

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def return_spider_output(output):
    """
    Turn Scrapy output into plain dictionaries.
    :param output: items scraped by CrawlerRunner
    :type output: list

    :return: list of item dicts, ready for JSON serialization
    """
    # this just turns items into dictionaries
    return [dict(item) for item in output]
Developer: eddiepease | Project: company2vec | Lines: 14 | Source: pipelines.py
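
A helper like return_spider_output only converts items that have already been collected; the collection step itself is not shown above. The following sketch (not part of the company2vec project; crawl_and_collect and its wiring are illustrative assumptions) shows one common way to gather items from a CrawlerRunner crawl via the item_scraped signal and then hand them to such a helper.

from scrapy import signals

def crawl_and_collect(runner, spider_cls, **spider_kwargs):
    items = []
    crawler = runner.create_crawler(spider_cls)
    # append every scraped item to the local list as it is produced
    crawler.signals.connect(
        lambda item, response, spider: items.append(item),
        signal=signals.item_scraped)
    d = runner.crawl(crawler, **spider_kwargs)
    # once the crawl Deferred fires, convert the collected items to plain dicts
    d.addCallback(lambda _: return_spider_output(items))
    return d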

Example 8: __init__

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def __init__(self, settings, spider, args):
        """
        init parser
        :param settings:
        :param spider:
        :param args:
        """
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider) 
Developer: Gerapy | Project: Gerapy | Lines: 14 | Source: parser.py

Example 9: get_start_requests

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return:
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd) 
Developer: Gerapy | Project: Gerapy | Lines: 32 | Source: parser.py

Example 10: init_crawler_runner

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)


# TODO: move these to config file? 
Developer: Karmenzind | Project: fp-server | Lines: 12 | Source: crawler.py
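
Because crochet.setup() starts the Twisted reactor in a background thread, the global CRAWLER_RUNNER initialized above can afterwards be driven from ordinary synchronous code. A small follow-up sketch (not part of fp-server; schedule_crawl and the timeout value are assumptions):

import crochet

@crochet.run_in_reactor
def schedule_crawl(spider_cls, **kwargs):
    # runs inside the reactor thread; crochet wraps the returned Deferred
    # in an EventualResult that the calling thread can wait on
    return CRAWLER_RUNNER.crawl(spider_cls, **kwargs)

# e.g. schedule_crawl(SomeSpider).wait(timeout=300)  # SomeSpider is hypothetical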

Example 11: run_spider2

# Required module import: from scrapy import crawler [as alias]
# Alternatively: from scrapy.crawler import CrawlerRunner [as alias]
def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())

    reactor.run()  # the script will block here until all crawling jobs are finished 
Developer: openslack | Project: openslack-crawler | Lines: 12 | Source: run.py


Note: The scrapy.crawler.CrawlerRunner examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers, and copyright of the source code remains with the original authors. When distributing or using the code, please follow the license of the corresponding project; do not repost without permission.