

Python crawler.CrawlerRunner Code Examples

This article collects typical usage examples of crawler.CrawlerRunner from Python's scrapy.crawler module. If you are wondering what CrawlerRunner is for, or how it is used in practice, the curated code examples below may help. You can also explore other usage examples from the scrapy.crawler module.


The following presents 11 code examples of crawler.CrawlerRunner, ordered by popularity.
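
Before the individual examples, here is a minimal, self-contained sketch of the typical CrawlerRunner pattern that most of the snippets below follow. The spider class, its name, and the URL are placeholders for illustration:

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class MySpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}


configure_logging()
runner = CrawlerRunner()                 # a Settings object or dict may also be passed in
d = runner.crawl(MySpider)               # returns a Deferred that fires when the crawl ends
d.addBoth(lambda _: reactor.stop())      # stop the reactor whether the crawl succeeds or fails
reactor.run()                            # blocks until reactor.stop() is called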

Example 1: run

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # blocks here until the last crawl finishes
Author: matejbasic, Project: PythonScrapyBasicSetup, Lines: 18, Source: run.py

Example 2: crawl_runner

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
                '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings) 
Author: TeamHG-Memex, Project: autologin, Lines: 21, Source: spiders.py
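
A hedged usage sketch for crawl_runner above; the Splash URL and the MySpider placeholder are illustrative assumptions, not part of the original project:

from twisted.internet import reactor

# assumes a local Splash instance, so the splash-aware settings branch is taken
runner = crawl_runner({'SPLASH_URL': 'http://127.0.0.1:8050'})
d = runner.crawl(MySpider)               # MySpider: any spider class
d.addBoth(lambda _: reactor.stop())
reactor.run()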

Example 3: crawler_start

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run() 
Author: SpiderClub, Project: haipproxy, Lines: 40, Source: scheduler.py

Example 4: test_crawler_process

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:dmoztools.net:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider; give it 20 seconds to see the URL, crawl it,
        # and send the result to Kafka, then stop the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEqual(message_count, 1)
Author: istresearch, Project: scrapy-cluster, Lines: 32, Source: online.py

Example 5: runspider

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def runspider(self):
        configure_logging(install_root_handler=False)
        s = get_project_settings()
        runner = CrawlerRunner(settings=s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # schedule analysis
Author: awolfly9, Project: jd_analysis, Lines: 17, Source: full_analysis.py

Example 6: get_crawler

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider) 
Author: wistbean, Project: learn_python3_spider, Lines: 12, Source: test.py
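
A quick usage sketch for get_crawler; the DOWNLOAD_DELAY value is an illustrative assumption:

crawler = get_crawler(settings_dict={'DOWNLOAD_DELAY': 1.0})
assert crawler.settings.getfloat('DOWNLOAD_DELAY') == 1.0   # settings are populated at creation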

Example 7: return_spider_output

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def return_spider_output(output):

    """
    Turns scrapy output into dictionaries
    :param output: items scraped by CrawlerRunner
    :type output: list

    :return: list of items converted to plain dictionaries
    """

    # this just turns items into dictionaries
    return [dict(item) for item in output] 
Author: eddiepease, Project: company2vec, Lines: 14, Source: pipelines.py
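
The helper above only converts items that were already collected. A hedged sketch of one way to feed it, assuming items are gathered through the item_scraped signal and reusing the MySpider placeholder from the opening sketch:

from twisted.internet import reactor
from scrapy import signals
from scrapy.crawler import CrawlerRunner

items = []

def collect_item(item, response, spider):
    items.append(item)

runner = CrawlerRunner()
crawler = runner.create_crawler(MySpider)
crawler.signals.connect(collect_item, signal=signals.item_scraped)
d = runner.crawl(crawler)
d.addBoth(lambda _: reactor.stop())
reactor.run()

results = return_spider_output(items)   # list of plain dictionaries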

Example 8: __init__

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def __init__(self, settings, spider, args):
        """
        init parser
        :param settings:
        :param spider:
        :param args:
        """
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider) 
Author: Gerapy, Project: Gerapy, Lines: 14, Source: parser.py

Example 9: get_start_requests

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def get_start_requests(project_path, spider_name):
    """
    Get the start requests of a spider inside a Scrapy project.
    :param project_path: project path
    :param spider_name: spider name
    :return: dict with a 'finished' flag and the list of processed start requests
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd) 
Author: Gerapy, Project: Gerapy, Lines: 32, Source: parser.py

Example 10: init_crawler_runner

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)


# TODO: move these to config file? 
Author: Karmenzind, Project: fp-server, Lines: 12, Source: crawler.py
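
Because crochet.setup() runs the Twisted reactor in a background thread, the global CRAWLER_RUNNER can then be driven from ordinary blocking code. A hedged sketch of that pattern; the timeout and the calling convention are assumptions, not part of the original project:

import crochet

@crochet.wait_for(timeout=300)
def crawl_blocking(spider_cls, **kwargs):
    # CrawlerRunner.crawl() returns a Deferred; crochet blocks the caller until it fires
    return CRAWLER_RUNNER.crawl(spider_cls, **kwargs)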

Example 11: run_spider2

# Required module: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerRunner [as alias]
def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())

    reactor.run()  # the script will block here until all crawling jobs are finished 
Author: openslack, Project: openslack-crawler, Lines: 12, Source: run.py


Note: The scrapy.crawler.CrawlerRunner examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution or use should follow the corresponding project's license. Do not republish without permission.