

Python CrawlerRunner.join Method Code Examples

This article collects typical usage examples of the Python scrapy.crawler.CrawlerRunner.join method. If you are wondering what CrawlerRunner.join does, how to call it, or how it is used in real projects, the curated examples below should help. You can also explore further usage examples of the class it belongs to, scrapy.crawler.CrawlerRunner.


The sections below present 13 code examples of the CrawlerRunner.join method, ordered by popularity.
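All of the examples share the same basic pattern: schedule one or more crawls on a CrawlerRunner, call join() to obtain a Deferred that fires once every scheduled crawl has finished, and stop the Twisted reactor from that Deferred. The following minimal sketch illustrates the pattern; the QuotesSpider class and its start URL are illustrative placeholders and are not taken from the examples below.

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class QuotesSpider(scrapy.Spider):
    """Placeholder spider; any Spider subclass from your project works the same way."""
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}


configure_logging()
runner = CrawlerRunner(get_project_settings())

# crawl() only schedules the spider and returns a Deferred immediately.
runner.crawl(QuotesSpider)

# join() returns a Deferred that fires when all scheduled crawls have finished,
# which makes it the natural place to stop the reactor.
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()  # blocks here until the Deferred above stops the reactor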

Example 1: crawler_start

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    maps = CRAWLER_TASK_MAPS if usage == 'crawler' else TEMP_TASK_MAPS
    origin_spiders = DEFAULT_CRAWLERS if usage == 'crawler' else DEFAULT_VALIDATORS
    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: yb123speed, Project: haipproxy, Lines: 35, Source file: scheduler.py

Example 2: crawl_articles

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def crawl_articles(spids):
    settings = get_project_settings()
    configure_logging(settings, install_root_handler=False)
    logging.getLogger('scrapy').setLevel(logging.WARNING)
    runner = CrawlerRunner(settings)
    loader = runner.spider_loader
    if 'all' in spids:
        spids = loader.list()
    spiders = [
        loader.load(spid)
        for spid in spids
        if spid in loader.list()
    ]
    if not spiders:
        return
    random.shuffle(spiders)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    logger.info('crawl job starting...')
    try:
        reactor.run()
    except Exception:
        logger.exception('crawl job got exception:')
    logger.info('crawl job finished')
Developer: wartalker, Project: BlogSpider, Lines: 28, Source file: task.py

Example 3: run_crawler_by_runner

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def run_crawler_by_runner():
    runner = CrawlerRunner(get_project_settings())
    
    [runner.crawl(spider) for spider in spiders]
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: wutongye00703, Project: base_location, Lines: 9, Source file: run.py

Example 4: run

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def run():
    options = {
        'CONCURRENT_ITEMS': 250,
        #'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
        'CONCURRENT_REQUESTS': 30,
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': False,
        }

    spider = EntertainmentcareersSpider()

    settings = get_project_settings()
    settings.update(options)

    runner = CrawlerRunner(settings)
    runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
    reactor.run()
Developer: Andy-wangke, Project: Front_end, Lines: 29, Source file: entertainmentcareers_basic.py

Example 5: run_spider

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def run_spider():
	options = {
	    'CONCURRENT_ITEMS': 250,
	    'USER_AGENT': 'Googlebot/2.1 (+http://www.google.com/bot.html)',
	    'CONCURRENT_REQUESTS': 30,
	    'DOWNLOAD_DELAY': 0.5,
	    'COOKIES_ENABLED': False,
	    }

	settings = get_project_settings()
	configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
	settings.update(options)

	#BookToscrapeSpider basic version
	from tutorial.spiders.booktoscrape_basic import BookToscrapeSpider
	#runner = CrawlerRunner(settings)
	#runner.crawl(BookToscrapeSpider())

	#BookToscrapeSpider crawl version
	from tutorial.spiders.booktoscrape_crawl import BookToscrapeSpider as BookToscrapeSpider_crawl
	runner = CrawlerRunner(settings)
	runner.crawl(BookToscrapeSpider_crawl())

    #crawler = Crawler(settings)
    #crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    #crawler.install()
    #crawler.configure()
    #crawler.crawl(spider)
    #crawler.start()
    #log.start(logfile="results.log", loglevel=log.DEBUG, crawler=crawler, logstdout=False)
	d = runner.join()
	d.addBoth(lambda _: reactor.stop())

	reactor.run()
Developer: Andy-wangke, Project: Front_end, Lines: 36, Source file: run_spiders.py

Example 6: runProcess

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
    def runProcess(self):
        configure_logging()
        dbHandler.check_watches()
        runner = CrawlerRunner()
        runner.crawl(spider.available_courses_spider)
        dbHandler.check_watches()
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())

        reactor.run()
Developer: NemirovD, Project: Interval-Scheduling-Algorithm-with-Applied-Constraints, Lines: 12, Source file: spiderworker.py

Example 7: runSpider

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def runSpider(host, spider):
    spiders = spider.split(',')
    changeSettings(host)
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    for i in spiders:
        runner.crawl(SPIDER_MATCHER[i.lower()])

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Developer: jerryxing98, Project: knowledge_graph_demo, Lines: 13, Source file: run_spider.py

Example 8: main

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
def main():
    configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})
    settings = get_project_settings()
    runner = CrawlerRunner(settings)
    

    # settings.set('FEED_FORMAT','json')
    # settings.set('FEED_URI', 'result.json')

    runner.crawl(PttBoard)
    runner.crawl(PTTArticle)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    result = reactor.run()  # the script will block here until the crawling is finished

    print(result)  # note: reactor.run() returns None, so this simply prints None
Developer: steny138, Project: PTTCrawler, Lines: 18, Source file: main.py

Example 9: Runner

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
class Runner(object):
    def __init__(self,*args,**kwargs): 
        configure_logging()
        self.settings = get_project_settings()
        self.runner = CrawlerRunner(self.settings) 

    def add(self,*a,**kw):  
        crawler = Crawler(BroadSpider,self.settings) 
        self.runner.crawl(crawler,*a,**kw)

    def start(self): 
        d = self.runner.join()
        d.addBoth(lambda _: reactor.stop()) 
        reactor.run()

    def stop(self):
        self.runner.stop()
        reactor.stop()
Developer: speedy0526, Project: python, Lines: 20, Source file: run.py

Example 10: CrawlTestCase

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]

#......... some of the code is omitted here .........
        self._assert_retried(l)

    def _assert_retried(self, log):
        self.assertEqual(str(log).count("Retrying"), 2)
        self.assertEqual(str(log).count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=req0, mockserver=self.mockserver)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', crawler.spider.meta)
        self.assertNotIn('failures', crawler.spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(crawler.engine))

        crawler = self.runner.create_crawler(SingleRequestSpider)
        yield crawler.crawl(seed=self.mockserver.url('/'), callback_func=cb, mockserver=self.mockserver)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], crawler.spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)

    @defer.inlineCallbacks
    def test_graceful_crawl_error_handling(self):
        """
        Test whether errors happening anywhere in Crawler.crawl() are properly
        reported (and not somehow swallowed) after a graceful engine shutdown.
        The errors should not come from within Scrapy's core but from within
        spiders/middlewares/etc., e.g. raised in Spider.start_requests(),
        SpiderMiddleware.process_start_requests(), etc.
        """

        class TestError(Exception):
            pass

        class FaultySpider(SimpleSpider):
            def start_requests(self):
                raise TestError

        crawler = self.runner.create_crawler(FaultySpider)
        yield self.assertFailure(crawler.crawl(mockserver=self.mockserver), TestError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_open_spider_error_on_faulty_pipeline(self):
        settings = {
            "ITEM_PIPELINES": {
                "tests.pipelines.ZeroDivisionErrorPipeline": 300,
            }
        }
        crawler = CrawlerRunner(settings).create_crawler(SimpleSpider)
        yield self.assertFailure(
            self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver),
            ZeroDivisionError)
        self.assertFalse(crawler.crawling)

    @defer.inlineCallbacks
    def test_crawlerrunner_accepts_crawler(self):
        crawler = self.runner.create_crawler(SimpleSpider)
        with LogCapture() as log:
            yield self.runner.crawl(crawler, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.assertIn("Got response 200", str(log))

    @defer.inlineCallbacks
    def test_crawl_multiple(self):
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=200"), mockserver=self.mockserver)
        self.runner.crawl(SimpleSpider, self.mockserver.url("/status?n=503"), mockserver=self.mockserver)

        with LogCapture() as log:
            yield self.runner.join()

        self._assert_retried(log)
        self.assertIn("Got response 200", str(log))
Developer: ArturGaspar, Project: scrapy, Lines: 104, Source file: test_crawl.py

Example 11: get_project_settings

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
from time import sleep

import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

from PTTRank.spiders.ptt import PttSpider

settings = get_project_settings()
runner = CrawlerRunner(settings)
runner.crawl(PttSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
Developer: gcaaa31928, Project: PTTRank, Lines: 18, Source file: process.py

Example 12: callSpiderWithCrawlerRunner

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
 def callSpiderWithCrawlerRunner(self):
     configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'}) 
     runner = CrawlerRunner(get_project_settings())
     runner.crawl(self.spider)
     dispatcher.connect(self.spider_closing, signal=signals.spider_closed)
     runner.join()
Developer: tunguyenlam, Project: BuyingAssistant, Lines: 8, Source file: jobtracker.py

Example 13: TmallSpider

# Required imports: from scrapy.crawler import CrawlerRunner [as alias]
# Or: from scrapy.crawler.CrawlerRunner import join [as alias]
# -*- coding: utf-8 -*-

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner

from tmall import TmallSpider

spider = TmallSpider(domain='tmall.com')
crawler = CrawlerRunner()
crawler.crawl(spider)
d = crawler.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()
Developer: 0x24bin, Project: gooseeker, Lines: 16, Source file: runcrawl.py


Note: The scrapy.crawler.CrawlerRunner.join method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective developers, and copyright remains with the original authors. Please consult the corresponding project's License before distributing or using the code; do not reproduce without permission.