

Python crawler.Crawler Code Examples

This article collects typical usage examples of the Crawler class from scrapy.crawler in Python. If you are unsure what crawler.Crawler is for, how to use it, or what working examples look like, the selected code examples below should help. You can also explore further usage examples from the containing module, scrapy.crawler.


The following shows 12 code examples of crawler.Crawler, sorted by popularity by default.
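Before the examples, here is a minimal sketch of the pattern most of them follow: wrap a spider class and its settings in a Crawler, then hand that Crawler to a CrawlerProcess (or CrawlerRunner) to run. The QuotesSpider class and the settings values are placeholders for illustration, not taken from any of the projects below.

import scrapy
from scrapy.crawler import Crawler, CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        # extract each quote's text
        for text in response.css("div.quote span.text::text").getall():
            yield {"text": text}

settings = {"LOG_ENABLED": True, "ROBOTSTXT_OBEY": True}
crawler = Crawler(QuotesSpider, settings)   # bind the spider class to its settings
process = CrawlerProcess(settings)
process.crawl(crawler)                      # crawl() accepts a Crawler or a spider class
process.start()                             # starts the reactor and blocks until done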

Example 1: __init__

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def __init__(self, splash_url, crawler_options):
        self.process = CrawlerProcess({'LOG_ENABLED': True})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
            'DEPTH_LIMIT': crawler_options['depth_limit'],
            'SPLASH_COOKIES_DEBUG': False
            }) 
Developer: CIRCL, Project: AIL-framework, Lines: 21, Source: TorSplashCrawler.py
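The excerpt stops at the constructor, so it does not show how the prepared Crawler is actually launched. A hedged sketch of a launcher method follows; the method name crawl and its url parameter are hypothetical, but process.crawl() and process.start() are the standard CrawlerProcess calls for running a pre-built Crawler.

    def crawl(self, url):
        # hypothetical launcher: pass the prepared Crawler (and any spider
        # arguments) to the CrawlerProcess, then block until the crawl finishes
        self.process.crawl(self.crawler, url=url)
        self.process.start()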

Example 2: build_crawler

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def build_crawler(self, spider):
        """
        do some specific settings for spider
        and return the wrapped crawler

        :param spider: spider class
        :return: crawler
        """
        # TODO: specify settings
        settings = crawler_runner.settings

        # FIXME !!!
        # conf = {}
        # log_file = crawler_runner.settings.get('LOG_FILE')
        # if log_file:
        #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
        #     conf['LOG_FILE'] = None
        #     conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
        #                           ' [spider-{spider}]'
        #                           ' %(message)s'
        #                           ).format(spider=spider.name)
        #     settings = updated_crawler_settings(settings, conf)
        # configure_logging(settings)

        return Crawler(spider, settings) 
Developer: Karmenzind, Project: fp-server, Lines: 27, Source: spider.py

Example 3: start_job

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def start_job(self, job=None, callback_fn=None):
        print(job)
        spider_job = job['spider_job']
        runner = job['runner']
        spider_cls = spider_job['spider_cls']
        spider_settings = spider_job['spider_settings']
        spider_kwargs = spider_job['spider_kwargs']

        def engine_stopped_callback():
            runner.transform_and_index(callback_fn=callback_fn)

        if callback_fn:
            print("""
==========================================================
WARNING: callback_fn is {}
==========================================================
Since start_job is called with callback_fn, make sure you end the reactor if you want the spider process to
stop after the callback function is executed. By default callback_fn=None will close the reactor.

To write a custom callback_fn

def callback_fn():
    print ("Write your own callback logic")
    from twisted.internet import reactor
    reactor.stop()
==========================================================
        """.format(callback_fn))

        spider = Crawler(spider_cls, Settings(spider_settings))
        spider.signals.connect(engine_stopped_callback, signals.engine_stopped)
        self.runner.crawl(spider, **spider_kwargs)
        """
        d = runner.crawl(spider, **spider_kwargs)
        # d.addBoth(engine_stopped_callback)
        """
        reactor.run() 
Developer: invanalabs, Project: invana-bot, Lines: 38, Source: base.py

Example 4: get_crawler

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler 
Developer: scrapinghub, Project: spidermon, Lines: 14, Source: conftest.py
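A hedged sketch of how a test could consume this fixture; the test name and the overridden setting are made up for illustration, but the factory call matches the fixture's signature.

def test_spidermon_is_enabled(get_crawler):
    # call the factory, optionally overriding settings
    crawler = get_crawler({"LOG_LEVEL": "DEBUG"})
    assert crawler.settings.getbool("SPIDERMON_ENABLED")
    assert crawler.settings.get("LOG_LEVEL") == "DEBUG"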

Example 5: make_data

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data 
Developer: scrapinghub, Project: spidermon, Lines: 15, Source: test_monitors.py

Example 6: run

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def run(self, args: List[str], opts: optparse.Values) -> None:
        crawlers = []
        real_create_crawler = self.crawler_process.create_crawler

        def create_crawler(crawler_or_spidercls: Union[Crawler, str]) -> Crawler:
            crawler = real_create_crawler(crawler_or_spidercls)
            crawlers.append(crawler)
            return crawler

        self.crawler_process.create_crawler = create_crawler
        super().run(args, opts)
        if any(crawler.stats.get_value("log_count/ERROR") for crawler in crawlers):
            self.exitcode = 1 
Developer: zulip, Project: zulip, Lines: 15, Source: crawl_with_status.py

Example 7: make_crawler

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def make_crawler(spider_cls, settings):
    if not getattr(spider_cls, 'name', None):
        class Spider(spider_cls):
            name = 'test_spider'
        Spider.__name__ = spider_cls.__name__
        Spider.__module__ = spider_cls.__module__
        spider_cls = Spider
    return Crawler(spider_cls, settings) 
Developer: scrapinghub, Project: scrapy-poet, Lines: 10, Source: utils.py
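A plausible way to use this helper in a test, assuming a spider class that lacks a name attribute; MySpider and the settings dict are placeholders:

import scrapy

class MySpider(scrapy.Spider):
    # deliberately no `name` attribute, so the helper supplies 'test_spider'
    start_urls = ["https://example.com"]

crawler = make_crawler(MySpider, {"LOG_ENABLED": False})
assert crawler.spidercls.name == "test_spider"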

Example 8: prepare_callback_replay

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings 
Developer: scrapinghub, Project: scrapy-autounit, Lines: 35, Source: utils.py

Example 9: run_spider_instance

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the TestSpider
and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    crawler.crawl(spider)
    crawler.start()
    reactor.run() 
Developer: tryolabs, Project: daywatch, Lines: 17, Source: spiders.py
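This and the remaining examples target the pre-1.0 Scrapy API: a Crawler built from settings alone, plus crawler.configure(), crawler.crawl() and crawler.start(), all of which were removed in Scrapy 1.x. A rough modern equivalent, assuming the spider still accepts site_id and main_url as keyword arguments, could look like this:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider_instance(spider_class, site_id, main_url):
    # modern API: CrawlerProcess builds the Crawler and drives the reactor itself
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_class, site_id=site_id, main_url=main_url)
    process.start()  # blocks until the spider closes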

Example 10: __init__

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False 
Developer: yegong, Project: stock, Lines: 12, Source: spider.py

Example 11: list_spiders

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list() 
Developer: legco-watch, Project: legco-watch, Lines: 6, Source: utils.py
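crawler.spiders was removed along with the old single-argument Crawler constructor; in current Scrapy, spider discovery lives in SpiderLoader. A hedged modern rewrite of the same helper:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

def list_spiders():
    # SpiderLoader reads SPIDER_MODULES from the project settings
    settings = get_project_settings()
    return SpiderLoader.from_settings(settings).list()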

Example 12: do_scrape

# Required import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import Crawler [as alias]
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response 
Developer: legco-watch, Project: legco-watch, Lines: 56, Source: tasks.py


Note: The scrapy.crawler.Crawler examples in this article were compiled by 純淨天空 from GitHub, MSDocs and other open-source code and documentation platforms. The code snippets are excerpted from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's License before redistributing or reusing the code. Do not reproduce this article without permission.