

Python crawler.CrawlerProcess Code Examples

This article collects and summarizes typical usage examples of the scrapy.crawler.CrawlerProcess class in Python. If you are wondering what crawler.CrawlerProcess does, how to use it, or where to find working examples, the curated code samples below may help. You can also explore other usage examples from the scrapy.crawler module.


The 15 crawler.CrawlerProcess code examples below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
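Before diving into the collected examples, here is a minimal, self-contained sketch of the basic CrawlerProcess workflow: define a spider, pass settings to the process, schedule the spider, and start the (blocking) crawl. The spider, its start URL, and the settings are illustrative placeholders and are not taken from any of the projects below.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    # Throwaway spider used only to illustrate the workflow.
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        for text in response.css('div.quote span.text::text').getall():
            yield {'text': text}

process = CrawlerProcess(settings={'USER_AGENT': 'example-bot (+https://example.com)'})
process.crawl(QuotesSpider)  # schedule the spider
process.start()              # start the Twisted reactor; blocks until the crawl finishes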

Example 1: runspider

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name) 
Author: awolfly9, Project: IPProxyTool, Lines: 18, Source: run_spider.py

Example 2: runspider

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % name,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering the spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Author: awolfly9, Project: jd_analysis, Lines: 25, Source: real_time_analysis.py

Example 3: runspider

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % name,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering the spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Author: awolfly9, Project: jd_analysis, Lines: 27, Source: run_spider.py

Example 4: main

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def main():
	"""Main routine for the execution of the Spider"""
	# set up signal to catch items scraped
	def catch_item(sender, item, **kwargs):
		print("Item extracted:", item)
	dispatcher.connect(catch_item, signal=signals.item_passed)
	
	settings = Settings()
	settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
	settings.set("LOG_ENABLED",False)	

	# setup crawler
	from scrapy.crawler import CrawlerProcess

	crawler = CrawlerProcess(settings)

	# define the spider for the crawler
	crawler.crawl(EuropythonSpyder())

	# start scrapy
	print("STARTING ENGINE")
	crawler.start() # start the crawler by calling the defined spider
	print("ENGINE STOPPED") 
Author: PacktPublishing, Project: Learning-Python-Networking-Second-Edition, Lines: 25, Source: EuropythonSpyder.py

Example 5: __init__

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def __init__(self, splash_url, crawler_options):
        self.process = CrawlerProcess({'LOG_ENABLED': True})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': crawler_options['user_agent'],
            'SPLASH_URL': splash_url,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'HTTPERROR_ALLOW_ALL': True,
            'RETRY_TIMES': 2,
            'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
            'DEPTH_LIMIT': crawler_options['depth_limit'],
            'SPLASH_COOKIES_DEBUG': False
            }) 
Author: CIRCL, Project: AIL-framework, Lines: 21, Source: TorSplashCrawler.py

Example 6: __init__

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def __init__(self, accounts, loglevel, remote=False):
        self.accounts = settings.SCRAPY_ACCOUNTS
        if accounts:
            self.accounts.update(accounts)
        self.loglevel = loglevel
        self.settings = self._get_settings()
        # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
        self.settings.set('LOG_LEVEL', loglevel)
        if remote:
            # Configure remote logging and disable the scrapy logging.
            self.settings.set('LOG_ENABLED', False)
            logger = logging.getLogger()
            handler = ScrapySocketHandler(
                'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
            handler.setLevel(loglevel)
            logger.addHandler(handler)

        self.process = CrawlerProcess(self.settings) 
Author: aplanas, Project: kmanga, Lines: 20, Source: scrapyctl.py

Example 7: load_crawler

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: to be able to ignore urls that match this
                                   regex code
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex) 
Author: fhamborg, Project: news-please, Lines: 18, Source: single_crawler.py

Example 8: cleanup

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 90 days will be removed. This value can be
    overridden in the config file.
    """
    settings = ctx.obj["settings"]
    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool("HTTPCACHE_ENABLED"):
        logger.error("Cache is disabled, will not clean up cache dir.")
        return 1

    run_cleanup_cache(settings) 
Author: PyFeeds, Project: PyFeeds, Lines: 19, Source: cli.py

Example 9: fetch_url

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def fetch_url(cls, session, msites, platform_id, purpose):
        """Actual method to do fetch url action.

        Parameters
        ----------
            msites : list
                a list of Site model class, contains info to build spiders.
            platform_id : int
                id of platform, bind fetched url with this id.
            purpose : {'update', 'archive'}
                indicate which url to fetch.
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.UrlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        for ms in msites:
            for sm in build_spiders_iter(ms, purpose):
                sm['kwargs']['session'] = session
                sm['kwargs']['platform_id'] = platform_id
                process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
        process.start() 
Author: IUNetSci, Project: hoaxy-backend, Lines: 26, Source: crawl.py

Example 10: fetch_html

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def fetch_html(cls, session, url_tuples):
        """Actual method to do fetch html action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuple (id, raw, status_code).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.warning('Number of url to fetch html is: %s', len(url_tuples))
        process.crawl(
            HtmlSpider,
            session=session,
            url_tuples=url_tuples,
            excluded_domains=cls.conf['crawl']['excluded_domains'])
        process.start() 
Author: IUNetSci, Project: hoaxy-backend, Lines: 25, Source: crawl.py

Example 11: crawl

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls

    except KeyboardInterrupt:
        process.stop()
        raise 
Author: shenril, Project: Sitadel, Lines: 35, Source: crawler.py

Example 12: collect

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start() 
Author: opentrials, Project: collectors, Lines: 6, Source: collector.py

Example 13: collect

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start() 
Author: opentrials, Project: collectors, Lines: 6, Source: collector.py

Example 14: collect

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start() 
Author: opentrials, Project: collectors, Lines: 6, Source: collector.py

Example 15: collect

# Required module import: from scrapy import crawler [as alias]
# Or: from scrapy.crawler import CrawlerProcess [as alias]
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn,
        http_user=conf['ICTRP_USER'],
        http_pass=conf['ICTRP_PASS'])
    process.start() 
Author: opentrials, Project: collectors, Lines: 8, Source: collector.py

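A pattern worth noting across the examples above: CrawlerProcess starts Twisted's reactor, so process.start() blocks and can only be called once per Python process. To crawl several spiders in a single run, schedule them all before starting, as in this minimal sketch (the spider names are hypothetical and assumed to be registered in the project's SPIDER_MODULES):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Schedule every spider first; the reactor is only started once.
process.crawl('spider_one')  # hypothetical spider name
process.crawl('spider_two')  # hypothetical spider name
process.start()  # blocks until all scheduled crawls have finished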

Note: The scrapy.crawler.CrawlerProcess examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by their respective authors; copyright of the source code remains with the original authors, and distribution or use should follow the license of the corresponding project. Please do not reproduce without permission.