

Python project.get_project_settings method code examples

This article collects typical usage examples of the Python method scrapy.utils.project.get_project_settings. If you are wondering how project.get_project_settings is used in practice, or are looking for concrete examples of it, the selected code samples below may help. You can also explore further usage examples from the module it belongs to, scrapy.utils.project.


Below are 15 code examples of the project.get_project_settings method, sorted by popularity by default.
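Before the curated examples, here is a minimal sketch of the basic pattern (not taken from any of the projects below; the spider name 'myspider' is a placeholder): get_project_settings() loads the settings of the Scrapy project found on the current path, and the resulting Settings object is usually handed to a crawler process or runner.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()   # reads settings.py of the enclosing Scrapy project
process = CrawlerProcess(settings)  # crawler process configured with those settings
process.crawl('myspider')           # placeholder spider name, resolved via SPIDER_MODULES
process.start()                     # blocks until all crawls finish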

Example 1: runspider

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name) 
Author: awolfly9, Project: IPProxyTool, Lines: 18, Source: run_spider.py

Example 2: runspider

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Author: awolfly9, Project: jd_analysis, Lines: 25, Source: real_time_analysis.py

Example 3: runspider

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print("get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES'])
    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('entering spider')
        process.crawl(name, **spargs)
        process.start()
    except Exception as e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e))
Author: awolfly9, Project: jd_analysis, Lines: 27, Source: run_spider.py

Example 4: run

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run() # block until the last call 
Author: matejbasic, Project: PythonScrapyBasicSetup, Lines: 18, Source: run.py

Example 5: __init__

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def __init__(self, *args, **kwargs):
        super(YoutubeHistorySpider, self).__init__(*args, **kwargs)
        settings = get_project_settings()
        hf = settings.get("CHROME_HEADERS_FILE")
        cj = settings.get("COOKIES_JSON")
        if hf:
            ch = ChromeRequest.from_file(hf)
            self.init_cookies = ch.cookies
        elif cj:
            with open(cj, 'r') as fh:
                cookies = parse_cookies(fh.read())
                self.init_cookies = cookies

        if not hasattr(self, "init_cookies"):
            raise ValueError("Need to specify 'CHROME_HEADERS_FILE' "+
                             "or 'COOKIES_JSON' in settings.") 
Author: zvodd, Project: Youtube-Watch-History-Scraper, Lines: 18, Source: youtube_history_spider.py

Example 6: get_follow_requests_and_items

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follows
    :param project_path:
    :param spider_name:
    :param args:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd) 
Author: Gerapy, Project: Gerapy, Lines: 20, Source: parser.py

Example 7: crawler_start

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run() 
Author: SpiderClub, Project: haipproxy, Lines: 40, Source: scheduler.py

Example 8: __init__

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def __init__(self):
		scrapy.spiders.Spider.__init__(self)

		self.global_settings = get_project_settings()
		if self.global_settings['PLATFORM'] in ['win', 'mac']:
			self.driver = webdriver.PhantomJS(executable_path= self.global_settings['PHANTOMJS_PATH'])
		elif self.global_settings['PLATFORM'] in ['linux']:
			self.driver = webdriver.PhantomJS()
		self.driver.set_page_load_timeout(30)
		self.driver.implicitly_wait(10)

		self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
		self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
		self.url_template = self.global_settings['CRAWLER']['url_template'] 
Author: czs0x55aa, Project: video_url_crawler_demo, Lines: 16, Source: aiqiyi_spider.py

Example 9: setUp

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def setUp(self):
        self.settings = get_project_settings()
        self.settings.set('KAFKA_TOPIC_PREFIX', "demo_test")
        # set up redis
        self.redis_conn = redis.Redis(host=self.settings['REDIS_HOST'],
                                      port=self.settings['REDIS_PORT'],
                                      db=self.settings['REDIS_DB'])
        try:
            self.redis_conn.info()
        except ConnectionError:
            print("Could not connect to Redis")
            # plugin is essential to functionality
            sys.exit(1)

        # clear out older test keys if any
        keys = self.redis_conn.keys("test-spider:*")
        for key in keys:
            self.redis_conn.delete(key)

        # set up a kafka consumer to collect potential results
        self.consumer = KafkaConsumer(
            "demo_test.crawled_firehose",
            bootstrap_servers=self.settings['KAFKA_HOSTS'],
            group_id="demo-id",
            auto_commit_interval_ms=10,
            consumer_timeout_ms=5000,
            auto_offset_reset='earliest'
        )
        time.sleep(1) 
Author: istresearch, Project: scrapy-cluster, Lines: 31, Source: online.py

Example 10: runspider

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def runspider(self):
        configure_logging(install_root_handler=False)
        s = get_project_settings()
        runner = CrawlerRunner(settings=s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # schedule the analysis
Author: awolfly9, Project: jd_analysis, Lines: 17, Source: full_analysis.py

Example 11: crawl

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls

    except KeyboardInterrupt:
        process.stop()
        raise 
Author: shenril, Project: Sitadel, Lines: 35, Source: crawler.py

Example 12: import_settings

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def import_settings(self):
        settings = get_project_settings()
        self.password = settings['AUTH_PASSWORD']
        self.http_proxy = settings['HTTP_PROXY']
        self.control_port = settings['CONTROL_PORT']
        self.max_req_per_ip = settings['MAX_REQ_PER_IP']

        self.exit_nodes = settings['EXIT_NODES']
        if self.exit_nodes:
            with Controller.from_port(port=self.control_port) as controller:
                controller.authenticate(self.password)
                controller.set_conf('ExitNodes', self.exit_nodes)
                controller.close() 
Author: matejbasic, Project: PythonScrapyBasicSetup, Lines: 15, Source: proxy.py

Example 13: execute

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode) 
Author: wistbean, Project: learn_python3_spider, Lines: 41, Source: cmdline.py

Example 14: __init__

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def __init__(self, uri, access_key=None, secret_key=None, acl=None):
        # BEGIN Backward compatibility for initialising without keys (and
        # without using from_crawler)
        no_defaults = access_key is None and secret_key is None
        if no_defaults:
            from scrapy.utils.project import get_project_settings
            settings = get_project_settings()
            if 'AWS_ACCESS_KEY_ID' in settings or 'AWS_SECRET_ACCESS_KEY' in settings:
                import warnings
                from scrapy.exceptions import ScrapyDeprecationWarning
                warnings.warn(
                    "Initialising `scrapy.extensions.feedexport.S3FeedStorage` "
                    "without AWS keys is deprecated. Please supply credentials or "
                    "use the `from_crawler()` constructor.",
                    category=ScrapyDeprecationWarning,
                    stacklevel=2
                )
                access_key = settings['AWS_ACCESS_KEY_ID']
                secret_key = settings['AWS_SECRET_ACCESS_KEY']
        # END Backward compatibility
        u = urlparse(uri)
        self.bucketname = u.hostname
        self.access_key = u.username or access_key
        self.secret_key = u.password or secret_key
        self.is_botocore = is_botocore()
        self.keyname = u.path[1:]  # remove first "/"
        self.acl = acl
        if self.is_botocore:
            import botocore.session
            session = botocore.session.get_session()
            self.s3_client = session.create_client(
                's3', aws_access_key_id=self.access_key,
                aws_secret_access_key=self.secret_key)
        else:
            import boto
            self.connect_s3 = boto.connect_s3 
Author: wistbean, Project: learn_python3_spider, Lines: 38, Source: feedexport.py

Example 15: prepare_callback_replay

# Required import: from scrapy.utils import project [as alias]
# Or: from scrapy.utils.project import get_project_settings [as alias]
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings 
Author: scrapinghub, Project: scrapy-autounit, Lines: 35, Source: utils.py


Note: The scrapy.utils.project.get_project_settings examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets are taken from open-source projects contributed by their respective developers; copyright remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.