

Python pyspark.SparkConf Code Examples

This article collects typical usage examples of the pyspark.SparkConf class in Python. If you are wondering how to use pyspark.SparkConf, what it is for, or what real-world code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the pyspark package.


The following sections show 15 code examples of pyspark.SparkConf drawn from open-source projects, sorted by popularity by default.
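
Before the project-specific examples, here is a minimal self-contained sketch of the usual SparkConf workflow: build a configuration, pass it to a SparkContext, and stop the context when done. The application name, master URL, and memory setting below are placeholders chosen for illustration, not values taken from the projects listed in this article.

from pyspark import SparkConf, SparkContext

# Build the configuration; keys and values are plain strings.
conf = (
    SparkConf()
    .setAppName("sparkconf-demo")        # placeholder application name
    .setMaster("local[*]")               # run locally on all available cores
    .set("spark.executor.memory", "1g")  # example setting, adjust as needed
)

# Hand the configuration to a SparkContext (reuses an active context if one exists).
sc = SparkContext.getOrCreate(conf=conf)

# A trivial job to confirm the context is usable.
print(sc.parallelize(range(10)).sum())

sc.stop()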

Example 1: create_spark_context

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_spark_context(app_name="Quiz Bowl", configs=None) -> SparkContext:
    if QB_SPARK_MASTER != "":
        log.info("Spark master is %s" % QB_SPARK_MASTER)
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)\
            .setMaster(QB_SPARK_MASTER)
    else:
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)
    if configs is not None:
        for key, value in configs:
            if key in ('spark.executor.cores', 'spark.max.cores'):
                if value > QB_MAX_CORES:
                    log.info('Requested {r_cores} cores when the machine only has {n_cores} cores, reducing number of '
                             'cores to {n_cores}'.format(r_cores=value, n_cores=QB_MAX_CORES))
                    value = QB_MAX_CORES
            spark_conf = spark_conf.set(key, value)
    return SparkContext.getOrCreate(spark_conf) 
Developer: Pinafore, Project: qb, Lines: 22, Source: spark.py

Example 2: run

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def run():
    from pyspark import SparkContext, SparkConf

    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(
        conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args) 
Developer: dispel4py, Project: dispel4py, Lines: 24, Source: spark_process.py

Example 3: setUpClass

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def setUpClass(cls):
        #load sample warc files
        fh           = open('tests/sample_wat.paths')
        cls.watPaths = fh.readlines()

        #initialize class
        cls.cclinks = CCLinks('CC-MAIN-2018-13', 5)
        cls.cclinks.output = 'tests/output/{}/parquet'.format(cls.cclinks.crawlIndex)

        #remove output directory
        if os.path.exists(cls.cclinks.output):
            shutil.rmtree('tests/output')

        #init pyspark
        conf   = pyspark.SparkConf().setMaster('local[*]').setAppName('Test_ExtractCCLinks')
        cls.sc = pyspark.SparkContext.getOrCreate(conf=conf) 
Developer: creativecommons, Project: cccatalog, Lines: 18, Source: test_ExtractCCLinks.py

Example 4: create_sc

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # Stop any SparkContext that is already running so the new configuration
    # takes effect, then create a fresh context from sc_conf.
    active = SparkContext._active_spark_context
    if active is not None:
        active.stop()
    sc = SparkContext(conf=sc_conf)

    return sc 
Developer: litaotao, Project: Spark-in-Finance-Quantitative-Investing, Lines: 20, Source: finance_similarity.py

Example 5: sparkSession

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def sparkSession(cls):
        if not hasattr(cls, "spark"):
            # We can't use the SparkSession Builder here, since we need to call
            # Scala side's SmvTestHive.createContext to create the HiveTestContext's
            # SparkSession.
            # So we need to
            #   * Create a java_gateway
            #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
            #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
            #   * Create Scala side HiveTestContext SparkSession
            #   * Create python SparkSession
            jgw = launch_gateway(None)
            jvm = jgw.jvm
            import tempfile
            import getpass
            hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
            sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
                                              .set("spark.sql.hive.metastore.barrierPrefixes",
                                                   "org.apache.spark.sql.hive.execution.PairSerDe")\
                                              .set("spark.sql.warehouse.dir", hivedir)\
                                              .set("spark.ui.enabled", "false")
            sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
            jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
            cls.spark = SparkSession(sc, jss.sparkSession())
        return cls.spark 
Developer: TresAmigosSD, Project: SMV, Lines: 27, Source: testconfig.py

Example 6: create_spark_conf

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_spark_conf(**kwargs):
    """
    Configure the Spark master node

    :param kwargs:
    :return:
    """
    spark_executor_memory = kwargs.get("spark_executor_memory", "2g")
    spark_driver_memory = kwargs.get("spark_driver_memory", "2g")
    url = kwargs.get("url", SPARK_ADDR)
    app  = kwargs.get("app", 'pyFTS')

    conf = SparkConf()
    conf.setMaster(url)
    conf.setAppName(app)
    conf.set("spark.executor.memory", spark_executor_memory)
    conf.set("spark.driver.memory", spark_driver_memory)
    conf.set("spark.memory.offHeap.enabled",True)
    conf.set("spark.memory.offHeap.size","16g")
    
    return conf 
Developer: PYFTS, Project: pyFTS, Lines: 23, Source: spark.py

Example 7: __call__

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def __call__(self):
        c = SparkConf().setAppName('Build %s' % self.model_name)

        log.info('Using spark master: %s', c.get('spark.master'))
        sc = SparkContext(conf=c)

        kwargs = self.model.prepare(sc)
        m = self.model.build(**kwargs)
        m = self.model.format_items(m)
        m = self.formatter(m)

        if self.output_path:
            log.info("Saving to: %s", self.output_path)
            if os.path.isdir(self.output_path):
                log.warn('Writing over output path: %s', self.output_path)
                shutil.rmtree(self.output_path)
            m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
        elif self.sample > 0:
            print('\n'.join(str(i) for i in m.take(self.sample)))

        log.info('Done.') 
Developer: wikilinks, Project: sift, Lines: 23, Source: build.py

Example 8: main

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def main(date, aws_access_key_id, aws_secret_access_key, region, table, sample_rate):

    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    APP_NAME = "TaarDynamo"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d") - PATCH_DAYS

    reduction_output = run_etljob(
        spark,
        date_obj,
        region,
        table,
        sample_rate,
        aws_access_key_id,
        aws_secret_access_key,
    )
    pprint(reduction_output) 
Developer: mozilla, Project: telemetry-airflow, Lines: 23, Source: taar_dynamo.py

Example 9: initialize

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def initialize(cls, options_from_ini=None):
        if cls._instance:
            return cls._instance

        from pyspark import SparkConf

        cls._instance = SparkConf()

        cls.options = dict(cls.DEFAULTS)
        if options_from_ini:
            cls.options.update(cls._parse_config(options_from_ini))

        for k, v in cls.options.items():
            cls._instance.set(k, v)

        return cls._instance 
Developer: malexer, Project: pytest-spark, Lines: 18, Source: config.py

Example 10: run

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def run(self):
        self.args = self.parse_arguments()

        conf = SparkConf()

        if self.args.spark_profiler:
            conf = conf.set("spark.python.profile", "true")

        sc = SparkContext(
            appName=self.name,
            conf=conf)
        sqlc = SQLContext(sparkContext=sc)

        self.init_accumulators(sc)

        self.run_job(sc, sqlc)

        if self.args.spark_profiler:
            sc.show_profiles()

        sc.stop() 
Developer: commoncrawl, Project: cc-pyspark, Lines: 23, Source: sparkcc.py

Example 11: set_spark_defaults

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def set_spark_defaults(conf, name='spark-job'):
    """
    Update the configuration dictionary for setting up Spark, creating the
    dictionary if it does not exist yet.
    """
    if not conf:
        conf = dict()

    home = os.path.join('/tmp', str(uuid.uuid4()))
    conf['SparkConfiguration'] = SparkConf()\
        .setMaster('yarn-client')\
        .setAppName(name)\
        .set("spark.sql.shuffle.partitions", "1000")\
        .set("spark.scheduler.revive.interval", "3")\
        .set("spark.task.maxFailures", "0")\
        .set("spark.executorEnv.HOME", home)

    return conf 
Developer: dsaidgovsg, Project: airflow-pipeline, Lines: 20, Source: spark_conf.py

Example 12: create_task

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_task(words):
    conf = SparkConf().setAppName('letter count')
    sc = SparkContext(conf=conf)
    seq = words.split()
    data = sc.parallelize(seq)
    counts = data.map(lambda word: (word, 1)).reduceByKey(add).collect()
    sc.stop()
    return dict(counts) 
Developer: mjhea0, Project: flask-spark-docker, Lines: 10, Source: tasks.py

Example 13: parse_raw_wikidata

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def parse_raw_wikidata(output):
    spark_conf = SparkConf().setAppName('QB Wikidata').setMaster(QB_SPARK_MASTER)
    sc = SparkContext.getOrCreate(spark_conf)  # type: SparkContext

    wikidata = sc.textFile('s3a://entilzha-us-west-2/wikidata/wikidata-20170306-all.json')

    def parse_line(line):
        if len(line) == 0:
            return []
        if line[0] == '[' or line[0] == ']':
            return []
        elif line.endswith(','):
            return [json.loads(line[:-1])]
        else:
            return [json.loads(line)]

    parsed_wikidata = wikidata.flatMap(parse_line).cache()
    property_map = extract_property_map(parsed_wikidata)
    b_property_map = sc.broadcast(property_map)

    wikidata_items = parsed_wikidata.filter(lambda d: d['type'] == 'item').cache()
    parsed_wikidata.unpersist()
    item_page_map = extract_item_page_map(wikidata_items)
    b_item_page_map = sc.broadcast(item_page_map)

    parsed_item_map = extract_items(wikidata_items, b_property_map, b_item_page_map)

    with open(output, 'wb') as f:
        pickle.dump({
            'parsed_item_map': parsed_item_map,
            'item_page_map': item_page_map,
            'property_map': property_map
        }, f)

    sc.stop() 
Developer: Pinafore, Project: qb, Lines: 37, Source: wikidata.py

Example 14: work_spark

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def work_spark(args):
    conf = pyspark.SparkConf().setAppName( "temp1" ).setMaster( "local[*]" ).set( "spark.driver.host", "localhost" ) \
            .set('spark.executor.memory', '6g')
    with pyspark.SparkContext("local[*]", "PySparkWordCount", conf=conf) as sc:
        (sc.parallelize(args.files)
                .flatMap(get_games)
                .flatMap(lambda game: game.mainline())
                #.sample(False, .1)
                .map(process)
                .mapPartitions(merge)
                .saveAsPickleFile('pikle.out')
                ) 
Developer: thomasahle, Project: fastchess, Lines: 14, Source: proc.py

Example 15: __init__

# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def __init__(self, processes: int = 8):
        self.spark_conf = SparkConf().setAppName("jmetalpy").setMaster(f"local[{processes}]")
        self.spark_context = SparkContext(conf=self.spark_conf)

        logger = self.spark_context._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.WARN) 
Developer: jMetal, Project: jMetalPy, Lines: 8, Source: evaluator.py


Note: The pyspark.SparkConf examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their respective developers, and copyright remains with the original authors. Consult each project's license before using or redistributing the code; do not reproduce this article without permission.