

Python SparkConf.set Method Code Examples

This article collects typical usage examples of the Python method pyspark.SparkConf.set. If you have been wondering exactly what SparkConf.set does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also browse further usage examples of the pyspark.SparkConf class that the method belongs to.


A total of 15 code examples of the SparkConf.set method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
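Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: build a SparkConf, call set (which returns the same SparkConf, so calls can be chained), and hand the configuration to a SparkContext. The app name, master URL, and property values below are illustrative placeholders, not taken from any of the cited projects.

from pyspark import SparkConf, SparkContext

# set() takes a property key and a value and returns the SparkConf itself,
# so configuration calls can be chained.
conf = SparkConf() \
    .setAppName("example-app") \
    .setMaster("local[2]") \
    .set("spark.executor.memory", "1g") \
    .set("spark.ui.showConsoleProgress", "false")

sc = SparkContext(conf=conf)
print(sc.getConf().get("spark.executor.memory"))  # prints '1g'
sc.stop()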

Example 1: main

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"

    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Author: alt-code, Project: AutoSpark, Lines: 33, Source: tweet_scanner.py

Example 2: configureSpark

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def configureSpark():
	conf = SparkConf()
	conf.setMaster("local")
	conf.setAppName("Apache Spark Alarm Parser")
	conf.set("spark.executor.memory", "1g")
	sc = SparkContext(conf = conf)
	return sc
Author: ChinmaySKulkarni, Project: Alarm_Tracker, Lines: 9, Source: spark_parser.py

Example 3: start_spark

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
    def start_spark(self,
                    spark_conf=None, 
                    executor_memory=None,
                    profiling=False, 
                    graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11', 
                    extra_conf = None):
        """Launch a SparkContext 
        
        Parameters
        ----------
        spark_conf: path
            path to a spark configuration directory
        executor_memory: string
            executor memory in java memory string format, e.g. '4G'
            If `None`, `memory_per_executor` is used. 
        profiling: boolean
            whether to turn on python profiling or not
        graphframes_package: string
            which graphframes to load - if it isn't found, spark will attempt to download it
        extra_conf: dict
            additional configuration options
        """

        os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
                                            .format(graphframes_package=graphframes_package)
        
        if spark_conf is None:
            spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')

        os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)

        os.environ['PYSPARK_PYTHON'] = sys.executable

        try: 
            import findspark; findspark.init()
            from pyspark import SparkContext, SparkConf
        except ImportError: 
            raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")

        conf = SparkConf()

        conf.set('spark.driver.maxResultSize', '0')

        if executor_memory is None: 
            executor_memory = '%dM'%self.memory_per_executor

        conf.set('spark.executor.memory', executor_memory)

        if profiling: 
            conf.set('spark.python.profile', 'true')
        else:
            conf.set('spark.python.profile', 'false')
        
        if extra_conf is not None: 
            for k,v in extra_conf.items(): 
                conf.set(k,v)

        sc = SparkContext(master=self.master_url(), conf=conf)

        return sc    
Author: vinjana, Project: sparkhpc, Lines: 62, Source: sparkjob.py

Example 4: setUp

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
 def setUp(self):
     conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
     conf.set('spark.ui.showConsoleProgress', False)
     self.session = SparkSession.builder.config(conf=conf).getOrCreate()
     self.test_data = [
         ('Ricardo', 'engineering', 2),
         ('Tisa', 'sales', 3),
         ('Sheree', 'marketing', 4), 
         ('Chantelle', 'engineering', 5),
         ('Kylee', 'finance', 2),
         ('Tamatha', 'marketing', 5),
         ('Trena', 'engineering', 2),
         ('Arica', 'engineering', 1),
         ('Santina', 'finance', 2),
         ('Daria', 'marketing', 1),
         ('Magnolia', 'sales', 2),
         ('Antonina', 'finance', 1),
         ('Sumiko', 'engineering', 1),
         ('Carmen', 'sales', 2),
         ('Delois', 'engineering', 1),
         ('Luetta', 'marketing', 3),
         ('Yessenia', 'sales', 1),
         ('Petra', 'engineering', 3),
         ('Charisse', 'engineering', 4),
         ('Lillian', 'engineering', 3),
         ('Wei', 'engineering', 2),
         ('Lahoma', 'sales', 2),
         ('Lucilla', 'marketing', 1),
         ('Stephaine', 'finance', 2),
     ]
Author: bfemiano, Project: misc_scripts, Lines: 32, Source: test_analysis_teacher.py

Example 5: main

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    """
    Main entry point of the application
    """

    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc, file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
Author: alialavia, Project: TwitterNews, Lines: 27, Source: predict.py

Example 6: main

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
	conf = SparkConf()
	conf.set("spark.default.parallelism", "24")
	sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)

	lines = sc.textFile(data_files, use_unicode=False)

	# Create LogLine objects and filter out empty lines
	logs = lines.flatMap(ll_mapper)

	# Save in an intermediate format
	logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
	return

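	# NOTE: the unconditional return above makes the gap-detection code below
	# unreachable; it appears to be kept in the source for reference.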
	# Gap detection
	keyed = logs.map(ll_gap_map)
	merged = keyed.groupByKey()

	# At this point we have ((boot_id, date), [line_num]) tuples The last step.
	# is to find all the gaps within each key/tuple.
	result = merged.flatMap(find_gaps)
	gaps = result.collect()

	with open("/spark/gaps.json", 'w') as fd:
		fd.write(json.dumps(gaps, indent=4))
Author: gurupras, Project: phonelab-postprocessing, Lines: 27, Source: preprocess.py

Example 7: __connected_yarn_spark_cluster

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
    def __connected_yarn_spark_cluster(self, pilotcompute_description):

        number_cores=1
        if pilotcompute_description.has_key("number_cores"):
            number_cores=int(pilotcompute_description["number_cores"])
        
        number_of_processes = 1
        if pilotcompute_description.has_key("number_of_processes"):
            number_of_processes = int(pilotcompute_description["number_of_processes"])

        executor_memory="1g"
        if pilotcompute_description.has_key("number_of_processes"):
            executor_memory = pilotcompute_description["physical_memory_per_process"]

        conf = SparkConf()
        conf.set("spark.num.executors", str(number_of_processes))
        conf.set("spark.executor.instances", str(number_of_processes))
        conf.set("spark.executor.memory", executor_memory)
        conf.set("spark.executor.cores", number_cores)
        if pilotcompute_description is not None:
            for i in pilotcompute_description.keys():
                if i.startswith("spark"):
                    conf.set(i, pilotcompute_description[i])
        conf.setAppName("Pilot-Spark")
        conf.setMaster("yarn-client")
        sc = SparkContext(conf=conf)
        sqlCtx = SQLContext(sc)
        pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
        return pilot
Author: drelu, Project: SAGA-Hadoop, Lines: 31, Source: __init__.py

Example 8: start

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")

    wordcounts.pprint()
    # Show the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Author: blair1, Project: hadoop-spark, Lines: 27, Source: kafka_streaming_direct.py

Example 9: __init__

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
class SparkContextFactory:
  def __init__(self):
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["SPARK_HOME"] = "C:\Spark"
    # not sure why windows environment variable can't be read, I set it 
    ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
    ##sys.path.append("C:\Spark\python")
    ##sys.path.append("C:\Spark\bin")

    # specify spark home
    os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
    # specify pyspark path so its libraries can be accessed by this application
    sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext

    self.conf = SparkConf().setMaster("yarn-client")
    self.conf.setAppName("MrT")
    self.conf.set("spark.executor.memory", "5g")
    self.conf.set("spark.driver.memory", "10g")

    self.sc = SparkContext(conf = self.conf, pyFiles =
    ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

    """
    toDF method is a monkey patch executed inside SQLContext constructor
    so to be able to use it you have to create a SQLContext first
    """
    self.sqlContextInstance = SQLContext(self.sc)


  def disconnect(self):
    self.sc.stop()
Author: howardx, Project: pyspark, Lines: 35, Source: risk_SparkContextFactory.py
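The docstring in this example points out that toDF is a monkey patch installed when an SQLContext is constructed, so an SQLContext (or SparkSession) must exist before RDD.toDF can be called. A minimal sketch of what that means in practice is shown below; the app name, column names, and sample rows are made up for illustration.

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().setAppName("toDF-demo").setMaster("local[1]")
sc = SparkContext(conf=conf)

# Constructing the SQLContext is what patches toDF onto RDD;
# calling rows.toDF(...) before this line would fail.
sqlContext = SQLContext(sc)

rows = sc.parallelize([("Alice", 1), ("Bob", 2)])
df = rows.toDF(["name", "value"])
df.show()
sc.stop()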

Example 10: main

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf= spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt",  minPartitions= 500, use_unicode=False)
    rdd.unpersist()
#     print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter = ' '))
    
    GA.logInConsole(0 , "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD,sc, 5))
#     GA.logInConsole(-10, "GA with range 4")
#     ss.append(GA.parallel_GA_main(norm,sc, 4))
#     GA.logInConsole(-10, "GA with range 5")
#     ss.append(GA.parallel_GA_main(norm,sc, 5))
#     GA.logInConsole(-10, "GA with range 3 and Sampled data set")
#    sampleRDD = norm.sample(False, 0.6, seed=10)
#    ss.append(GA.parallel_GA_main(sampleRDD,sc, 3))
    print(ss)
    #selectedSS = voted_subsapces(ss)
#     SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Author: fchgithub, Project: OriginPySparkRepository, Lines: 29, Source: ODHD.py

Example 11: start

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']

    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})

    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of the tuple is the received Kafka message

    words = lines1.flatMap(lambda line: line.split(" "))

    pairs = words.map(lambda word: (word, 1))

    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    print(wordcounts)

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # Show the distribution of the generated random numbers
    ssc.start()  # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Author: blair1, Project: hadoop-spark, Lines: 29, Source: kafka-direct.py

Example 12: get_default_spark_conf

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga","true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")) .\
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))


    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")

    return conf
Author: seuwangcy, Project: sparkling-water, Lines: 27, Source: test_utils.py

Example 13: setUpClass

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
    def setUpClass(cls):

        class_name = cls.__name__
        conf = SparkConf()
        conf.set('spark.app.name', class_name)

        # Read the spark configuration and update the spark conf
        test_spark_config = ConfigParser.ConfigParser()
        test_spark_config.read('test_config.cfg')
        test_spark_config.sections()
        configs = dict(test_spark_config.items('spark_conf_test_generic'))
        for k, v in configs.items():
            conf.set(k, v)
        cls.spark_test_configs = configs
        # Create the spark context
        cls.sc = SparkContext(conf=conf)
        if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
            cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
        else:
            cls.sc.pythonExec = 'python2.7'

        logger = cls.sc._jvm.org.apache.log4j
        logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
        logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

        logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
        cls.logger = logging.getLogger(__name__)
        cls.logger.setLevel(logging.DEBUG)
Author: bossjones, Project: sparkonda, Lines: 30, Source: test_sparkonda.py

Example 14: getSparkContext

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
 def getSparkContext(self, appName, master):
     print(appName)
     print(master)
     conf = SparkConf().setAppName(appName).setMaster(master)
     conf.set("spark.local.ip", "127.0.0.1")
     conf.set("spark.driver.host", "127.0.0.1")
     return SparkContext(conf=conf)
Author: seoeun25, Project: spark-app, Lines: 9, Source: WordCount2.py

Example 15: _test_broadcast_on_driver

# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
 def _test_broadcast_on_driver(self, *extra_confs):
     conf = SparkConf()
     for key, value in extra_confs:
         conf.set(key, value)
     conf.setMaster("local-cluster[2,1,1024]")
     self.sc = SparkContext(conf=conf)
     bs = self.sc.broadcast(value=5)
     self.assertEqual(5, bs.value)
Author: Brett-A, Project: spark, Lines: 10, Source: test_broadcast.py


Note: The pyspark.SparkConf.set examples above were compiled by 纯净天空 from open-source code and documentation hosted on GitHub, MSDocs, and similar platforms. The snippets are drawn from projects contributed by open-source developers; copyright remains with the original authors, and any use or redistribution should follow the corresponding project's license. Please do not republish without permission.