This article collects typical usage examples of the pyspark.SparkConf.set method in Python. If you have been wondering what SparkConf.set does, how to use it, or what it looks like in practice, the curated examples below may help. You can also explore further usage of its enclosing class, pyspark.SparkConf.
Fifteen code examples of SparkConf.set are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
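Before the examples, a quick orientation sketch may help: SparkConf.set(key, value) records a single configuration entry and returns the SparkConf object itself, so calls can be chained. The app name, master, and memory value below are arbitrary placeholders:

from pyspark import SparkConf, SparkContext

# set() returns the SparkConf, so calls chain fluently
conf = (SparkConf()
        .setAppName("demo")
        .setMaster("local[2]")
        .set("spark.executor.memory", "2g"))
print(conf.get("spark.executor.memory"))  # prints "2g"
sc = SparkContext(conf=conf)
sc.stop()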
Example 1: main
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    # Set the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")
    # Create a SparkContext with this configuration
    sc = SparkContext(conf=conf)
    # Create an SQLContext to run SQL queries
    sqlContext = SQLContext(sc)
    # Build the path to the input JSON data
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"
    json_file_path = os.path.join(curr_path +
                                  "/../Spark_Jobs/data/",
                                  json_name)
    # createSQLContext (a helper defined elsewhere) converts the JSON file to Parquet
    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)
    # Read back from the Parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example 2: configureSpark
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def configureSpark():
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("Apache Spark Alarm Parser")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    return sc
Example 3: start_spark
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def start_spark(self,
                spark_conf=None,
                executor_memory=None,
                profiling=False,
                graphframes_package='graphframes:graphframes:0.3.0-spark2.0-s_2.11',
                extra_conf=None):
    """Launch a SparkContext

    Parameters
    ----------
    spark_conf: path
        Path to a Spark configuration directory.
    executor_memory: string
        Executor memory in Java memory string format, e.g. '4G'.
        If `None`, `memory_per_executor` is used.
    profiling: boolean
        Whether to turn on Python profiling.
    graphframes_package: string
        Which graphframes package to load; if it isn't found locally,
        Spark will attempt to download it.
    extra_conf: dict
        Additional configuration options.
    """
    os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {graphframes_package} pyspark-shell"\
        .format(graphframes_package=graphframes_package)
    if spark_conf is None:
        spark_conf = os.path.join(os.environ['SPARK_HOME'], 'conf')
    os.environ['SPARK_CONF_DIR'] = os.path.realpath(spark_conf)
    os.environ['PYSPARK_PYTHON'] = sys.executable
    try:
        import findspark
        findspark.init()
        from pyspark import SparkContext, SparkConf
    except ImportError:
        raise ImportError("Unable to find pyspark -- are you sure SPARK_HOME is set?")
    conf = SparkConf()
    # '0' removes the limit on the size of results collected to the driver
    conf.set('spark.driver.maxResultSize', '0')
    if executor_memory is None:
        executor_memory = '%dM' % self.memory_per_executor
    conf.set('spark.executor.memory', executor_memory)
    if profiling:
        conf.set('spark.python.profile', 'true')
    else:
        conf.set('spark.python.profile', 'false')
    if extra_conf is not None:
        for k, v in extra_conf.items():
            conf.set(k, v)
    sc = SparkContext(master=self.master_url(), conf=conf)
    return sc
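A plausible call site for start_spark; the `cluster` object, its attribute values, and the extra key passed here are assumptions about the surrounding class, which must provide memory_per_executor and master_url():

# Hypothetical usage on an instance of the owning class
sc = cluster.start_spark(
    executor_memory='4G',
    profiling=True,
    extra_conf={'spark.ui.showConsoleProgress': 'false'})
print(sc.getConf().get('spark.executor.memory'))  # '4G'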
Example 4: setUp
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def setUp(self):
    conf = SparkConf().setAppName('testing').setMaster('local[2]').set('spark.driver.host', 'localhost')
    conf.set('spark.ui.showConsoleProgress', False)
    self.session = SparkSession.builder.config(conf=conf).getOrCreate()
    self.test_data = [
        ('Ricardo', 'engineering', 2),
        ('Tisa', 'sales', 3),
        ('Sheree', 'marketing', 4),
        ('Chantelle', 'engineering', 5),
        ('Kylee', 'finance', 2),
        ('Tamatha', 'marketing', 5),
        ('Trena', 'engineering', 2),
        ('Arica', 'engineering', 1),
        ('Santina', 'finance', 2),
        ('Daria', 'marketing', 1),
        ('Magnolia', 'sales', 2),
        ('Antonina', 'finance', 1),
        ('Sumiko', 'engineering', 1),
        ('Carmen', 'sales', 2),
        ('Delois', 'engineering', 1),
        ('Luetta', 'marketing', 3),
        ('Yessenia', 'sales', 1),
        ('Petra', 'engineering', 3),
        ('Charisse', 'engineering', 4),
        ('Lillian', 'engineering', 3),
        ('Wei', 'engineering', 2),
        ('Lahoma', 'sales', 2),
        ('Lucilla', 'marketing', 1),
        ('Stephaine', 'finance', 2),
    ]
Example 5: main
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    """
    Main entry point of the application
    """
    # Create the Spark configuration and context, shipping preprocessing.py to the executors
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])
    # Pass S3 credentials to the underlying Hadoop configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])
    # Single-pass predictions
    fast_predict(sc, file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")
    # Stop the application
    sc.stop()
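As an alternative to reaching into sc._jsc, Hadoop options can usually be set through SparkConf itself by prefixing the key with spark.hadoop., which Spark copies into the Hadoop Configuration. A minimal sketch; the environment variable names mirror the example, and the legacy s3n scheme is kept for consistency (newer deployments typically use s3a):

import os
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("s3-conf-sketch")
# spark.hadoop.* keys are propagated into the Hadoop Configuration
conf.set("spark.hadoop.fs.s3n.awsAccessKeyId", os.environ["AWS_ACCESS_KEY"])
conf.set("spark.hadoop.fs.s3n.awsSecretAccessKey", os.environ["AWS_SECRET_KEY"])
sc = SparkContext(conf=conf)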
Example 6: main
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    conf = SparkConf()
    conf.set("spark.default.parallelism", "24")
    sc = SparkContext(appName="PhoneLab Preprocessing", conf=conf)
    lines = sc.textFile(data_files, use_unicode=False)
    # Create LogLine objects and filter out empty lines
    logs = lines.flatMap(ll_mapper)
    # Save in an intermediate format
    logs.saveAsTextFile(out_dir, compressionCodecClass=codec)
    return

    # NOTE: the return above means the gap-detection stage below never runs
    # Gap detection
    keyed = logs.map(ll_gap_map)
    merged = keyed.groupByKey()
    # At this point we have ((boot_id, date), [line_num]) tuples; the last step
    # is to find all the gaps within each key/tuple.
    result = merged.flatMap(find_gaps)
    gaps = result.collect()
    with open("/spark/gaps.json", 'w') as fd:
        fd.write(json.dumps(gaps, indent=4))
Example 7: __connected_yarn_spark_cluster
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def __connected_yarn_spark_cluster(self, pilotcompute_description):
    number_cores = 1
    if "number_cores" in pilotcompute_description:
        number_cores = int(pilotcompute_description["number_cores"])
    number_of_processes = 1
    if "number_of_processes" in pilotcompute_description:
        number_of_processes = int(pilotcompute_description["number_of_processes"])
    executor_memory = "1g"
    if "physical_memory_per_process" in pilotcompute_description:
        executor_memory = pilotcompute_description["physical_memory_per_process"]
    conf = SparkConf()
    conf.set("spark.num.executors", str(number_of_processes))
    conf.set("spark.executor.instances", str(number_of_processes))
    conf.set("spark.executor.memory", executor_memory)
    conf.set("spark.executor.cores", number_cores)
    # Forward any spark.* entries from the description verbatim
    if pilotcompute_description is not None:
        for key in pilotcompute_description.keys():
            if key.startswith("spark"):
                conf.set(key, pilotcompute_description[key])
    conf.setAppName("Pilot-Spark")
    conf.setMaster("yarn-client")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
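The loop near the end forwards any key in the description that starts with "spark" straight to SparkConf.set, so callers can tune arbitrary Spark settings. A hedged sketch of a description dict this method would accept; the key names are inferred from the body above and the spark.serializer entry is an illustrative assumption:

# Hypothetical pilot description; spark.* entries are copied into the conf
pilotcompute_description = {
    "number_cores": 2,
    "number_of_processes": 4,
    "physical_memory_per_process": "2g",
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
}
# ...inside the owning class:
pilot = self.__connected_yarn_spark_cluster(pilotcompute_description)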
Example 8: start
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)
    brokers = "192.192.0.27:9092"
    topics = ['topic7']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of each tuple is the received Kafka payload
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")
    wordcounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example 9: __init__
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
class SparkContextFactory:
    def __init__(self):
        # On Windows the environment variables were not picked up, so they had to be set here:
        ##os.environ["SPARK_HOME"] = "C:\Spark"
        ##os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
        ##sys.path.append("C:\Spark\python")
        ##sys.path.append("C:\Spark\bin")

        # Specify the Spark home
        os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
        # Add pyspark to the path so its libraries can be accessed by this application
        sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SQLContext

        self.conf = SparkConf().setMaster("yarn-client")
        self.conf.setAppName("MrT")
        self.conf.set("spark.executor.memory", "5g")
        self.conf.set("spark.driver.memory", "10g")
        self.sc = SparkContext(conf=self.conf, pyFiles=[
            "ComputeCovHistory.py", "go.py", "risk_DSconvert.py",
            "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

        # toDF is monkey-patched onto RDDs inside the SQLContext constructor,
        # so a SQLContext must be created before toDF can be used
        self.sqlContextInstance = SQLContext(self.sc)

    def disconnect(self):
        self.sc.stop()
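A usage sketch for the factory; the DataFrame contents here are invented for illustration, and the module paths listed in pyFiles are assumed to exist on the cluster:

factory = SparkContextFactory()
# The SQLContext created in __init__ enables createDataFrame/toDF
df = factory.sqlContextInstance.createDataFrame(
    [(1, "a"), (2, "b")], ["id", "label"])
df.show()
factory.disconnect()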
Example 10: main
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def main():
    spark_conf = SparkConf().setAppName("Different-Sampling data").setMaster('local[*]')
    spark_conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    sc = SparkContext(conf=spark_conf)
    GA.logInConsole(0, "input file read!")
    rdd = sc.textFile("/home/fatemeh/Data/saveData.txt", minPartitions=500, use_unicode=False)
    rdd.unpersist()
    # print('\nNumber of Partitions for this run: ', rdd.getNumPartitions())
    vectorRDD = rdd.map(lambda line: toVector(line, splitter=' '))
    GA.logInConsole(0, "Data Vectorized!")
    ss = list()
    GA.logInConsole(-1, 'Start the ensemble')
    GA.logInConsole(-10, "GA with range 3")
    ss.append(GA.parallel_GA_main(vectorRDD, sc, 5))
    # GA.logInConsole(-10, "GA with range 4")
    # ss.append(GA.parallel_GA_main(norm, sc, 4))
    # GA.logInConsole(-10, "GA with range 5")
    # ss.append(GA.parallel_GA_main(norm, sc, 5))
    # GA.logInConsole(-10, "GA with range 3 and Sampled data set")
    # sampleRDD = norm.sample(False, 0.6, seed=10)
    # ss.append(GA.parallel_GA_main(sampleRDD, sc, 3))
    print(ss)
    # selectedSS = voted_subsapces(ss)
    # SSD.outlierDetection(vectorRDD, ss)
    GA.logInConsole(100, "\nend of program")
    sc.stop()
Example 11: start
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)
    brokers = "localhost:9092"
    topics = ['test']
    kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # Note: the second element of each tuple is the received Kafka payload
    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)
    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)
    wordcounts.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Example 12: get_default_spark_conf
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def get_default_spark_conf():
    conf = SparkConf(). \
        setAppName("pyunit-test"). \
        setMaster("local-cluster[3,1,2048]"). \
        set("spark.ext.h2o.disable.ga", "true"). \
        set("spark.driver.memory", "2g"). \
        set("spark.executor.memory", "2g"). \
        set("spark.ext.h2o.client.log.level", "DEBUG"). \
        set("spark.ext.h2o.repl.enabled", "false"). \
        set("spark.task.maxFailures", "1"). \
        set("spark.rpc.numRetries", "1"). \
        set("spark.deploy.maxExecutorRetries", "1"). \
        set("spark.network.timeout", "360s"). \
        set("spark.worker.timeout", "360"). \
        set("spark.ext.h2o.backend.cluster.mode", ExternalClusterTestHelper.cluster_mode()). \
        set("spark.ext.h2o.cloud.name", ExternalClusterTestHelper.unique_cloud_name("test")). \
        set("spark.ext.h2o.external.start.mode", os.getenv("spark.ext.h2o.external.start.mode", "manual")). \
        set("spark.sql.warehouse.dir", "file:" + os.path.join(os.getcwd(), "spark-warehouse"))

    if ExternalClusterTestHelper.tests_in_external_mode():
        conf.set("spark.ext.h2o.client.ip", ExternalClusterTestHelper.local_ip())
        conf.set("spark.ext.h2o.external.cluster.num.h2o.nodes", "2")
    return conf
Example 13: setUpClass
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def setUpClass(cls):
    class_name = cls.__name__
    conf = SparkConf()
    conf.set('spark.app.name', class_name)
    # Read the Spark configuration file and update the conf
    test_spark_config = ConfigParser.ConfigParser()
    test_spark_config.read('test_config.cfg')
    test_spark_config.sections()
    configs = dict(test_spark_config.items('spark_conf_test_generic'))
    for k, v in configs.items():
        conf.set(k, v)
    cls.spark_test_configs = configs
    # Create the Spark context
    cls.sc = SparkContext(conf=conf)
    if 'PYSPARK_DRIVER_PYTHON' in configs.keys():
        cls.sc.pythonExec = configs['PYSPARK_DRIVER_PYTHON']
    else:
        cls.sc.pythonExec = 'python2.7'
    # Silence noisy JVM loggers and set up Python logging
    logger = cls.sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
    logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s: %(message)s')
    cls.logger = logging.getLogger(__name__)
    cls.logger.setLevel(logging.DEBUG)
Example 14: getSparkContext
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def getSparkContext(self, appName, master):
    print(appName)
    print(master)
    conf = SparkConf().setAppName(appName).setMaster(master)
    # Pin Spark to the loopback interface to avoid hostname-resolution issues
    conf.set("spark.local.ip", "127.0.0.1")
    conf.set("spark.driver.host", "127.0.0.1")
    return SparkContext(conf=conf)
Example 15: _test_broadcast_on_driver
# Required module: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import set [as alias]
def _test_broadcast_on_driver(self, *extra_confs):
    conf = SparkConf()
    for key, value in extra_confs:
        conf.set(key, value)
    conf.setMaster("local-cluster[2,1,1024]")
    self.sc = SparkContext(conf=conf)
    bs = self.sc.broadcast(value=5)
    self.assertEqual(5, bs.value)
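Because extra_confs is a variadic sequence of (key, value) pairs, the same helper can exercise broadcasting under different settings. A sketch of plausible calls from within the test class; the specific configuration key passed in the second call is an assumption:

# Inside the test class: default run, then again with I/O encryption enabled
self._test_broadcast_on_driver()
self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true"))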