This article collects typical usage examples of the Python method pyspark.SparkConf.setAppName. If you have been wondering exactly how SparkConf.setAppName is used in Python, or what it looks like in real code, the hand-picked examples below should help. You can also read further about its containing class, pyspark.SparkConf.
The following presents 15 code examples of SparkConf.setAppName, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
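All of the examples share the same basic pattern: build a SparkConf, name the application with setAppName, and hand the configuration to a SparkContext. A minimal sketch of that pattern is shown below; the application name and master URL are placeholders, not values taken from any of the examples.

from pyspark import SparkConf, SparkContext

# Minimal sketch: name the application and create a context from the conf.
conf = SparkConf()
conf.setAppName("my-example-app")  # hypothetical application name
conf.setMaster("local[2]")         # hypothetical master; use your cluster URL here

sc = SparkContext(conf=conf)
print(sc.appName)                  # reports the name set via setAppName
sc.stop()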
Example 1: spark_config
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def spark_config(self):
    if self._spark_config is None:
        os.environ['SPARK_SUBMIT_CLASSPATH'] = ','.join(self.spex_conf.spark_config.jars)

        conf = SparkConf()
        conf.setAppName(self.spex_conf.spark_config.name)
        conf.setMaster(self.spex_conf.spark_config.master)

        conf.set('spark.rdd.compress', 'true')
        conf.set('spark.io.compression.codec', 'lz4')
        conf.set('spark.mesos.coarse',
                 'true' if self.spex_conf.spark_config.coarse_mode else 'false')

        # TODO - Setup all the other cruft as needed
        # conf.set('spark.executor.memory', '4g')
        # conf.set('spark.cores.max', '16')
        # conf.set('spark.task.cpus', '6')

        # TODO - bind port for spark web ui
        self._spark_config = conf

    config = self._spark_config
    # These are always set; if someone changes them we simply set them back
    config.set('spark.executor.uri', self.artifact_resolver(self.spex_conf.spark_distro))
    config.setExecutorEnv(key='PYSPARK_PYTHON', value='./%s daemon' % self.spex_conf.spex_name)
    return config
Example 2: main
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def main():
    """
    Main entry point of the application
    """
    # Create spark configuration and spark context
    include_path = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'preprocessing.py'))
    conf = SparkConf()
    conf.set('spark.executor.memory', '1500m')
    conf.setAppName("Generating predictions")
    sc = SparkContext(conf=conf, pyFiles=[include_path])

    # Set S3 configuration
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY'])
    sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_KEY'])

    # Single-pass predictions
    fast_predict(sc,
                 file_input="s3n://twitter-stream-data/twitter-*",
                 file_output="s3n://twitter-stream-predictions/final",
                 sports_model="PyTwitterNews/models/sports.model",
                 politics_model="PyTwitterNews/models/politics.model",
                 technology_model="PyTwitterNews/models/technology.model")

    # Stop application
    sc.stop()
Example 3: __init__
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
class SparkContextFactory:
    def __init__(self):
        # On Windows the environment variables could not be read for some reason,
        # so they were set explicitly (kept here for reference):
        ## os.environ["SPARK_HOME"] = "C:\Spark"
        ## os.environ["HADOOP_CONF_DIR"] = "C:\hdp\bin"
        ## sys.path.append("C:\Spark\python")
        ## sys.path.append("C:\Spark\bin")

        # specify spark home
        os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark"
        # specify the pyspark path so its libraries can be accessed by this application
        sys.path.append("/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p0.4/lib/spark/python")
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import SQLContext

        self.conf = SparkConf().setMaster("yarn-client")
        self.conf.setAppName("MrT")
        self.conf.set("spark.executor.memory", "5g")
        self.conf.set("spark.driver.memory", "10g")

        self.sc = SparkContext(conf=self.conf, pyFiles=
            ["ComputeCovHistory.py", "go.py", "risk_DSconvert.py", "ewstats.py", "ewstatsRDD.py", "ewstatswrap.py"])

        """
        toDF is a monkey patch executed inside the SQLContext constructor,
        so to be able to use it you have to create a SQLContext first
        """
        self.sqlContextInstance = SQLContext(self.sc)

    def disconnect(self):
        self.sc.stop()
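As the docstring above notes, RDD.toDF only becomes available once a SQLContext has been constructed. A hypothetical use of this factory (the sample data and column names are illustrative, not part of the original project) might look like:

factory = SparkContextFactory()
rows = factory.sc.parallelize([("a", 1), ("b", 2)])
df = rows.toDF(["letter", "count"])  # available only because __init__ created a SQLContext
df.show()
factory.disconnect()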
Example 4: main
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def main():
    # Setting the cluster configuration parameters
    conf = SparkConf()
    conf.setMaster("spark://localhost:7077")
    conf.setAppName("Tweet App")
    conf.set("spark.executor.memory", "3g")
    conf.set("spark.driver.memory", "4g")

    # Creating a Spark Context with the conf file
    sc = SparkContext(conf=conf)

    # Creating an SQL context to perform SQL queries
    sqlContext = SQLContext(sc)

    # Define the data path
    curr_path = os.path.dirname(os.path.abspath(__file__))
    json_name = "out.json"
    json_file_path = os.path.join(curr_path + "/../Spark_Jobs/data/", json_name)

    parquet_file_path = createSQLContext(json_file_path, sqlContext)
    print(parquet_file_path)

    # Read from the parquet file
    parquetFile = sqlContext.read.parquet(parquet_file_path)
    parquetFile.registerTempTable("tweets")
    counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
    print("============= Count =================")
    print("Count:: " + str(counter.collect()[0].cnt))
Example 5: configureSpark
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def configureSpark():
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("Apache Spark Alarm Parser")
    conf.set("spark.executor.memory", "1g")
    sc = SparkContext(conf=conf)
    return sc
Example 6: __connected_yarn_spark_cluster
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def __connected_yarn_spark_cluster(self, pilotcompute_description):
    number_cores = 1
    if "number_cores" in pilotcompute_description:
        number_cores = int(pilotcompute_description["number_cores"])

    number_of_processes = 1
    if "number_of_processes" in pilotcompute_description:
        number_of_processes = int(pilotcompute_description["number_of_processes"])

    executor_memory = "1g"
    if "physical_memory_per_process" in pilotcompute_description:
        executor_memory = pilotcompute_description["physical_memory_per_process"]

    conf = SparkConf()
    conf.set("spark.num.executors", str(number_of_processes))
    conf.set("spark.executor.instances", str(number_of_processes))
    conf.set("spark.executor.memory", executor_memory)
    conf.set("spark.executor.cores", number_cores)

    # Pass any explicit spark.* settings in the description straight through
    if pilotcompute_description is not None:
        for key in pilotcompute_description.keys():
            if key.startswith("spark"):
                conf.set(key, pilotcompute_description[key])

    conf.setAppName("Pilot-Spark")
    conf.setMaster("yarn-client")
    sc = SparkContext(conf=conf)
    sqlCtx = SQLContext(sc)
    pilot = PilotCompute(spark_context=sc, spark_sql_context=sqlCtx)
    return pilot
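For illustration, a pilot compute description passed to the method above might look like the following; the concrete values are assumptions, not taken from the original project, and any key starting with "spark" is copied onto the SparkConf verbatim:

pilotcompute_description = {
    "number_of_processes": 4,             # becomes spark.executor.instances
    "number_cores": 2,                    # becomes spark.executor.cores
    "physical_memory_per_process": "2g",  # becomes spark.executor.memory
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",  # passed through as-is
}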
Example 7: sparkconfig
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def sparkconfig():
    # Spark configuration options (standalone / local variants kept for reference):
    # conf = SparkConf()
    # conf.setMaster("spark://3.168.100.58:7077")  # uncomment for standalone cluster
    # conf.setMaster("local")                      # uncomment for local execution
    # conf.setAppName("demo_chain")
    # conf.set("spark.executor.memory", "2g")
    # conf.set("spark.default.parallelism", 56)  # 48)
    # conf.set("spark.sql.inMemoryColumnarStorage.compressed", "true")
    # conf.set("sql.inMemoryColumnarStorage.batchSize", 2000)

    # AMAZON AWS EMR
    conf = SparkConf()
    conf.setMaster("yarn-client")  # client mode sends output to the terminal
    # conf.setMaster("yarn-cluster")  # this seems to run faster, but can't confirm
    conf.set("spark.default.parallelism", 648)
    conf.setAppName("spark_markov_chain")
    conf.set("spark.executor.memory", "22g")
    conf.set("spark.executor.instances", 9)
    conf.set("spark.executor.cores", 9)
    conf.set("spark.yarn.executor.memoryOverhead", 800)
    conf.set("spark.rdd.compress", "True")
    conf.set("spark.shuffle.consolidateFiles", "True")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    return conf
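Since sparkconfig() only returns a SparkConf, the caller still has to create the context itself. A hedged usage sketch (the input path is a placeholder, not from the original job):

sc = SparkContext(conf=sparkconfig())
lines = sc.textFile("hdfs:///data/input")  # hypothetical input path
print(lines.count())
sc.stop()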
Example 8: stackexchange_xml_spark_job
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def stackexchange_xml_spark_job():
    server = bluebook_conf.HDFS_FQDN
    conf = SparkConf()

    xml_file_address = "hdfs://" + server + "/" + \
        bluebook_conf.STACKEXCHANGE_XML_FOLDER_NAME + \
        bluebook_conf.STACKEXCHANGE_XML_FILE_NAME
    json_ques_folder_address = "hdfs://" + server + "/" + \
        bluebook_conf.STACKEXCHANGE_JSON_QUES_FOLDER_NAME
    json_ans_folder_address = "hdfs://" + server + "/" + \
        bluebook_conf.STACKEXCHANGE_JSON_ANS_FOLDER_NAME

    conf.setAppName('stackexchange_xml_spark_job')
    spark_context = SparkContext(conf=conf)
    file = spark_context.textFile(xml_file_address)

    # Ques and Ans records are stored separately depending on their 'posttypeid':
    # Ques -> posttypeid == 1
    # Ans  -> posttypeid == 2
    ques = file.map(stackexchange_xml_mapper)\
               .filter(lambda dic: 'posttypeid' in dic.keys())\
               .filter(lambda dic: dic['posttypeid'] == '1')\
               .map(lambda d: jsoner(d))
    ans = file.map(stackexchange_xml_mapper)\
              .filter(lambda dic: 'posttypeid' in dic.keys())\
              .filter(lambda dic: dic['posttypeid'] == '2')\
              .map(lambda d: jsoner(d))
    ques.saveAsTextFile(json_ques_folder_address)
    ans.saveAsTextFile(json_ans_folder_address)
Example 9: main
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def main(args):
    if len(args) < 2:
        sys.exit(1)

    # Setting the cluster configuration parameters
    spark_master = args[0]
    spark_data_file_name = args[1]
    file_path = CURR_DIR + "/" + spark_data_file_name

    conf = SparkConf()
    conf.setMaster(spark_master)
    conf.setAppName("Log Scanner")

    # Creating a Spark Context with the conf file
    sc = SparkContext(conf=conf)

    txt_logs = sc.textFile(file_path).filter(lambda line: check(line))
    access_logs = txt_logs.map(lambda line: AccessLog(line))

    # Getting response codes from the log objects and caching them
    response_codes = access_logs.map(lambda log: log.get_status()).cache()
    log_count = response_codes.count()
    print("Total Response Codes: " + str(log_count))

    cnt = response_codes.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
    response200 = cnt.filter(lambda kv: kv[0] == "200").map(lambda kv: kv[1]).collect()
    print("###########################")
    print("## Success Rate : " + str(int(response200[0]) * 100 / log_count) + " % ##")
    print("###########################")
Example 10: read_conf
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def read_conf():
    """
    Set up the Spark configuration
    """
    conf = SparkConf()
    conf.setMaster("local[*]")
    conf.setAppName("Testing")
    return conf
Example 11: getSparkConf
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def getSparkConf(self):
    conf = SparkConf()
    conf.setAppName(self.PROJECT_NAME)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.cleaner.ttl", self.TTL)
    # Elasticsearch (es.*) settings
    conf.set("es.index.auto.create", "true")
    conf.set("es.nodes", self.ES_NODES)
    return conf
Example 12: init_spark_context
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def init_spark_context():
    # Load the Spark context
    conf = SparkConf().setAppName("event-contour-server")
    conf.setMaster("local[4]")
    conf.setAppName("reduce")  # overrides the app name set on the line above
    conf.set("spark.executor.memory", "4g")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['app.py', 'contourGenerator.py', 'EventParallelize.py'])
    return sc
Example 13: get_sc
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def get_sc():
    """Defines and returns a SparkContext configured via SparkConf."""
    conf = SparkConf()
    conf.setAppName("Jon's PySpark")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryoserializer.buffer.mb", "256")
    conf.set("spark.akka.frameSize", "500")
    conf.set("spark.akka.askTimeout", "30")
    return SparkContext(conf=conf)
Example 14: init
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def init(self):
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    conf = SparkConf()
    conf.setMaster("local[10]")
    conf.setAppName("PySparkShell")
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.driver.memory", "1g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
Example 15: __init__
# Required import: from pyspark import SparkConf [as alias]
# Or: from pyspark.SparkConf import setAppName [as alias]
def __init__(self, master, name):
    self.name = name
    self.master = master
    print("init spark ...")
    os.environ["HADOOP_HOME"] = r"D:\code\wqr\hadoop-common-2.2.0-bin"
    conf = SparkConf()
    conf.setMaster(self.master)
    conf.setAppName(self.name)
    self.sc = SparkContext(conf=conf)