This article collects typical usage examples of pyspark.SparkConf in Python. If you are wondering what pyspark.SparkConf does, how to call it, or want to see it used in context, the curated code examples below may help. You can also explore further usage examples from the containing module, pyspark.
The following presents 15 code examples of pyspark.SparkConf, sorted by popularity by default.
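Before the project-specific examples, a minimal generic sketch of the usual SparkConf workflow may be useful; the app name, master URL, and memory setting below are illustrative assumptions rather than values taken from any of the examples.
from pyspark import SparkConf, SparkContext

# Build a configuration object, then hand it to the context.
conf = (SparkConf()
        .setAppName("my-app")               # illustrative application name
        .setMaster("local[*]")              # run locally on all available cores
        .set("spark.executor.memory", "2g"))
sc = SparkContext.getOrCreate(conf=conf)
# ... submit work through sc ...
sc.stop()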
Example 1: create_spark_context
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_spark_context(app_name="Quiz Bowl", configs=None) -> SparkContext:
    if QB_SPARK_MASTER != "":
        log.info("Spark master is %s" % QB_SPARK_MASTER)
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)\
            .setMaster(QB_SPARK_MASTER)
    else:
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)

    if configs is not None:
        for key, value in configs:
            if key in ('spark.executor.cores', 'spark.max.cores'):
                if value > QB_MAX_CORES:
                    log.info('Requested {r_cores} cores when the machine only has {n_cores} cores, '
                             'reducing number of cores to {n_cores}'.format(r_cores=value, n_cores=QB_MAX_CORES))
                    value = QB_MAX_CORES
            spark_conf = spark_conf.set(key, value)
    return SparkContext.getOrCreate(spark_conf)
Example 2: run
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def run():
    from pyspark import SparkContext, SparkConf

    conf = SparkConf()
    conf.setAppName('dispel4py')
    conf.set("spark.storage.memoryFraction", "0.5")
    sc = SparkContext(conf=conf)

    from dispel4py.new import processor
    from dispel4py.utils import load_graph

    args = parse_args()

    graph = load_graph(args.module, args.attr)
    if graph is None:
        return
    graph.flatten()

    inputs = processor.create_inputs(args, graph)

    process(sc, graph, inputs=inputs, args=args)
Example 3: setUpClass
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def setUpClass(cls):
    # load sample WAT paths
    fh = open('tests/sample_wat.paths')
    cls.watPaths = fh.readlines()

    # initialize class
    cls.cclinks = CCLinks('CC-MAIN-2018-13', 5)
    cls.cclinks.output = 'tests/output/{}/parquet'.format(cls.cclinks.crawlIndex)

    # remove output directory
    if os.path.exists(cls.cclinks.output):
        shutil.rmtree('tests/output')

    # init pyspark
    conf = pyspark.SparkConf().setMaster('local[*]').setAppName('Test_ExtractCCLinks')
    cls.sc = pyspark.SparkContext.getOrCreate(conf=conf)
Example 4: create_sc
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_sc():
    sc_conf = SparkConf()
    sc_conf.setAppName("finance-similarity-app")
    sc_conf.setMaster('spark://10.21.208.21:7077')
    sc_conf.set('spark.executor.memory', '2g')
    sc_conf.set('spark.executor.cores', '4')
    sc_conf.set('spark.cores.max', '40')
    sc_conf.set('spark.logConf', True)
    print(sc_conf.getAll())

    # Attempt to stop any previous context before creating a new one; since sc is None
    # here, stop() raises and the except branch creates the context instead.
    sc = None
    try:
        sc.stop()
        sc = SparkContext(conf=sc_conf)
    except:
        sc = SparkContext(conf=sc_conf)

    return sc
Example 5: sparkSession
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def sparkSession(cls):
    if not hasattr(cls, "spark"):
        # We can't use the SparkSession Builder here, since we need to call
        # Scala side's SmvTestHive.createContext to create the HiveTestContext's
        # SparkSession.
        # So we need to
        #   * Create a java_gateway
        #   * Create a SparkConf using the jgw (since without it SparkContext will ignore the given conf)
        #   * Create python SparkContext using the SparkConf (so we can specify the warehouse.dir)
        #   * Create Scala side HiveTestContext SparkSession
        #   * Create python SparkSession
        jgw = launch_gateway(None)
        jvm = jgw.jvm
        import tempfile
        import getpass
        hivedir = "file://{0}/{1}/smv_hive_test".format(tempfile.gettempdir(), getpass.getuser())
        sConf = SparkConf(False, _jvm=jvm).set("spark.sql.test", "")\
                                          .set("spark.sql.hive.metastore.barrierPrefixes",
                                               "org.apache.spark.sql.hive.execution.PairSerDe")\
                                          .set("spark.sql.warehouse.dir", hivedir)\
                                          .set("spark.ui.enabled", "false")
        sc = SparkContext(master="local[1]", appName="SMV Python Test", conf=sConf, gateway=jgw).getOrCreate()
        jss = sc._jvm.org.apache.spark.sql.hive.test.SmvTestHive.createContext(sc._jsc.sc())
        cls.spark = SparkSession(sc, jss.sparkSession())
    return cls.spark
Example 6: create_spark_conf
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_spark_conf(**kwargs):
    """
    Configure a SparkConf for connecting to the Spark master node

    :param kwargs:
    :return:
    """
    spark_executor_memory = kwargs.get("spark_executor_memory", "2g")
    spark_driver_memory = kwargs.get("spark_driver_memory", "2g")
    url = kwargs.get("url", SPARK_ADDR)
    app = kwargs.get("app", 'pyFTS')

    conf = SparkConf()
    conf.setMaster(url)
    conf.setAppName(app)
    conf.set("spark.executor.memory", spark_executor_memory)
    conf.set("spark.driver.memory", spark_driver_memory)
    conf.set("spark.memory.offHeap.enabled", True)
    conf.set("spark.memory.offHeap.size", "16g")

    return conf
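As a point of reference, a conf built by a helper like this would typically be handed to a SparkContext. The sketch below is an illustrative assumption; the 'local[*]' master, app name, and memory value are not taken from the original project.
from pyspark import SparkContext

# Hypothetical usage of create_spark_conf(); all argument values here are illustrative.
conf = create_spark_conf(url='local[*]', app='pyFTS-demo', spark_executor_memory='4g')
sc = SparkContext(conf=conf)
# ... distribute work with sc ...
sc.stop()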
Example 7: __call__
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def __call__(self):
    c = SparkConf().setAppName('Build %s' % self.model_name)

    log.info('Using spark master: %s', c.get('spark.master'))
    sc = SparkContext(conf=c)

    kwargs = self.model.prepare(sc)
    m = self.model.build(**kwargs)
    m = self.model.format_items(m)
    m = self.formatter(m)

    if self.output_path:
        log.info("Saving to: %s", self.output_path)
        if os.path.isdir(self.output_path):
            log.warn('Writing over output path: %s', self.output_path)
            shutil.rmtree(self.output_path)
        m.saveAsTextFile(self.output_path, 'org.apache.hadoop.io.compress.GzipCodec')
    elif self.sample > 0:
        print('\n'.join(str(i) for i in m.take(self.sample)))

    log.info('Done.')
Example 8: main
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def main(date, aws_access_key_id, aws_secret_access_key, region, table, sample_rate):
    # Clobber the AWS access credentials
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key

    APP_NAME = "TaarDynamo"
    conf = SparkConf().setAppName(APP_NAME)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    date_obj = datetime.strptime(date, "%Y%m%d") - PATCH_DAYS

    reduction_output = run_etljob(
        spark,
        date_obj,
        region,
        table,
        sample_rate,
        aws_access_key_id,
        aws_secret_access_key,
    )
    pprint(reduction_output)
Example 9: initialize
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def initialize(cls, options_from_ini=None):
    if cls._instance:
        return cls._instance

    from pyspark import SparkConf

    cls._instance = SparkConf()
    cls.options = dict(cls.DEFAULTS)
    if options_from_ini:
        cls.options.update(cls._parse_config(options_from_ini))
    for k, v in cls.options.items():
        cls._instance.set(k, v)
    return cls._instance
Example 10: run
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def run(self):
    self.args = self.parse_arguments()

    conf = SparkConf()
    if self.args.spark_profiler:
        conf = conf.set("spark.python.profile", "true")

    sc = SparkContext(
        appName=self.name,
        conf=conf)
    sqlc = SQLContext(sparkContext=sc)

    self.init_accumulators(sc)

    self.run_job(sc, sqlc)

    if self.args.spark_profiler:
        sc.show_profiles()

    sc.stop()
Example 11: set_spark_defaults
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def set_spark_defaults(conf, name='spark-job'):
    """
    Update the configuration dictionary for setting up Spark, creating the
    dictionary if it does not exist yet
    """
    if not conf:
        conf = dict()

    home = os.path.join('/tmp', str(uuid.uuid4()))
    conf['SparkConfiguration'] = SparkConf()\
        .setMaster('yarn-client')\
        .setAppName(name)\
        .set("spark.sql.shuffle.partitions", "1000")\
        .set("spark.scheduler.revive.interval", "3")\
        .set("spark.task.maxFailures", "0")\
        .set("spark.executorEnv.HOME", home)

    return conf
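For orientation, one plausible way to consume the dictionary returned above is sketched here; the job_conf name and the SparkContext call are illustrative assumptions rather than part of the original helper.
from pyspark import SparkContext

# Hypothetical caller: build the defaults, then start a context from the stored SparkConf.
job_conf = set_spark_defaults({}, name='my-yarn-job')
sc = SparkContext(conf=job_conf['SparkConfiguration'])
# ... run the job ...
sc.stop()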
Example 12: create_task
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def create_task(words):
    conf = SparkConf().setAppName('letter count')
    sc = SparkContext(conf=conf)
    seq = words.split()
    data = sc.parallelize(seq)
    # `add` is expected to be operator.add, imported elsewhere in the module.
    counts = data.map(lambda word: (word, 1)).reduceByKey(add).collect()
    sc.stop()
    return dict(counts)
Example 13: parse_raw_wikidata
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def parse_raw_wikidata(output):
    spark_conf = SparkConf().setAppName('QB Wikidata').setMaster(QB_SPARK_MASTER)
    sc = SparkContext.getOrCreate(spark_conf)  # type: SparkContext

    wikidata = sc.textFile('s3a://entilzha-us-west-2/wikidata/wikidata-20170306-all.json')

    def parse_line(line):
        if len(line) == 0:
            return []
        if line[0] == '[' or line[0] == ']':
            return []
        elif line.endswith(','):
            return [json.loads(line[:-1])]
        else:
            return [json.loads(line)]

    parsed_wikidata = wikidata.flatMap(parse_line).cache()
    property_map = extract_property_map(parsed_wikidata)
    b_property_map = sc.broadcast(property_map)

    wikidata_items = parsed_wikidata.filter(lambda d: d['type'] == 'item').cache()
    parsed_wikidata.unpersist()
    item_page_map = extract_item_page_map(wikidata_items)
    b_item_page_map = sc.broadcast(item_page_map)

    parsed_item_map = extract_items(wikidata_items, b_property_map, b_item_page_map)

    with open(output, 'wb') as f:
        pickle.dump({
            'parsed_item_map': parsed_item_map,
            'item_page_map': item_page_map,
            'property_map': property_map
        }, f)

    sc.stop()
Example 14: work_spark
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def work_spark(args):
    conf = pyspark.SparkConf().setAppName("temp1").setMaster("local[*]").set("spark.driver.host", "localhost") \
        .set('spark.executor.memory', '6g')
    with pyspark.SparkContext("local[*]", "PySparkWordCount", conf=conf) as sc:
        (sc.parallelize(args.files)
           .flatMap(get_games)
           .flatMap(lambda game: game.mainline())
           # .sample(False, .1)
           .map(process)
           .mapPartitions(merge)
           .saveAsPickleFile('pikle.out')
        )
Example 15: __init__
# Required import: import pyspark [as alias]
# Or: from pyspark import SparkConf [as alias]
def __init__(self, processes: int = 8):
    self.spark_conf = SparkConf().setAppName("jmetalpy").setMaster(f"local[{processes}]")
    self.spark_context = SparkContext(conf=self.spark_conf)

    logger = self.spark_context._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)