This article collects and summarizes typical usage examples of the Python method pyspark.SparkContext.getOrCreate. If you are wondering what exactly SparkContext.getOrCreate does, how to call it, and what real-world uses look like, the curated example code below may help. You can also read further about its enclosing class, pyspark.SparkContext.
The sections below present 15 code examples of SparkContext.getOrCreate, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
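Before the individual examples, here is a minimal, self-contained sketch of the basic pattern. It is for illustration only; the application name and configuration values are placeholders and do not come from any of the examples below.

from pyspark import SparkConf, SparkContext

# Every setting here is illustrative. getOrCreate() returns the already-running
# SparkContext if one exists; otherwise it creates a new one from this configuration.
conf = SparkConf().setAppName("example-app").set("spark.rpc.message.maxSize", 300)
sc = SparkContext.getOrCreate(conf)

rdd = sc.parallelize(range(10), 2)
print(rdd.sum())  # 45

Note that when a SparkContext is already active, the configuration passed to getOrCreate() is ignored and the existing context is returned unchanged.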
Example 1: create_spark_context
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def create_spark_context(app_name="Quiz Bowl", configs=None) -> SparkContext:
    if QB_SPARK_MASTER != "":
        log.info("Spark master is %s" % QB_SPARK_MASTER)
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)\
            .setMaster(QB_SPARK_MASTER)
    else:
        spark_conf = SparkConf()\
            .set('spark.rpc.message.maxSize', 300)\
            .setAppName(app_name)

    if configs is not None:
        for key, value in configs:
            if key in ('spark.executor.cores', 'spark.max.cores'):
                if value > QB_MAX_CORES:
                    log.info('Requested {r_cores} cores when the machine only has {n_cores} cores, '
                             'reducing number of cores to {n_cores}'.format(r_cores=value, n_cores=QB_MAX_CORES))
                    value = QB_MAX_CORES
            spark_conf = spark_conf.set(key, value)
    return SparkContext.getOrCreate(spark_conf)
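QB_SPARK_MASTER, QB_MAX_CORES, and log come from the surrounding project module and are not defined in this snippet. Since configs is iterated as (key, value) pairs, a hypothetical call (for illustration only, not part of the original code) could look like:

sc = create_spark_context(app_name='Quiz Bowl', configs=[('spark.executor.cores', 8)])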
Example 2: parallelize
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def parallelize(self,
                data: Iterable,
                name,
                namespace,
                partition,
                include_key,
                persistent,
                chunk_size,
                in_place_computing,
                create_if_missing,
                error_if_exist):
    # when include_key is False, generate sequential keys by enumerating the data
    _iter = data if include_key else enumerate(data)
    from pyspark import SparkContext
    rdd = SparkContext.getOrCreate().parallelize(_iter, partition)
    rdd = util.materialize(rdd)
    if namespace is None:
        namespace = self._session_id
    return RDDTable.from_rdd(rdd=rdd, job_id=self._session_id, namespace=namespace, name=name)
Example 3: test_lf_applier_spark_preprocessor_memoized
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp_memoized])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
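Here memoize=True asks Snorkel to cache the preprocessor's output per data point, so square_memoize is not recomputed when several labeling functions depend on it. The names f, DATA, and L_PREPROCESS_EXPECTED are test fixtures defined elsewhere in the test module.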
Example 4: _load_pyfunc
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def _load_pyfunc(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``spark`` flavor.
    """
    # NOTE: The getOrCreate() call below may change settings of the active session which we do not
    # intend to do here. In particular, setting master to local[1] can break distributed clusters.
    # To avoid this problem, we explicitly check for an active session. This is not ideal but there
    # is no good workaround at the moment.
    import pyspark

    spark = pyspark.sql.SparkSession._instantiatedSession
    if spark is None:
        spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
            .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path))
Example 5: parse_raw_wikidata
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def parse_raw_wikidata(output):
    spark_conf = SparkConf().setAppName('QB Wikidata').setMaster(QB_SPARK_MASTER)
    sc = SparkContext.getOrCreate(spark_conf)  # type: SparkContext

    wikidata = sc.textFile('s3a://entilzha-us-west-2/wikidata/wikidata-20170306-all.json')

    def parse_line(line):
        if len(line) == 0:
            return []
        if line[0] == '[' or line[0] == ']':
            return []
        elif line.endswith(','):
            return [json.loads(line[:-1])]
        else:
            return [json.loads(line)]

    parsed_wikidata = wikidata.flatMap(parse_line).cache()
    property_map = extract_property_map(parsed_wikidata)
    b_property_map = sc.broadcast(property_map)

    wikidata_items = parsed_wikidata.filter(lambda d: d['type'] == 'item').cache()
    parsed_wikidata.unpersist()
    item_page_map = extract_item_page_map(wikidata_items)
    b_item_page_map = sc.broadcast(item_page_map)

    parsed_item_map = extract_items(wikidata_items, b_property_map, b_item_page_map)

    with open(output, 'wb') as f:
        pickle.dump({
            'parsed_item_map': parsed_item_map,
            'item_page_map': item_page_map,
            'property_map': property_map
        }, f)

    sc.stop()
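The parse_line helper reflects the layout of the raw Wikidata JSON dump: the file is one large JSON array with a single entity per line, so the opening '[' and closing ']' lines are skipped and any trailing comma is stripped before each line is parsed as its own JSON object.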
Example 6: create_spark_session
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def create_spark_session(app_name='Quiz Bowl', configs=None) -> SparkSession:
    create_spark_context(app_name=app_name, configs=configs)
    return SparkSession.builder.getOrCreate()
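Because SparkSession.builder.getOrCreate() reuses the active SparkContext when one exists, the session returned here is backed by the context that create_spark_context (Example 1) just created or retrieved, along with its configuration.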
Example 7: _getScaleHintList
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def _getScaleHintList():
    featurizer = SparkContext.getOrCreate()._jvm.com.databricks.sparkdl.DeepImageFeaturizer
    if isinstance(featurizer, py4j.java_gateway.JavaPackage):
        # cannot see DeepImageFeaturizer, possibly running without Spark;
        # instead of failing, return an empty list
        return []
    return dict(featurizer.scaleHintsJava()).keys()
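If the sparkdl JVM package is not on the classpath, the attribute lookup through _jvm resolves to a py4j JavaPackage placeholder rather than a Java class, which is what the isinstance check detects before falling back to an empty list.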
Example 8: readImagesWithCustomFn
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def readImagesWithCustomFn(path, decode_f, numPartition=None):
    """
    Read a directory of images (or a single image) into a DataFrame using a custom library to
    decode the images.

    :param path: str, file path.
    :param decode_f: function to decode the raw bytes into an array compatible with one of the
        supported OpenCV modes. See @imageIO.PIL_decode for an example.
    :param numPartition: [optional] int, number of partitions to use for reading files.
    :return: DataFrame with schema == ImageSchema.imageSchema.
    """
    warnings.warn("readImagesWithCustomFn() will be removed in the next release of sparkdl. "
                  "Please use pillow and Pandas UDF instead.", DeprecationWarning)
    return _readImagesWithCustomFn(path, decode_f, numPartition, sc=SparkContext.getOrCreate())
Example 9: test_start_sentry_listener
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def test_start_sentry_listener():
    spark_context = SparkContext.getOrCreate()
    gateway = spark_context._gateway

    # registering the listener should start the gateway's Py4J callback server
    assert gateway._callback_server is None
    _start_sentry_listener(spark_context)
    assert gateway._callback_server is not None
Example 10: _rdd_from_dtable
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def _rdd_from_dtable(self):
    storage_iterator = self._dtable.get_all()
    if self._dtable.count() <= 0:
        storage_iterator = []

    num_partition = self._dtable.get_partitions()
    from pyspark import SparkContext
    # build an RDD from the dtable's storage and persist it at the configured storage level
    self._rdd = SparkContext.getOrCreate() \
        .parallelize(storage_iterator, num_partition) \
        .persist(util.get_storage_level())
    return self._rdd
Example 11: broadcast_eggroll_session
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def broadcast_eggroll_session(work_mode, eggroll_session):
    import pickle
    pickled_client = pickle.dumps((work_mode.value, eggroll_session)).hex()
    from pyspark import SparkContext
    SparkContext.getOrCreate().setLocalProperty(_EGGROLL_CLIENT, pickled_client)


# noinspection PyProtectedMember,PyUnresolvedReferences
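The hex-encoded pickle is stored as a Spark local property; local properties set on the driver thread are propagated to tasks submitted from that thread (readable via TaskContext.getLocalProperty), which is presumably how the EggRoll session information reaches the workers.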
Example 12: _rdd_from_dtable
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def _rdd_from_dtable(self):
    storage_iterator = self._dtable.collect(use_serialize=True)
    if self._dtable.count() <= 0:
        storage_iterator = []

    num_partition = self._dtable._partitions
    from pyspark import SparkContext
    self._rdd = SparkContext.getOrCreate() \
        .parallelize(storage_iterator, num_partition) \
        .persist(util.get_storage_level())
    return self._rdd
Example 13: test_lf_applier_spark
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def test_lf_applier_spark(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, g])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_EXPECTED)
Example 14: test_lf_applier_spark_fault
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def test_lf_applier_spark_fault(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, f_bad])
    with self.assertRaises(Exception):
        applier.apply(rdd)
    L = applier.apply(rdd, fault_tolerant=True)
    np.testing.assert_equal(L, L_EXPECTED_BAD)
Example 15: test_lf_applier_spark_preprocessor
# Required import: from pyspark import SparkContext [as alias]
# Or: from pyspark.SparkContext import getOrCreate [as alias]
def test_lf_applier_spark_preprocessor(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)
    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd

    applier = SparkLFApplier([f, fp])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)