This article compiles typical usage examples of pyspark.context.SparkContext in Python: what context.SparkContext does, how it is called, and where to find working code that uses it. For more background, see the pyspark.context module in which the class is defined.
The following presents 15 code examples of context.SparkContext, ordered by popularity.
Example 1: test_module_dependency
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def test_module_dependency(self):
"""Submit and test a script with a dependency on another module"""
script = self.createTempFile("test.py", """
|from pyspark import SparkContext
|from mylib import myfunc
|
|sc = SparkContext()
            |print(sc.parallelize([1, 2, 3]).map(myfunc).collect())
""")
zip = self.createFileInZip("mylib.py", """
|def myfunc(x):
| return x + 1
""")
proc = subprocess.Popen([self.sparkSubmit, "--py-files", zip, script],
stdout=subprocess.PIPE)
out, err = proc.communicate()
self.assertEqual(0, proc.returncode)
self.assertIn("[2, 3, 4]", out)
Example 2: test_module_dependency_on_cluster
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def test_module_dependency_on_cluster(self):
"""Submit and test a script with a dependency on another module on a cluster"""
script = self.createTempFile("test.py", """
|from pyspark import SparkContext
|from mylib import myfunc
|
|sc = SparkContext()
            |print(sc.parallelize([1, 2, 3]).map(myfunc).collect())
""")
zip = self.createFileInZip("mylib.py", """
|def myfunc(x):
| return x + 1
""")
proc = subprocess.Popen(
[self.sparkSubmit, "--py-files", zip, "--master", "local-cluster[1,1,512]", script],
stdout=subprocess.PIPE)
out, err = proc.communicate()
self.assertEqual(0, proc.returncode)
self.assertIn("[2, 3, 4]", out)
Example 3: setUp
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def setUp(self):
self.sc = SparkContext()
self.sqlContext = SQLContext(self.sc)
Example 4: _test_multiple_broadcasts
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def _test_multiple_broadcasts(self, *extra_confs):
"""
        Test that broadcast variables make it to the executors intact. Tests multiple
        broadcast variables, and also multiple jobs.
"""
conf = SparkConf()
for key, value in extra_confs:
conf.set(key, value)
conf.setMaster("local-cluster[2,1,1024]")
self.sc = SparkContext(conf=conf)
self._test_encryption_helper([5])
self._test_encryption_helper([5, 10, 20])
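For reference, a minimal sketch (not taken from the test suite above) of what these tests ultimately exercise: creating broadcast variables and reading them back from executors across more than one job. The configuration key is an assumption about what extra_confs would contain.

from pyspark import SparkConf, SparkContext

conf = SparkConf().set("spark.io.encryption.enabled", "true")  # assumed config key under test
sc = SparkContext(master="local[2]", appName="broadcastDemo", conf=conf)

small = sc.broadcast([1, 2, 3])
large = sc.broadcast(list(range(1000)))

# Two separate jobs, each reading a different broadcast variable on the executors.
assert sc.parallelize(range(3)).map(lambda i: small.value[i]).collect() == [1, 2, 3]
assert sc.parallelize(range(5)).map(lambda i: large.value[i]).sum() == 10
sc.stop()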
Example 5: getOrCreate
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def getOrCreate(cls, sc):
"""
        Get the existing SQLContext or create a new one with the given SparkContext.
:param sc: SparkContext
"""
if cls._instantiatedContext is None:
jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc())
sparkSession = SparkSession(sc, jsqlContext.sparkSession())
cls(sc, sparkSession, jsqlContext)
return cls._instantiatedContext
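A minimal usage sketch of the method above, assuming no SQLContext exists yet: repeated calls return the same cached instance.

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext("local[2]", "getOrCreateDemo")
ctx1 = SQLContext.getOrCreate(sc)
ctx2 = SQLContext.getOrCreate(sc)
assert ctx1 is ctx2      # the cached instance is reused on subsequent calls
sc.stop()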
Example 6: newSession
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def newSession(self):
"""
        Returns a new SQLContext as a new session, which has separate SQLConf,
        registered temporary views and UDFs, but shares the SparkContext and
        table cache.
"""
return self.__class__(self._sc, self.sparkSession.newSession())
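A short sketch of how that separation plays out in practice, assuming sc is an active SparkContext: temporary views registered in one session are not visible in a session created by newSession, even though both share the same SparkContext.

from pyspark.sql import SQLContext

sqlContext = SQLContext.getOrCreate(sc)
other = sqlContext.newSession()

df = sqlContext.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
df.createOrReplaceTempView("people")

sqlContext.sql("SELECT count(*) FROM people").show()   # works in the original session
# other.sql("SELECT count(*) FROM people")             # would fail: the view is session-scoped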
Example 7: _test
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def _test():
import os
    import doctest
    import sys
import tempfile
from pyspark.context import SparkContext
from pyspark.sql import Row, SQLContext
import pyspark.sql.context
os.chdir(os.environ["SPARK_HOME"])
globs = pyspark.sql.context.__dict__.copy()
sc = SparkContext('local[4]', 'PythonTest')
globs['tempfile'] = tempfile
globs['os'] = os
globs['sc'] = sc
globs['sqlContext'] = SQLContext(sc)
globs['rdd'] = rdd = sc.parallelize(
[Row(field1=1, field2="row1"),
Row(field1=2, field2="row2"),
Row(field1=3, field2="row3")]
)
globs['df'] = rdd.toDF()
jsonStrings = [
'{"field1": 1, "field2": "row1", "field3":{"field4":11}}',
'{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},'
'"field6":[{"field7": "row2"}]}',
'{"field1" : null, "field2": "row3", '
'"field3":{"field4":33, "field5": []}}'
]
globs['jsonStrings'] = jsonStrings
globs['json'] = sc.parallelize(jsonStrings)
(failure_count, test_count) = doctest.testmod(
pyspark.sql.context, globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
globs['sc'].stop()
if failure_count:
sys.exit(-1)
Example 8: _test
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def _test():
    import doctest
    import sys
import os
import tempfile
import py4j
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, Row
import pyspark.sql.readwriter
os.chdir(os.environ["SPARK_HOME"])
globs = pyspark.sql.readwriter.__dict__.copy()
sc = SparkContext('local[4]', 'PythonTest')
try:
spark = SparkSession.builder.getOrCreate()
except py4j.protocol.Py4JError:
spark = SparkSession(sc)
globs['tempfile'] = tempfile
globs['os'] = os
globs['sc'] = sc
globs['spark'] = spark
globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
(failure_count, test_count) = doctest.testmod(
pyspark.sql.readwriter, globs=globs,
optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
sc.stop()
if failure_count:
sys.exit(-1)
Example 9: _do_init
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def _do_init(self, *args, **kwargs):
# Modifies base _do_init to add a Java-Cassandra SparkContext (jcsc)
# to the instance
super(CassandraSparkContext, self)._do_init(*args, **kwargs)
java_import(self._jvm, "com.datastax.spark.connector.CassandraJavaUtil")
java_import(self._jvm, "com.datastax.spark.connector.RowConvertingIterator")
self._jcsc = self._jvm.CassandraJavaUtil.javaFunctions(self._jsc)
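The general pattern here, sketched with a plain JVM class rather than the Cassandra connector's API: java_import exposes a JVM class through the Py4J gateway so it can be reached by its simple name on sc._jvm, after which Python code can call into it (the example above does the same with CassandraJavaUtil and then hands it the wrapped JavaSparkContext, self._jsc).

from py4j.java_gateway import java_import
from pyspark import SparkContext

sc = SparkContext("local[2]", "javaImportDemo")
# Make a JVM class reachable by its simple name on the gateway's view (sc._jvm).
java_import(sc._jvm, "java.util.UUID")
print(sc._jvm.UUID.randomUUID().toString())
sc.stop()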
Example 10: registerFunction
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def registerFunction(self, ssc, jsession, function_name, params):
jvm = self.gateway.jvm
        # If we don't have a reference to a running SparkContext,
        # get the SparkContext from the provided SparkSession.
if not self._sc:
master = ssc.master()
jsc = jvm.org.apache.spark.api.java.JavaSparkContext(ssc)
jsparkConf = ssc.conf()
sparkConf = SparkConf(_jconf=jsparkConf)
self._sc = SparkContext(
master=master,
conf=sparkConf,
gateway=self.gateway,
jsc=jsc)
self._session = SparkSession.builder.getOrCreate()
if function_name in functions_info:
function_info = functions_info[function_name]
if params:
evaledParams = ast.literal_eval(params)
else:
evaledParams = []
func = function_info.func(*evaledParams)
ret_type = function_info.returnType()
self._count = self._count + 1
registration_name = function_name + str(self._count)
udf = UserDefinedFunction(func, ret_type, registration_name)
            # Used to allow non-default (e.g. Arrow) UDFs
udf.evalType = function_info.evalType()
judf = udf._judf
return judf
else:
print("Could not find function")
            # We return None rather than raising an exception, since Py4J debugging
            # is rough and the caller can simply check for None.
return None
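For comparison, the public PySpark route to the same end result, registering a Python function as a SQL UDF, is a one-liner on the session. This is a sketch that assumes an active SparkSession named spark; the UDF name and logic are made up for illustration.

from pyspark.sql.types import IntegerType

spark.udf.register("plus_one", lambda x: x + 1, IntegerType())
spark.sql("SELECT plus_one(41) AS answer").show()   # prints a one-row table with answer = 42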
Example 11: accumulate_covariance_estimators
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def accumulate_covariance_estimators(sc, data, model):
"""
    Analogous to the function of the same name in lopq.model.
:param SparkContext sc:
a SparkContext
:param RDD data:
an RDD of numpy arrays
:param KMeansModel model:
a KMeansModel instance for which to fit local rotations
"""
def get_residual(x):
cluster = model.predict(x)
centroid = model.clusterCenters[cluster]
residual = x - centroid
return (cluster, residual)
def seq_op(acc, x):
acc += np.outer(x, x)
return acc
# Compute (assignment, residual) k/v pairs
residuals = data.map(get_residual)
residuals.cache()
# Collect counts and mean residuals
count = residuals.countByKey()
mu = residuals.reduceByKey(add).collectAsMap()
# Extract the dimension of the data
    D = len(next(iter(mu.values())))
# Collect accumulated outer products
A = residuals.aggregateByKey(np.zeros((D, D)), seq_op, add).collectAsMap()
residuals.unpersist()
return A, mu, count
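A hedged usage sketch of the function above. It assumes an active SparkContext sc, an RDD of numpy vectors, and a pyspark.mllib KMeansModel, matching the docstring; the data and parameter values are made up for illustration.

import numpy as np
from pyspark.mllib.clustering import KMeans

vectors = sc.parallelize([np.random.rand(8) for _ in range(1000)])
model = KMeans.train(vectors, k=4, seed=42)

A, mu, count = accumulate_covariance_estimators(sc, vectors, model)
# A[c]     -- accumulated outer products of residuals for cluster c (D x D)
# mu[c]    -- summed residuals for cluster c
# count[c] -- number of points assigned to cluster c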
Example 12: compute_local_rotations
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def compute_local_rotations(sc, data, model, num_buckets):
"""
Analogous to the function of the same name in lopq.model.
:param SparkContext sc:
a SparkContext
:param RDD data:
an RDD of numpy arrays
:param KMeansModel model:
a KMeansModel instance for which to fit local rotations
:param int num_buckets:
the number of subvectors over which to balance residual variance
"""
# Get estimators
A, mu, count = accumulate_covariance_estimators(sc, data, model)
# Format as ndarrays
V = len(model.centers)
A = dict_to_ndarray(A, V)
mu = dict_to_ndarray(mu, V)
count = dict_to_ndarray(count, V)
# Compute params
R, mu = compute_rotations_from_accumulators(A, mu, count, num_buckets)
return R, mu, count
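Continuing the sketch from the previous example, the per-cluster rotations would then be fit as follows; the num_buckets value is made up for illustration.

R, mu, counts = compute_local_rotations(sc, vectors, model, num_buckets=4)
# R[c] is the rotation applied to residuals of cluster c before subquantization.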
Example 13: __init__
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def __init__(self, spark_context, sql_ctx=None):
"""Initialize a PSparkContext with the associacted spark context,
and Spark SQL context if provided. This context is usef to load
data into L{DataFrame}s.
Parameters
----------
spark_context: SparkContext
Initialized and configured spark context. If you are running in the
PySpark shell, this is already created as "sc".
sql_ctx: SQLContext, optional
            Initialized and configured SQL context; if not provided, Sparkling
            Pandas will create one.
Returns
-------
Correctly initialized SparklingPandasContext.
"""
self.spark_ctx = spark_context
if sql_ctx:
self.sql_ctx = sql_ctx
else:
logging.info("No sql context provided, creating")
from pyspark.sql import SQLContext
self.sql_ctx = SQLContext(self.spark_ctx)
# Register our magical functions
register_sql_extensions(self.sql_ctx)
Example 14: simple
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def simple(cls, *args, **kwargs):
"""Takes the same arguments as SparkContext and constructs a
PSparkContext"""
return PSparkContext(SparkContext(*args, **kwargs))
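A minimal usage sketch (the master and application name are illustrative):

psc = PSparkContext.simple("local[2]", "SparklingPandasDemo")
# Equivalent to wrapping an explicitly created SparkContext:
# psc = PSparkContext(SparkContext("local[2]", "SparklingPandasDemo"))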
Example 15: stop
# Required import: from pyspark import context [as alias]
# Or: from pyspark.context import SparkContext [as alias]
def stop(self):
"""Stop the underlying SparkContext
"""
self.spark_ctx.stop()