本文整理汇总了Python中pyspark.context.SparkContext.stop方法的典型用法代码示例。如果您正苦于以下问题:Python SparkContext.stop方法的具体用法?Python SparkContext.stop怎么用?Python SparkContext.stop使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.context.SparkContext
的用法示例。
在下文中一共展示了SparkContext.stop方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _test
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
def _test():
    """Run the doctests in pyspark.sql.readwriter against a local SparkContext.

    Exits the process with a non-zero status if any doctest fails.
    """
    import doctest
    import os
    import sys
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        try:
            # Prefer a Hive-enabled session; fall back to a plain session
            # when Hive support is not available on the classpath.
            spark = SparkSession.builder.enableHiveSupport().getOrCreate()
        except py4j.protocol.Py4JError:
            spark = SparkSession(sc)
        globs['tempfile'] = tempfile
        globs['os'] = os
        globs['sc'] = sc
        globs['spark'] = spark
        globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
        # NOTE(review): .stream() is the pre-2.0 structured-streaming API;
        # confirm the targeted Spark version before modernizing this call.
        globs['sdf'] = \
            spark.read.format('text').stream('python/test_support/sql/streaming')
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.readwriter, globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    finally:
        # Always release the context, even when session setup or a doctest raises.
        sc.stop()
    if failure_count:
        sys.exit(-1)  # exit() relies on the site module; sys.exit() does not
示例2: SparkTestingBaseTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class SparkTestingBaseTestCase(unittest2.TestCase):
    """Common Spark test fixture exposing a SparkContext as ``self.sc``.

    The master URL defaults to ``local[4]``; override :meth:`getMaster`
    or set the ``SPARK_MASTER`` environment variable for non-local runs.
    """

    @classmethod
    def getMaster(cls):
        """Return the Spark master URL to connect to."""
        return os.environ.get('SPARK_MASTER', "local[4]")

    def setUp(self):
        """Create the SparkContext and HiveContext used by the tests."""
        self.sc = SparkContext(self.getMaster())
        self.sql_context = HiveContext(self.sc)
        quiet_py4j()

    def tearDown(self):
        """Stop the running context and clear the driver port property."""
        self.sc.stop()
        # Akka does not unbind immediately on shutdown, so drop the port
        # property to keep the next context from rebinding to the same port.
        self.sc._jvm.System.clearProperty("spark.driver.port")
示例3: _test
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
def _test():
    """Run the pyspark.sql.readwriter doctests on a local SparkContext.

    Exits with a non-zero status when any doctest fails.
    """
    import doctest
    import os
    import sys
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext("local[4]", "PythonTest")
    try:
        try:
            # Hive support may be missing from the classpath; fall back to
            # a bare SparkSession in that case.
            spark = SparkSession.builder.enableHiveSupport().getOrCreate()
        except py4j.protocol.Py4JError:
            spark = SparkSession(sc)
        globs["tempfile"] = tempfile
        globs["os"] = os
        globs["sc"] = sc
        globs["spark"] = spark
        globs["df"] = spark.read.parquet("python/test_support/sql/parquet_partitioned")
        (failure_count, test_count) = doctest.testmod(
            pyspark.sql.readwriter,
            globs=globs,
            optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
        )
    finally:
        # Release the JVM context even if a doctest raises.
        sc.stop()
    if failure_count:
        sys.exit(-1)  # sys.exit works even when the site module is absent
示例4: PyVertexRDDTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class PyVertexRDDTestCase(unittest.TestCase):
    """Exercises VertexRDD operations: collect, take, count, mapValues,
    diff, filter, mapVertexPartitions, innerJoin and leftJoin.
    """

    def setUp(self):
        # A single partition keeps the tiny fixtures deterministic.
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=self.__class__.__name__, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    def collect(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        actual = rdd.take(1)
        self.assertEqual(actual, [(3, ("rxin", "student"))])

    def take(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        actual = rdd.collect()
        self.assertEqual(actual, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    def count(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        self.assertEqual(rdd.count(), 2)

    def mapValues(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        # NOTE(review): compares the mapValues result object directly to a
        # list, exactly as the original example does.
        mapped = rdd.mapValues(lambda x: x + ":" + x)
        self.assertEqual(mapped, [(3, ("rxin:rxin", "student:student")),
                                  (7, ("jgonzal:jgonzal", "postdoc:postdoc"))])

    def innerJoin(self):
        left = VertexRDD(self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]))
        right = VertexRDD(self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]))
        self.assertEqual(left.innerJoin(right).collect(), [])

    def leftJoin(self):
        left = VertexRDD(self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]))
        right = VertexRDD(self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]))
        self.assertEqual(left.diff(right), 2)
示例5: PySparkTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class PySparkTestCase(unittest.TestCase):
    """Test fixture providing a local four-core SparkContext as ``self.sc``."""

    def setUp(self):
        # Remember sys.path so tests that extend it cannot leak entries.
        self._old_sys_path = sys.path[:]
        self.sc = SparkContext('local[4]', self.__class__.__name__, batchSize=2)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
示例6: PyEdgeRDDTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class PyEdgeRDDTestCase(unittest.TestCase):
    """Exercises EdgeRDD operations: collect, take, count, mapValues,
    filter and innerJoin.
    """

    def setUp(self):
        # A single partition keeps the tiny fixtures deterministic.
        conf = SparkConf().set("spark.default.parallelism", 1)
        self.sc = SparkContext(appName=self.__class__.__name__, conf=conf)
        self.sc.setCheckpointDir("/tmp")

    def tearDown(self):
        self.sc.stop()

    # TODO: still exercises VertexRDD; port to EdgeRDD.
    def collect(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        self.assertEqual(rdd.collect(),
                         [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def take(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        self.assertEqual(rdd.collect(),
                         [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])

    # TODO
    def count(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        self.assertEqual(rdd.collect(), 2)

    # TODO
    def mapValues(self):
        data = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))])
        rdd = VertexRDD(data)
        self.assertEqual(rdd.collect(), 2)

    # TODO
    def filter(self):
        return

    # TODO
    def innerJoin(self):
        left = VertexRDD(self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]))
        right = VertexRDD(self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]))
        self.assertEqual(left.diff(right), 2)
示例7: PySparkTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class PySparkTestCase(unittest.TestCase):
    """Fixture supplying a local[4] SparkContext and restoring sys.path."""

    def setUp(self):
        self._old_sys_path = sys.path[:]
        self.sc = SparkContext("local[4]", self.__class__.__name__, batchSize=2)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
        # Akka does not unbind immediately on shutdown; clear the port so
        # the next context does not try to rebind to it.
        self.sc._jvm.System.clearProperty("spark.driver.port")
示例8: PySparkTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class PySparkTestCase(unittest.TestCase):
    """Fixture creating a quiet single-core local SparkContext."""

    def setUp(self):
        self.sc = SparkContext('local', self.__class__.__name__)
        # Silence the console progress bar and raise the log threshold so
        # test output stays readable.
        self.sc._jvm.System.setProperty("spark.ui.showConsoleProgress", "false")
        log4j = self.sc._jvm.org.apache.log4j
        log4j.LogManager.getRootLogger().setLevel(log4j.Level.FATAL)

    def tearDown(self):
        self.sc.stop()
        # Akka does not unbind immediately on shutdown; clear the port to
        # avoid a rebind collision in the next test.
        self.sc._jvm.System.clearProperty("spark.driver.port")
示例9: PySparkTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class PySparkTestCase(unittest.TestCase):
def setUp(self):
class_name = self.__class__.__name__
self.sc = SparkContext('local', class_name)
def tearDown(self):
self.sc.stop()
def test_should_be_able_to_word_count(self):
rdd = self.sc.parallelize(["This is a text", "Another text", "More text", "a text"])
result = python_word_count.wordcount(rdd)
expected = [('a', 2), ('This', 1), ('text', 4), ('is', 1), ('Another', 1), ('More', 1)]
self.assertEquals(expected, result.collect())
示例10: SparkTestingBaseTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class SparkTestingBaseTestCase(unittest2.TestCase):
    """Basic common test case for Spark. Provides a Spark context as sc.
    For non-local mode testing you can either override sparkMaster
    or set the environment property SPARK_MASTER for non-local mode testing."""

    @classmethod
    def getMaster(cls):
        # Master URL for the test context; defaults to 4-core local mode.
        return os.getenv('SPARK_MASTER', "local[4]")

    def setUp(self):
        """Setup a basic Spark context for testing"""
        self.sc = SparkContext(self.getMaster())
        quiet_py4j()

    def tearDown(self):
        """
        Tear down the basic panda spark test case. This stops the running
        context and does a hack to prevent Akka rebinding on the same port.
        """
        self.sc.stop()
        # To avoid Akka rebinding to the same port, since it doesn't unbind
        # immediately on shutdown
        self.sc._jvm.System.clearProperty("spark.driver.port")

    def assertRDDEquals(self, expected, result):
        # NOTE(review): despite the name, this RETURNS a boolean instead of
        # asserting; callers must assert on the result -- confirm intended.
        return self.compareRDD(expected, result) == []

    def compareRDD(self, expected, result):
        # Count occurrences of each element on both sides, cogroup by
        # element, and keep (up to) the first element whose count lists
        # differ; an empty list therefore means "equal as multisets".
        expectedKeyed = expected.map(lambda x: (x, 1))\
            .reduceByKey(lambda x, y: x + y)
        resultKeyed = result.map(lambda x: (x, 1))\
            .reduceByKey(lambda x, y: x + y)
        return expectedKeyed.cogroup(resultKeyed)\
            .map(lambda x: tuple(map(list, x[1])))\
            .filter(lambda x: x[0] != x[1]).take(1)

    def assertRDDEqualsWithOrder(self, expected, result):
        # NOTE(review): boolean result, same caveat as assertRDDEquals.
        return self.compareRDDWithOrder(expected, result) == []

    def compareRDDWithOrder(self, expected, result):
        # Pair every element with its position so that ordering differences
        # surface as differing values under the same index key after cogroup.
        def indexRDD(rdd):
            return rdd.zipWithIndex().map(lambda x: (x[1], x[0]))
        indexExpected = indexRDD(expected)
        indexResult = indexRDD(result)
        return indexExpected.cogroup(indexResult)\
            .map(lambda x: tuple(map(list, x[1])))\
            .filter(lambda x: x[0] != x[1]).take(1)
示例11: SparkTestCase
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class SparkTestCase(unittest.TestCase):
    """Base fixture for ADAM tests: a local SparkContext plus file helpers."""

    def resourceFile(self, filename, module='adam-core'):
        """Return the path of test resource *filename* inside *module*."""
        adamRoot = os.path.dirname(os.getcwd())
        return os.path.join(os.path.join(adamRoot,
                                         "%s/src/test/resources" % module),
                            filename)

    def tmpFile(self):
        """Return the name of a fresh temporary file (created, then removed)."""
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        return tempFile.name

    def checkFiles(self, file1, file2):
        """Assert that *file1* and *file2* have identical contents."""
        # 'with' closes both handles even if the assertion fails (the
        # original used manual open/close; assertEquals is also a
        # deprecated alias removed in Python 3.12).
        with open(file1) as f1, open(file2) as f2:
            self.assertEqual(f1.read(), f2.read())

    def setUp(self):
        # Remember sys.path so tests that extend it cannot leak entries.
        self._old_sys_path = list(sys.path)
        class_name = self.__class__.__name__
        self.sc = SparkContext('local[4]', class_name)

    def tearDown(self):
        self.sc.stop()
        sys.path = self._old_sys_path
示例12: predictQuantiles
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
def predictQuantiles(self, features):
    """
    Predicted quantiles for the given features.

    Delegates to the underlying Java model's ``predictQuantiles`` method.
    """
    return self._call_java("predictQuantiles", features)
def predict(self, features):
    """
    Predicted value for the given features.

    Delegates to the underlying Java model's ``predict`` method.
    """
    return self._call_java("predict", features)
if __name__ == "__main__":
    # Run this module's doctests against a small local SparkContext.
    import doctest
    import sys
    import pyspark.ml.regression
    from pyspark.context import SparkContext
    from pyspark.sql import SQLContext

    globs = pyspark.ml.regression.__dict__.copy()
    # The small batch size here ensures that we see multiple batches,
    # even in these small test examples:
    sc = SparkContext("local[2]", "ml.regression tests")
    try:
        sqlContext = SQLContext(sc)
        globs['sc'] = sc
        globs['sqlContext'] = sqlContext
        (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS)
    finally:
        # Release the JVM context even when a doctest raises.
        sc.stop()
    if failure_count:
        sys.exit(-1)  # exit() needs the site module; sys.exit() does not
示例13: StreamingContext
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class StreamingContext(object):
    """
    Main entry point for Spark Streaming functionality. A StreamingContext
    represents the connection to a Spark cluster, and can be used to create
    L{DStream}s and broadcast variables on that cluster.
    """
def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
             environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
             gateway=None, sparkContext=None, duration=None):
    """
    Create a new StreamingContext. At least the master and app name and duration
    should be set, either through the named parameters here or through C{conf}.

    @param master: Cluster URL to connect to
           (e.g. mesos://host:port, spark://host:port, local[4]).
    @param appName: A name for your job, to display on the cluster web UI.
    @param sparkHome: Location where Spark is installed on cluster nodes.
    @param pyFiles: Collection of .zip or .py files to send to the cluster
           and add to PYTHONPATH. These can be paths on the local file
           system or HDFS, HTTP, HTTPS, or FTP URLs.
    @param environment: A dictionary of environment variables to set on
           worker nodes.
    @param batchSize: The number of Python objects represented as a single
           Java object. Set 1 to disable batching or -1 to use an
           unlimited batch size.
    @param serializer: The serializer for RDDs.
    @param conf: A L{SparkConf} object setting Spark properties.
    @param gateway: Use an existing gateway and JVM, otherwise a new JVM
           will be instantiated.
    @param sparkContext: L{SparkContext} object.
    @param duration: A L{Duration} object for SparkStreaming.
    """
    # NOTE(review): the PickleSerializer() default is evaluated once at
    # definition time and shared across instances -- confirm intended.
    # Fail fast: a batch duration is mandatory.
    if not isinstance(duration, Duration):
        raise TypeError("Input should be pyspark.streaming.duration.Duration object")

    if sparkContext is None:
        # Create the Python SparkContext from the given parameters.
        self._sc = SparkContext(master=master, appName=appName, sparkHome=sparkHome,
                                pyFiles=pyFiles, environment=environment, batchSize=batchSize,
                                serializer=serializer, conf=conf, gateway=gateway)
    else:
        self._sc = sparkContext

    # Start the py4j callback server. The callback server is needed only by
    # Spark Streaming, so it is started here in StreamingContext.
    SparkContext._gateway.restart_callback_server()
    self._set_clean_up_handler()
    self._jvm = self._sc._jvm
    self._jssc = self._initialize_context(self._sc._jsc, duration._jduration)
# StreamingContext is initialized in a separate method to allow
# subclass-specific initialization.
def _initialize_context(self, jspark_context, jduration):
    # Build the Java-side JavaStreamingContext backing this context.
    return self._jvm.JavaStreamingContext(jspark_context, jduration)
def _set_clean_up_handler(self):
    """Register cleanup (via atexit and signal handlers) that shuts down
    the shared py4j gateway."""
    def clean_up_handler(*args):
        # Accept the (signum, frame) arguments the signal module passes to
        # handlers; the original zero-argument version raised TypeError
        # whenever SIGINT/SIGTERM actually fired. atexit calls it with none.
        SparkContext._gateway.shutdown()

    atexit.register(clean_up_handler)
    # atexit is not invoked when the program is killed by a signal that is
    # not handled by Python, so also hook SIGINT and SIGTERM directly.
    for sig in (SIGINT, SIGTERM):
        signal(sig, clean_up_handler)
@property
def sparkContext(self):
    """
    Return the SparkContext which is associated with this StreamingContext.
    """
    return self._sc
def start(self):
    """
    Start the execution of the streams.
    """
    # Delegates to the Java-side JavaStreamingContext.
    self._jssc.start()
def awaitTermination(self, timeout=None):
    """
    Block until the streaming execution stops.

    @param timeout: optional time to wait in milliseconds; when omitted,
                    wait indefinitely.
    """
    if timeout is not None:
        self._jssc.awaitTermination(timeout)
    else:
        self._jssc.awaitTermination()
def remember(self, duration):
"""
Set each DStreams in this context to remember RDDs it generated in the last given duration.
DStreams remember RDDs only for a limited duration of time and releases them for garbage
collection. This method allows the developer to specify how to long to remember the RDDs (
if the developer wishes to query old data outside the DStream computation).
@param duration pyspark.streaming.duration.Duration object.
#.........这里部分代码省略.........
示例14: ui_get_available_port
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
# Build the Spark configuration for a Mesos/Docker-backed IPython shell.
conf = (SparkConf()
        .setMaster(os.environ["SPARK_MASTER"]))
# set the UI port
# NOTE(review): ui_get_available_port() is defined elsewhere; Spark expects
# the port value as a string -- confirm the helper returns one.
conf.set("spark.ui.port", ui_get_available_port())
# configure docker containers as executors
conf.setSparkHome(os.environ.get("SPARK_HOME"))
conf.set("spark.mesos.executor.docker.image", "lab41/spark-mesos-dockerworker-ipython")
conf.set("spark.mesos.executor.home", "/usr/local/spark-1.4.1-bin-hadoop2.4")
conf.set("spark.executorEnv.MESOS_NATIVE_LIBRARY", "/usr/local/lib/libmesos.so")
conf.set("spark.network.timeout", "100")
# establish config-based context
sc = SparkContext(appName="DockerIPythonShell", pyFiles=add_files, conf=conf)
# Ensure the context is stopped when the interpreter exits.
atexit.register(lambda: sc.stop())
try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    sqlCtx = sqlContext = HiveContext(sc)
except py4j.protocol.Py4JError:
    sqlCtx = sqlContext = SQLContext(sc)
# Print the shell banner with the running Spark version.
print("""Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/__ / .__/\_,_/_/ /_/\_\ version %s
/_/
""" % sc.version)
示例15: MrGeo
# 需要导入模块: from pyspark.context import SparkContext [as 别名]
# 或者: from pyspark.context.SparkContext import stop [as 别名]
class MrGeo(object):
    """Python entry point for MrGeo map algebra.

    Maps map-algebra operator tokens onto the Python special methods that
    implement them on generated map-op classes.
    """

    # Operator token -> Python special methods implementing it. Tokens with
    # an empty list have no direct Python equivalent (or, like "=", cannot
    # be overloaded at all).
    operators = {"+": ["__add__", "__radd__", "__iadd__"],
                 "-": ["__sub__", "__rsub__", "__isub__"],
                 "*": ["__mul__", "__rmul__", "__imul__"],
                 "/": ["__div__", "__truediv__", "__rdiv__", "__rtruediv__", "__idiv__", "__itruediv__"],
                 "//": [],  # floor div
                 "**": ["__pow__", "__rpow__", "__ipow__"],  # pow
                 "=": [],  # assignment, can't do!
                 "<": ["__lt__"],
                 "<=": ["__le__"],
                 ">": ["__gt__"],  # fixed: was "__lt__" (copy/paste bug)
                 ">=": ["__ge__"],
                 "==": ["__eq__"],
                 "!=": ["__ne__"],
                 "<>": [],
                 "!": [],
                 "&&": ["__and__", "__rand__", "__iand__"],
                 "&": [],
                 "||": ["__or__", "__ror__", "__ior__"],
                 "|": [],
                 "^": ["__xor__", "__rxor__", "__ixor__"],
                 "^=": []}
    # Names that would clash with Python keywords/builtins when generated.
    reserved = ["or", "and", "str", "int", "long", "float", "bool"]

    # Shared py4j gateway state; mutations are guarded by `lock`.
    gateway = None
    lock = Lock()
    sparkPyContext = None
    sparkContext = None
    job = None
def __init__(self, gateway=None):
    """Initialize MrGeo, reusing *gateway* when given, else launching one."""
    MrGeo.ensure_gateway_initialized(self, gateway=gateway)
    try:
        self.initialize()
    except:
        # If an error occurs, clean up in order to allow future SparkContext creation:
        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit,
        # but it re-raises, so cleanup still runs before propagation.
        self.stop()
        raise
@classmethod
def ensure_gateway_initialized(cls, instance=None, gateway=None):
    """
    Lazily create the shared py4j gateway if none exists yet.

    NOTE(review): the original docstring claimed this raises when a
    SparkContext is already running; the visible code only initializes the
    gateway under the class lock -- confirm intended behavior.
    """
    with MrGeo.lock:
        if not MrGeo.gateway:
            # Reuse the caller-supplied gateway or launch a fresh JVM.
            MrGeo.gateway = gateway or launch_gateway()
            MrGeo.jvm = MrGeo.gateway.jvm
def _create_job(self):
    """Import the MrGeo job classes and build the default JobArguments."""
    jvm = self.gateway.jvm
    java_import(jvm, "org.mrgeo.data.DataProviderFactory")
    java_import(jvm, "org.mrgeo.job.*")
    java_import(jvm, "org.mrgeo.utils.DependencyLoader")
    java_import(jvm, "org.mrgeo.utils.StringUtils")
    appname = "PyMrGeo"
    self.job = jvm.JobArguments()
    set_field(self.job, "name", appname)
    # Yarn is the default execution mode.
    self.useyarn()
def initialize(self):
    """Create the job arguments and load the available map-op classes."""
    self._create_job()
    self._load_mapops()
def _load_mapops(self):
jvm = self.gateway.jvm
client = self.gateway._gateway_client
java_import(jvm, "org.mrgeo.job.*")
java_import(jvm, "org.mrgeo.mapalgebra.MapOpFactory")
java_import(jvm, "org.mrgeo.mapalgebra.raster.RasterMapOp")
java_import(jvm, "org.mrgeo.mapalgebra.raster.MrsPyramidMapOp")
java_import(jvm, "org.mrgeo.mapalgebra.ExportMapOp")
java_import(jvm, "org.mrgeo.mapalgebra.vector.VectorMapOp")
java_import(jvm, "org.mrgeo.mapalgebra.MapOp")
java_import(jvm, "org.mrgeo.utils.SparkUtils")
java_import(jvm, "org.mrgeo.data.*")
mapops = jvm.MapOpFactory.getMapOpClasses()
for rawmapop in mapops:
mapop = str(rawmapop.getCanonicalName().rstrip('$'))
java_import(jvm, mapop)
cls = JavaClass(mapop, gateway_client=client)
if self.is_instance_of(cls, jvm.RasterMapOp):
instance = 'RasterMapOp'
elif self.is_instance_of(cls, jvm.VectorMapOp):
instance = 'VectorMapOp'
elif self.is_instance_of(cls, jvm.MapOp):
#.........这里部分代码省略.........