This article compiles typical usage examples of the Python method pyspark.streaming.context.StreamingContext.start. If you have been wondering what StreamingContext.start does, how to call it, or what it looks like in real code, the curated examples below should help; you may also want to read about its enclosing class, pyspark.streaming.context.StreamingContext.
Eight code examples of StreamingContext.start are shown below, ordered by popularity by default.
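Before the project examples, here is a minimal self-contained sketch of the start lifecycle. The host, port, batch interval, and timeout are illustrative placeholders rather than values taken from any of the examples:

from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext(appName="StartExample")
ssc = StreamingContext(sc, 1)                       # 1-second batch interval
lines = ssc.socketTextStream("localhost", 9999)     # assumes a text source on this port
lines.pprint()                                      # at least one output operation is required
ssc.start()                                         # begin receiving and processing data
ssc.awaitTerminationOrTimeout(60)                   # block for up to 60 seconds
ssc.stop(stopSparkContext=True, stopGraceFully=True)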

Example 1: _writeAndVerify
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
def _writeAndVerify(self, ports):
    # Set up the streaming context and input streams
    ssc = StreamingContext(self.sc, self.duration)
    try:
        addresses = [("localhost", port) for port in ports]
        dstream = FlumeUtils.createPollingStream(
            ssc,
            addresses,
            maxBatchSize=self._utils.eventsPerBatch(),
            parallelism=5)
        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        dstream.foreachRDD(get_output)
        ssc.start()
        self._utils.sendDatAndEnsureAllDataHasBeenReceived()
        self.wait_for(outputBuffer, self._utils.getTotalEvents())
        outputHeaders = [event[0] for event in outputBuffer]
        outputBodies = [event[1] for event in outputBuffer]
        self._utils.assertOutput(outputHeaders, outputBodies)
    finally:
        ssc.stop(False)
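The test above relies on the Flume test utilities bundled with Spark's own build. Outside that harness, a polling stream against a Flume Spark sink can be set up roughly as follows; the package coordinates, sink address, and version placeholder are assumptions, and the Flume connector only exists for Spark 1.x/2.x:

# Submit with the Flume assembly on the classpath (version must match your Spark build), e.g.:
# spark-submit --packages org.apache.spark:spark-streaming-flume-assembly_2.11:<spark-version> app.py
from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.flume import FlumeUtils

sc = SparkContext(appName="FlumePollingExample")
ssc = StreamingContext(sc, 2)
events = FlumeUtils.createPollingStream(ssc, [("localhost", 9988)])  # address of the Spark sink (assumed)
events.map(lambda event: event[1]).pprint()   # each event is a (headers, body) pair
ssc.start()
ssc.awaitTermination()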

Example 2: StreamingContextTests
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)
        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
        # ......... part of this example's code is omitted here .........
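The truncated test exercises StreamingContext.getActiveOrCreate(). Outside the test suite, the method is typically given a checkpoint directory and a setup function; a hedged sketch of the no-checkpoint-data path, with the checkpoint path and batch interval as placeholders:

from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext(appName="GetActiveOrCreateExample")

def setup():
    # Called only when there is no active context and no usable checkpoint data.
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/streaming-checkpoint")   # placeholder path
    ssc.queueStream([sc.parallelize([1, 2, 3])]).pprint()
    return ssc

ssc = StreamingContext.getActiveOrCreate("/tmp/streaming-checkpoint", setup)
ssc.start()
ssc.awaitTerminationOrTimeout(10)
ssc.stop(stopSparkContext=False)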

Example 3: PySparkStreamingTestCase
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 10  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        class_name = cls.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        cls.sc = SparkContext(appName=class_name, conf=conf)
        cls.sc.setCheckpointDir("/tmp")

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()
        # Clean up in the JVM just in case there have been some issues in the Python API
        jSparkContextOption = SparkContext._jvm.SparkContext.get()
        if jSparkContextOption.nonEmpty():
            jSparkContextOption.get().stop()

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM just in case there have been some issues in the Python API
        jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
        if jStreamingContextOption.nonEmpty():
            jStreamingContextOption.get().stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)
        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDD into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be a list of lists.
        @param func: wrapped function. This function should return a PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)
        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
        # ......... part of this example's code is omitted here .........

Example 4: SparkConf
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
# Section 6.2.4, Example 6-12
from pyspark import SparkContext, SparkConf, storagelevel
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## When running inside the pyspark shell, do not create the SparkContext yourself!
# ./pyspark --packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.2
conf = SparkConf()
sc = SparkContext(master="local[*]", appName="KafkaSample", conf=conf)
ssc = StreamingContext(sc, 3)

ds1 = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group1", {"test": 3})
ds2 = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

ds1.pprint()
ds2.pprint()

ssc.start()
ssc.awaitTermination()
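Records from both Kafka streams arrive as (key, message) pairs. A hedged sketch of a word count on ds2, assuming the same "test" topic, would be placed before ssc.start():

# Hypothetical continuation of the example above (place before ssc.start()).
counts = (ds2.map(lambda kv: kv[1])                  # keep only the message value
             .flatMap(lambda line: line.split(" "))
             .map(lambda word: (word, 1))
             .reduceByKey(lambda a, b: a + b))
counts.pprint()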

Example 5: StreamingContextTests
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ("a", "b"):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ("a", "b"):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)
        self.assertEqual([2, 3, 1], self._take(dstream, 3))
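Most of these tests are driven by queueStream. A small standalone sketch of its oneAtATime behaviour follows; the app name, batch interval, and data are illustrative:

from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext(appName="QueueStreamExample")
ssc = StreamingContext(sc, 1)
rdd_queue = [sc.parallelize(range(i * 10, (i + 1) * 10)) for i in range(3)]
# oneAtATime=True (the default) feeds one queued RDD per batch interval;
# oneAtATime=False unions whatever is left in the queue into a single batch.
stream = ssc.queueStream(rdd_queue, oneAtATime=True)
stream.count().pprint()
ssc.start()
ssc.awaitTerminationOrTimeout(5)
ssc.stop(stopSparkContext=False)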

Example 6: StreamingTestCase
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
class StreamingTestCase(SparkTestingBaseReuse):
    """Basic common test case for Spark Streaming tests. Provides a
    Spark Streaming context as well as some helper methods for creating
    streaming input and collecting streaming output.

    Modeled after PySparkStreamingTestCase."""

    timeout = 15  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        super(StreamingTestCase, cls).setUpClass()
        cls.sc.setCheckpointDir("/tmp")

    @classmethod
    def tearDownClass(cls):
        super(StreamingTestCase, cls).tearDownClass()

    @classmethod
    def _sort_result_based_on_key(cls, result):
        return map(lambda x: sorted(x), result)

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)
        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDD into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def run_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be a list of lists
            or a list of RDDs.
        @param input2: Optional second dataset for the test. If provided, your
            func must take two PythonDStreams as input.
        @param func: wrapped function. This function should return a
            PythonDStream.
        @param expected: expected output for this testcase.

        Warning: If output is longer than expected this will silently
        discard the additional output. TODO: fail when this happens.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]

        # Apply test function to stream.
        if input2:
            input_stream2 = self.ssc.queueStream(input2)
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)
        # ......... part of this example's code is omitted here .........
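A concrete test built on this harness might look as follows; the class name, input data, and expected output are illustrative and not part of the excerpt above:

class WordTokenizeTest(StreamingTestCase):
    """Hypothetical test using the run_func helper shown above."""

    def test_tokenize(self):
        input = [["hello world"], ["goodbye world"]]
        expected = [["hello", "world"], ["goodbye", "world"]]
        self.run_func(input, lambda ds: ds.flatMap(lambda line: line.split(" ")), expected)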

Example 7: BasicOperationTests
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)
        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))
        expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
                    for x in input]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)
        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()
        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])
        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)
        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)
        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)
        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
        # ......... part of this example's code is omitted here .........
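A further test in the same pattern, not part of the excerpt above, could cover DStream.glom; since _test_func parallelizes each input with a single partition, each batch collapses into one partition list:

    def test_glom(self):
        """Hypothetical companion test for DStream.glom in the same style."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.glom()
        expected = [[list(x)] for x in input]
        self._test_func(input, func, expected)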

Example 8: TestStreamingContextSuite
# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import start [as alias]
class TestStreamingContextSuite(unittest.TestCase):
    """
    Should we have a conf property in SparkContext?
    @property
    def conf(self):
        return self._conf
    """

    def setUp(self):
        self.master = "local[2]"
        self.appName = self.__class__.__name__
        self.batchDuration = Milliseconds(500)
        self.sparkHome = "SomeDir"
        self.envPair = {"key": "value"}
        self.ssc = None
        self.sc = None

    def tearDown(self):
        # Do not call pyspark.streaming.context.StreamingContext.stop directly, because
        # we do not wait for the py4j client to shut down.
        # TODO: change this to simply call streamingContext.stop().
        # self.ssc._jssc.stop()
        if self.ssc is not None:
            self.ssc.stop()
        if self.sc is not None:
            self.sc.stop()
        # Why does it take a long time to terminate the StreamingContext and SparkContext?
        # Should we change the sleep time if this depends on machine spec?
        time.sleep(1)

    @classmethod
    def tearDownClass(cls):
        # Make sure to shut down the callback server
        SparkContext._gateway._shutdown_callback_server()

    def test_from_no_conf_constructor(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                    duration=self.batchDuration)
        # Alternative way to read the master: ssc.sparkContext.master
        # The code is kept close to the Scala version.
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.master"), self.master)
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.app.name"), self.appName)

    def test_from_no_conf_plus_spark_home(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                    sparkHome=self.sparkHome, duration=self.batchDuration)
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.home"), self.sparkHome)

    def test_from_no_conf_plus_spark_home_plus_env(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                    sparkHome=self.sparkHome, environment=self.envPair,
                                    duration=self.batchDuration)
        self.assertEqual(self.ssc.sparkContext._conf.get("spark.executorEnv.key"),
                         self.envPair["key"])

    def test_from_existing_spark_context(self):
        self.sc = SparkContext(master=self.master, appName=self.appName)
        self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batchDuration)

    def test_existing_spark_context_with_settings(self):
        conf = SparkConf()
        conf.set("spark.cleaner.ttl", "10")
        self.sc = SparkContext(master=self.master, appName=self.appName, conf=conf)
        self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batchDuration)
        self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)

    def test_from_conf_with_settings(self):
        conf = SparkConf()
        conf.set("spark.cleaner.ttl", "10")
        conf.setMaster(self.master)
        conf.setAppName(self.appName)
        self.ssc = StreamingContext(conf=conf, duration=self.batchDuration)
        self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)

    def test_stop_only_streaming_context(self):
        self.sc = SparkContext(master=self.master, appName=self.appName)
        self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batchDuration)
        self._addInputStream(self.ssc)
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self.ssc = StreamingContext(master=self.master, appName=self.appName,
                                    duration=self.batchDuration)
        self._addInputStream(self.ssc)
        self.ssc.start()
        self.ssc.stop()
        self.ssc.stop()

    def _addInputStream(self, s):
        # Make sure each input has length over 3 and numSlices is 2,
        # due to a deserializer problem in pyspark.streaming
        test_inputs = map(lambda x: range(1, x), range(5, 101))
        test_stream = s._testInputStream(test_inputs, 2)
        # Register a fake output operation
        result = list()
        test_stream._test_output(result)
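This suite targets a very old PySpark streaming API: Milliseconds durations, _testInputStream, and _test_output are internal helpers that no longer exist in current releases. A hedged sketch of an equivalent helper against the public API (behaviour is approximate, not a drop-in replacement):

    def _addInputStream(self, ssc):
        # Modern equivalent built on the public queueStream API instead of _testInputStream.
        test_inputs = [list(range(1, x)) for x in range(5, 101)]
        test_stream = ssc.queueStream(test_inputs)
        # Register a no-op output operation so the streaming graph has something to run.
        test_stream.foreachRDD(lambda rdd: rdd.count())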