本文整理汇总了Python中pyspark.streaming.context.StreamingContext类的典型用法代码示例。如果您正苦于以下问题:Python StreamingContext类的具体用法?Python StreamingContext怎么用?Python StreamingContext使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StreamingContext类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: _writeAndVerify
def _writeAndVerify(self, ports):
# Set up the streaming context and input streams
ssc = StreamingContext(self.sc, self.duration)
try:
addresses = [("localhost", port) for port in ports]
dstream = FlumeUtils.createPollingStream(
ssc,
addresses,
maxBatchSize=self._utils.eventsPerBatch(),
parallelism=5)
outputBuffer = []
def get_output(_, rdd):
for e in rdd.collect():
outputBuffer.append(e)
dstream.foreachRDD(get_output)
ssc.start()
self._utils.sendDatAndEnsureAllDataHasBeenReceived()
self.wait_for(outputBuffer, self._utils.getTotalEvents())
outputHeaders = [event[0] for event in outputBuffer]
outputBodies = [event[1] for event in outputBuffer]
self._utils.assertOutput(outputHeaders, outputBodies)
finally:
ssc.stop(False)
示例2: setup
def setup():
conf = SparkConf().set("spark.default.parallelism", 1)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 0.5)
dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
wc = dstream.updateStateByKey(updater)
wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
wc.checkpoint(0.5)
return ssc
示例3: test_get_or_create
def test_get_or_create(self):
inputd = tempfile.mkdtemp()
outputd = tempfile.mkdtemp() + "/"
def updater(vs, s):
return sum(vs, s or 0)
def setup():
conf = SparkConf().set("spark.default.parallelism", 1)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 0.5)
dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
wc = dstream.updateStateByKey(updater)
wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
wc.checkpoint(0.5)
return ssc
cpd = tempfile.mkdtemp("test_streaming_cps")
ssc = StreamingContext.getOrCreate(cpd, setup)
ssc.start()
def check_output(n):
while not os.listdir(outputd):
time.sleep(0.01)
time.sleep(1) # make sure mtime is larger than the previous one
with open(os.path.join(inputd, str(n)), "w") as f:
f.writelines(["%d\n" % i for i in range(10)])
while True:
p = os.path.join(outputd, max(os.listdir(outputd)))
if "_SUCCESS" not in os.listdir(p):
# not finished
time.sleep(0.01)
continue
ordd = ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
d = ordd.values().map(int).collect()
if not d:
time.sleep(0.01)
continue
self.assertEqual(10, len(d))
s = set(d)
self.assertEqual(1, len(s))
m = s.pop()
if n > m:
continue
self.assertEqual(n, m)
break
check_output(1)
check_output(2)
ssc.stop(True, True)
time.sleep(1)
ssc = StreamingContext.getOrCreate(cpd, setup)
ssc.start()
check_output(3)
ssc.stop(True, True)
示例4: setup
def setup():
conf = SparkConf().set("spark.default.parallelism", 1)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 0.5)
# A function that cannot be serialized
def process(time, rdd):
sc.parallelize(range(1, 10))
ssc.textFileStream(inputd).foreachRDD(process)
return ssc
示例5: test_transform_function_serializer_failure
def test_transform_function_serializer_failure(self):
inputd = tempfile.mkdtemp()
self.cpd = tempfile.mkdtemp("test_transform_function_serializer_failure")
def setup():
conf = SparkConf().set("spark.default.parallelism", 1)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 0.5)
# A function that cannot be serialized
def process(time, rdd):
sc.parallelize(range(1, 10))
ssc.textFileStream(inputd).foreachRDD(process)
return ssc
self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
try:
self.ssc.start()
except:
import traceback
failure = traceback.format_exc()
self.assertTrue(
"It appears that you are attempting to reference SparkContext" in failure)
return
self.fail("using SparkContext in process should fail because it's not Serializable")
示例6: test_slice
def test_slice(self):
"""Basic operation test for DStream.slice."""
import datetime as dt
self.ssc = StreamingContext(self.sc, 1.0)
self.ssc.remember(4.0)
input = [[1], [2], [3], [4]]
stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])
time_vals = []
def get_times(t, rdd):
if rdd and len(time_vals) < len(input):
time_vals.append(t)
stream.foreachRDD(get_times)
self.ssc.start()
self.wait_for(time_vals, 4)
begin_time = time_vals[0]
def get_sliced(begin_delta, end_delta):
begin = begin_time + dt.timedelta(seconds=begin_delta)
end = begin_time + dt.timedelta(seconds=end_delta)
rdds = stream.slice(begin, end)
result_list = [rdd.collect() for rdd in rdds]
return [r for result in result_list for r in result]
self.assertEqual(set([1]), set(get_sliced(0, 0)))
self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))
示例7: test_stop_multiple_times
def test_stop_multiple_times(self):
self.ssc = StreamingContext(master=self.master, appName=self.appName,
duration=self.batachDuration)
self._addInputStream(self.ssc)
self.ssc.start()
self.ssc.stop()
self.ssc.stop()
示例8: test_stop_only_streaming_context
def test_stop_only_streaming_context(self):
self.sc = SparkContext(master=self.master, appName=self.appName)
self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration)
self._addInputStream(self.ssc)
self.ssc.start()
self.ssc.stop(False)
self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)
示例9: test_from_conf_with_settings
def test_from_conf_with_settings(self):
conf = SparkConf()
conf.set("spark.cleaner.ttl", "10")
conf.setMaster(self.master)
conf.setAppName(self.appName)
self.ssc = StreamingContext(conf=conf, duration=self.batachDuration)
self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10)
示例10: setUp
def setUp(self):
class_name = self.__class__.__name__
conf = SparkConf().set("spark.default.parallelism", 1)
self.sc = SparkContext(appName=class_name, conf=conf)
self.sc.setCheckpointDir("/tmp")
# TODO: decrease duration to speed up tests
self.ssc = StreamingContext(self.sc, self.duration)
示例11: test_from_no_conf_constructor
def test_from_no_conf_constructor(self):
self.ssc = StreamingContext(master=self.master, appName=self.appName,
duration=self.batachDuration)
# Alternative call master: ssc.sparkContext.master
# I try to make code close to Scala.
self.assertEqual(self.ssc.sparkContext._conf.get("spark.master"), self.master)
self.assertEqual(self.ssc.sparkContext._conf.get("spark.app.name"), self.appName)
示例12: createSSC
def createSSC():
# ssc 생성
conf = SparkConf()
sc = SparkContext(master="local[*]", appName="CheckpointSample", conf=conf)
ssc = StreamingContext(sc, 3)
# DStream 생성
ids1 = ssc.socketTextStream("127.0.0.1", 9000)
ids2 = ids1.flatMap(lambda v: v.split(" ")).map(lambda v: (v, 1))
# updateStateByKey
ids2.updateStateByKey(updateFunc).pprint()
# checkpoint
ssc.checkpoint("./checkPoints/checkPointSample/Python")
# return
return ssc
示例13: test_text_file_stream
def test_text_file_stream(self):
d = tempfile.mkdtemp()
self.ssc = StreamingContext(self.sc, self.duration)
dstream2 = self.ssc.textFileStream(d).map(int)
result = self._collect(dstream2, 2, block=False)
self.ssc.start()
for name in ('a', 'b'):
time.sleep(1)
with open(os.path.join(d, name), "w") as f:
f.writelines(["%d\n" % i for i in range(10)])
self.wait_for(result, 2)
self.assertEqual([list(range(10)), list(range(10))], result)
示例14: test_binary_records_stream
def test_binary_records_stream(self):
d = tempfile.mkdtemp()
self.ssc = StreamingContext(self.sc, self.duration)
dstream = self.ssc.binaryRecordsStream(d, 10).map(lambda v: struct.unpack("10b", bytes(v)))
result = self._collect(dstream, 2, block=False)
self.ssc.start()
for name in ("a", "b"):
time.sleep(1)
with open(os.path.join(d, name), "wb") as f:
f.write(bytearray(range(10)))
self.wait_for(result, 2)
self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])
示例15: test_get_active_or_create
def test_get_active_or_create(self):
# Test StreamingContext.getActiveOrCreate() without checkpoint data
# See CheckpointTests for tests with checkpoint data
self.ssc = None
self.assertEqual(StreamingContext.getActive(), None)
def setupFunc():
ssc = StreamingContext(self.sc, self.duration)
ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
self.setupCalled = True
return ssc
# Verify that getActiveOrCreate() (w/o checkpoint) calls setupFunc when no context is active
self.setupCalled = False
self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
self.assertTrue(self.setupCalled)
# Verify that getActiveOrCreate() retuns active context and does not call the setupFunc
self.ssc.start()
self.setupCalled = False
self.assertEqual(StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
self.assertFalse(self.setupCalled)
# Verify that getActiveOrCreate() calls setupFunc after active context is stopped
self.ssc.stop(False)
self.setupCalled = False
self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
self.assertTrue(self.setupCalled)
# Verify that if the Java context is stopped, then getActive() returns None
self.ssc = StreamingContext(self.sc, self.duration)
self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
self.ssc.start()
self.assertEqual(StreamingContext.getActive(), self.ssc)
self.ssc._jssc.stop(False)
self.setupCalled = False
self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
self.assertTrue(self.setupCalled)