This article collects typical usage examples of the Python method pyspark.streaming.StreamingContext.stop. If you have been wondering what exactly StreamingContext.stop does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also explore further usage examples of the class it belongs to, pyspark.streaming.StreamingContext.
The following shows 12 code examples of StreamingContext.stop, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
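Before diving into the examples, here is a minimal, self-contained sketch of the stop call itself. The application name and queue data are placeholders; the keyword arguments stopSparkContext and stopGraceFully follow the pyspark StreamingContext.stop signature:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "StopExample")
ssc = StreamingContext(sc, 1)  # 1-second batch interval

# Give the context something trivial to process
ssc.queueStream([sc.parallelize(range(10))]).pprint()

ssc.start()
ssc.awaitTerminationOrTimeout(5)                      # block for at most 5 seconds
ssc.stop(stopSparkContext=True, stopGraceFully=True)  # drain received data, then stop everything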
Example 1: start_spark
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)

    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
Example 2: BaseStreamingTestCase
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class BaseStreamingTestCase(unittest.TestCase):
    """ From https://github.com/apache/spark/blob/
    master/python/pyspark/streaming/tests.py """

    timeout = 10  # seconds
    duration = .5

    def setUp(self):
        self.ssc = StreamingContext(sc, self.duration)

    def tearDown(self):
        self.ssc.stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _collect(self, dstream, n):
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)
        self.ssc.start()
        self.wait_for(result, n)
        return result
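In practice a subclass of this harness builds a DStream from queued RDDs and asserts on the batches collected by _collect. A minimal sketch, where the test case and data are illustrative and sc is the module-level SparkContext the harness already assumes:

class MapTestCase(BaseStreamingTestCase):
    def test_map(self):
        # one queued RDD -> one collected output batch
        input_rdds = [sc.parallelize([1, 2, 3], 1)]
        dstream = self.ssc.queueStream(input_rdds).map(lambda x: x * 10)
        result = self._collect(dstream, 1)
        self.assertEqual([[10, 20, 30]], result)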
Example 3: MLLibStreamingTestCase
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class MLLibStreamingTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = sc
        self.ssc = StreamingContext(self.sc, 1.0)

    def tearDown(self):
        self.ssc.stop(False)

    @staticmethod
    def _ssc_wait(start_time, end_time, sleep_time):
        while time() - start_time < end_time:
            sleep(0.01)
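Note that, despite its name, end_time here acts as a duration measured from start_time, and sleep_time is unused in this excerpt. A purely illustrative call from a test method might look like:

self.ssc.start()
t = time()
self._ssc_wait(t, 10.0, 0.01)  # block for roughly 10 seconds while batches run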
Example 4: MLLibStreamingTestCase
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class MLLibStreamingTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext('local[4]', "MLlib tests")
        self.ssc = StreamingContext(self.sc, 1.0)

    def tearDown(self):
        self.ssc.stop(False)
        self.sc.stop()

    @staticmethod
    def _eventually(condition, timeout=30.0, catch_assertions=False):
        """
        Wait a given amount of time for a condition to pass, else fail with an error.
        This is a helper utility for streaming ML tests.

        :param condition: Function that checks for termination conditions.
                          condition() can return:
                           - True: Conditions met. Return without error.
                           - other value: Conditions not met yet. Continue. Upon timeout,
                             include last such value in error message.
                          Note that this method may be called at any time during
                          streaming execution (e.g., even before any results
                          have been created).
        :param timeout: Number of seconds to wait. Default 30 seconds.
        :param catch_assertions: If False (default), do not catch AssertionErrors.
                                 If True, catch AssertionErrors; continue, but save
                                 error to throw upon timeout.
        """
        start_time = time()
        lastValue = None
        while time() - start_time < timeout:
            if catch_assertions:
                try:
                    lastValue = condition()
                except AssertionError as e:
                    lastValue = e
            else:
                lastValue = condition()
            if lastValue is True:
                return
            sleep(0.01)
        if isinstance(lastValue, AssertionError):
            raise lastValue
        else:
            raise AssertionError(
                "Test failed due to timeout after %g sec, with last condition returning: %s"
                % (timeout, lastValue))
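A sketch of how such a helper is typically driven from a test method; the streaming model, its trainOn call, and the target weights are hypothetical stand-ins for whatever the concrete test trains:

def test_convergence(self):
    expected = [1.0, 2.0]                      # hypothetical target weights
    self.model.trainOn(self.training_dstream)  # hypothetical streaming model and input DStream
    self.ssc.start()

    def condition():
        weights = self.model.latestModel().weights
        self.assertAlmostEqual(weights[0], expected[0], 1)
        self.assertAlmostEqual(weights[1], expected[1], 1)
        return True

    # keep retrying the assertions until they pass or the timeout expires
    self._eventually(condition, timeout=60.0, catch_assertions=True)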
Example 5: __init__
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class Consumer:
    'Simple spark kafka streaming consumer'

    def __init__(self, casshost, interval, zookeeper, topic):
        self.conf = SparkConf().setAppName("KafkaSpark").set("spark.cassandra.connection.host", casshost)
        self.sc = SparkContext(conf=self.conf)
        self.sqlContext = SQLContext(sparkContext=self.sc)
        self.ssc = StreamingContext(self.sc, batchDuration=interval)
        self.zookeeper = zookeeper
        self.topic = topic

    def check_and_write(self, x):
        try:
            x.toDF().write.format("org.apache.spark.sql.cassandra").options(table="test1", keyspace="mykeyspace").save(mode="append")
        except ValueError:
            print "No rdd found!"

    def consume(self):
        messages = KafkaUtils.createStream(self.ssc, self.zookeeper, "spark-streaming-consumer", {self.topic: 1})
        lines = messages.map(lambda x: x[1])
        rows = lines.map(lambda x: {
            "data": json.loads(x)['data'],
            "time": json.loads(x)['time']
        })
        rows.foreachRDD(lambda x: {
            self.check_and_write(x)
        })
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        if self.sqlContext != None:
            self.sqlContext.stop()
        if self.ssc != None:
            self.ssc.stop()
        if self.sc != None:
            self.sc.stop()
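A possible driver for this consumer could look like the following; the Cassandra host, batch interval, ZooKeeper address, and topic name are placeholders:

if __name__ == "__main__":
    consumer = Consumer(casshost="127.0.0.1",       # Cassandra contact point (placeholder)
                        interval=10,                # batch interval in seconds
                        zookeeper="localhost:2181",
                        topic="readings")
    try:
        consumer.consume()
    except KeyboardInterrupt:
        consumer.stop()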
Example 6: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
 To run this example use
    `$ bin/spark-submit examples/src/main/python/streaming/queue_stream.py
"""
import sys
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":

    sc = SparkContext(appName="PythonStreamingQueueStream")
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for i in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
Example 7: long
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
        return [{"time": datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S"), "orderId": long(s[1]), "clientId": long(s[2]), "symbol": s[3],
                 "amount": int(s[4]), "price": float(s[5]), "buy": s[6] == "B"}]
    except Exception as err:
        print("Wrong line format (%s): " % line)
        return []

orders = filestream.flatMap(parseOrder)

from operator import add
numPerType = orders.map(lambda o: (o['buy'], 1L)).reduceByKey(add)

numPerType.repartition(1).saveAsTextFiles("/home/spark/ch06output/output", "txt")

ssc.start()
ssc.stop(False)

allCounts = sc.textFile("/home/spark/ch06output/output*.txt")

# section 6.1.8

from pyspark.streaming import StreamingContext
ssc = StreamingContext(sc, 5)

filestream = ssc.textFileStream("/home/spark/ch06input")

from datetime import datetime

def parseOrder(line):
    s = line.split(",")
    try:
        if s[6] != "B" and s[6] != "S":
            raise Exception('Wrong format')
        return [{"time": datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S"), "orderId": long(s[1]), "clientId": long(s[2]), "symbol": s[3],
Example 8:
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

try:
    # Create a DStream to connect to the wordsocket.py random word server
    lines = ssc.socketTextStream("localhost", 6568)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()
except:
    print "Unexpected error:", sys.exc_info()[0]
    pass

# Start the computation
ssc.start()

# Wait for the computation, terminating after 30s
#ssc.awaitTerminationOrTimeout(30)
import time
time.sleep(30)
ssc.stop()
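As the commented-out line hints, the sleep/stop pair can also be written with awaitTerminationOrTimeout, which blocks for at most the given number of seconds and returns whether the streaming context has already stopped; a minimal variant of the tail of this script:

ssc.start()
stopped = ssc.awaitTerminationOrTimeout(30)  # True if the context terminated within 30s
if not stopped:
    ssc.stop()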
Example 9: ThunderStreamingContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class ThunderStreamingContext(object):
    """
    The streaming context is responsible for creating DStreams and RDDs based on data stored in a time-ordered
    database.
    """

    DATA_KEY = 'data'
    REGRESSOR_KEY = 'regressor'

    def __init__(self, tsc, sc, batch_time=5):
        self._tsc = tsc
        self._sc = sc
        self.ssc = StreamingContext(sc, batch_time)
        self.rows_per_partition = 5
        self.dstream_loaders = []
        self._poll_time = 3
        self.batch_time = batch_time
        self._feeder = None
        self._hbase_manager = None

    def loadConfig(self, filename=None):
        """
        :param filename: The name of the ETL configuration file to load (by default, it will use the file passed in
            as the first argument to thunder_streaming, which is stored in the ETL_CONFIG environment variable
        :return:
        """
        if not filename:
            filename = os.environ.get('ETL_CONFIG')
        if not filename:
            warningLog("Could not load a configuration file (did you pass one in as an argument to thunder_streaming?).")
            return
        manager = HBaseManager()
        feeder = Feeder(filename, manager)
        self._hbase_manager = manager
        self._feeder = feeder

    def set_partition_size(self, rows_per_partition):
        """
        Sets the number of rows to include in each partition
        :param batch_size:
        :return:
        """
        self.rows_per_partition = rows_per_partition

    def start_feeder(self):
        if not self._feeder:
            warningLog("Cannot start until a streaming configuration file has been loaded.")
        self._feeder.start()

    def start_streaming(self):
        self.ssc.start()

    def stop_streaming(self):
        for loader in self.dstream_loaders:
            loader.stop()
        self.ssc.stop(stopSparkContext=False)

    def _loadBytes(self, datasetId=DATA_KEY, minTime=0, maxTime=10):

        def _lb(first, last):
            # TODO optimize this
            manager = HBaseManager()
            manager.initialize()
            keyed_byte_arrs = manager.get_rows(datasetId, minTime, maxTime)
            if len(keyed_byte_arrs) == 0:
                return []
            return keyed_byte_arrs

        chunk_iter = grouper(range(minTime, maxTime), self.rows_per_partition)
        chunk_rdd = self._sc.parallelize([(group[0], group[-1] + 1) for group in chunk_iter])
        return chunk_rdd.flatMap(lambda (first, last): _lb(first, last))

    def loadImages(self, datasetId=DATA_KEY, dtype='uint16', dims=None, minTime=0, maxTime=10):
        bytes = self._loadBytes(datasetId, minTime, maxTime)
        rdd = bytes.map(lambda (k, v): (k, np.fromstring(v, dtype=dtype).reshape(dims)))
        return Images(rdd, dims=dims, dtype=dtype)

    def loadSeries(self, datasetId=DATA_KEY, dtype='uint16', minTime=0, maxTime=10):
        bytes = self._loadBytes(datasetId, minTime, maxTime)
        keyed_rdd = bytes.flatMap(lambda (k, v): map(lambda (idx, vi): (idx, (k, vi)),
                                                     list(enumerate(np.fromstring(v, dtype=dtype)))))
        rdd = keyed_rdd.groupByKey().map(lambda (k, v): (k, map(lambda (ki, vi): vi, sorted(v))))
        return Series(rdd, dtype=dtype)

    def _loadBytesDStream(self, datasetId=DATA_KEY):
        """
        """
        jvm = self._sc._jvm
        java_import(jvm, "thunder_streaming.receivers.*")
        feeder_conf = self._feeder.conf
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())
        try:
            # TODO: are there closure problems with this approach? (why do Jascha/KafkaUtils do it differently?)
            dstream = DStream(
                self.ssc._jssc.receiverStream(jvm.HBaseReceiver(
                    ListConverter().convert(feeder_conf.get_sequence_names(), jvm._gateway_client),
#......... the rest of the code is omitted here .........
Example 10: PySparkStreamingTestCase
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 30  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        class_name = cls.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        cls.sc = SparkContext(appName=class_name, conf=conf)
        cls.sc.setCheckpointDir(tempfile.mkdtemp())

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jSparkContextOption = SparkContext._jvm.SparkContext.get()
            if jSparkContextOption.nonEmpty():
                jSparkContextOption.get().stop()
        except:
            pass

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
            if jStreamingContextOption.nonEmpty():
                jStreamingContextOption.get().stop(False)
        except:
            pass

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)
        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
#......... the rest of the code is omitted here .........
Example 11: mode
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
            mode('append').options(table="raw_metrics", keyspace="metrics").save()
        readingsDataFrame.unpersist()
    except:
        pass

def main():
    # main function to execute code
    sqlContext = SQLContext(sc)
    zk_host = zk_ip + ":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions = {topic: 1}

    # create kafka stream
    kvs = KafkaUtils.createStream(ssc, zk_host, consumer_group, kafka_partitions, valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    readings = lines.map(lambda x: Row(device_id=x["device_id"],
                                       metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),
                                       metric_name=x["metric_name"],
                                       metric_value=float(x["metric_value"])))
    readings.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    sc = SparkContext(appName="ReadingWriter")
    ssc = StreamingContext(sc, 10)
    try:
        main()
    except KeyboardInterrupt:
        print("Gracefully stopping Spark Streaming Application")
        ssc.stop(True, True)
Example 12: StreamingContextTests
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import stop [as alias]
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)
        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        dstream = self.ssc.queueStream(
            [[1], [2], [3]]).transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
#......... the rest of the code is omitted here .........