当前位置: 首页>>代码示例>>Python>>正文


Python StreamingContext.stop方法代码示例

本文整理汇总了Python中pyspark.streaming.StreamingContext.stop方法的典型用法代码示例。如果您正苦于以下问题:Python StreamingContext.stop方法的具体用法?Python StreamingContext.stop怎么用?Python StreamingContext.stop使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyspark.streaming.StreamingContext的用法示例。


在下文中一共展示了StreamingContext.stop方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: start_spark

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    """Build and run the hashtag-count streaming job.

    A local four-thread SparkContext is wrapped in a StreamingContext with a
    five second batch interval. The configured Kafka topic is read through a
    direct stream, passed through ``get_word_counts`` and ``filter_tweets``,
    and the result is handed to ``send_dstream_data``.

    :param timeout: When truthy, passed to ``awaitTermination``; afterwards
                    the streaming context and the SparkContext are stopped
                    gracefully. When falsy (``None`` or ``0``) the call
                    waits with no timeout.
    :param max_items_per_rdd_sent: Forwarded unchanged to
                                   ``send_dstream_data``.
    """
    spark_context = SparkContext("local[4]", "twitter.trending")
    streaming_context = StreamingContext(spark_context, 5)
    streaming_context.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    # Connection settings are read from the module-level ``config`` object.
    kafka_params = {}
    kafka_params['zookeeper.connect'] = config.get('zookeeper', 'host')
    kafka_params['group.id'] = config.get('kafka', 'group_id')
    kafka_params['metadata.broker.list'] = config.get('kafka', 'hosts')
    topics = [config.get('kafka', 'topic')]

    kafka_stream = KafkaUtils.createDirectStream(streaming_context, topics, kafka_params)

    # Pipeline: raw messages -> hashtag counts -> filtered counts -> sink.
    send_dstream_data(filter_tweets(get_word_counts(kafka_stream)),
                      max_items_per_rdd_sent)

    streaming_context.start()
    if not timeout:
        streaming_context.awaitTermination()
        return
    streaming_context.awaitTermination(timeout)
    streaming_context.stop(stopSparkContext=True, stopGraceFully=True)
开发者ID:joychugh,项目名称:learning-kafka,代码行数:27,代码来源:spark_example.py

示例2: BaseStreamingTestCase

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class BaseStreamingTestCase(unittest.TestCase):
    """Shared fixture for streaming tests.

    Adapted from https://github.com/apache/spark/blob/
    master/python/pyspark/streaming/tests.py
    """

    timeout = 10  # seconds wait_for() polls before giving up
    duration = .5  # batch duration handed to StreamingContext

    def setUp(self):
        # One StreamingContext per test, built on the module-level ``sc``.
        self.ssc = StreamingContext(sc, self.duration)

    def tearDown(self):
        # False is passed as the first positional argument of stop().
        self.ssc.stop(False)

    def wait_for(self, result, n):
        """Poll until ``result`` holds ``n`` items or ``self.timeout`` seconds pass.

        Nothing is raised on timeout; a message is printed instead.
        """
        began = time.time()
        while True:
            if len(result) >= n:
                break
            if time.time() - began >= self.timeout:
                break
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _collect(self, dstream, n):
        """Start the context and gather up to ``n`` non-empty batches of ``dstream``.

        :return: list with one entry (the collected rows) per non-empty RDD.
        """
        collected = []

        def store_batch(_, rdd):
            # Ignore missing RDDs and anything arriving after the n-th batch.
            if not rdd or len(collected) >= n:
                return
            rows = rdd.collect()
            if rows:
                collected.append(rows)

        dstream.foreachRDD(store_batch)

        self.ssc.start()
        self.wait_for(collected, n)
        return collected
开发者ID:Fighting-Toghter,项目名称:scrapybook,代码行数:36,代码来源:boostwords.py

示例3: MLLibStreamingTestCase

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class MLLibStreamingTestCase(unittest.TestCase):
    """Base class for streaming MLlib tests that reuse the module-level ``sc``."""

    def setUp(self):
        self.sc = sc
        self.ssc = StreamingContext(self.sc, 1.0)

    def tearDown(self):
        # Only the streaming context is stopped here; ``sc`` is shared.
        self.ssc.stop(False)

    @staticmethod
    def _ssc_wait(start_time, end_time, sleep_time):
        """Block until ``end_time`` seconds have elapsed since ``start_time``.

        :param start_time: Reference timestamp, as returned by ``time()``.
        :param end_time: Number of seconds after ``start_time`` to wait for.
        :param sleep_time: Seconds to sleep between checks of the clock.
        """
        while time() - start_time < end_time:
            # Honour the caller's polling interval; it used to be ignored in
            # favour of a hard-coded 0.01 seconds.
            sleep(sleep_time)
开发者ID:HodaAlemi,项目名称:spark,代码行数:14,代码来源:tests.py

示例4: MLLibStreamingTestCase

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class MLLibStreamingTestCase(unittest.TestCase):
    """Base class for streaming MLlib tests; each test gets its own contexts."""

    def setUp(self):
        self.sc = SparkContext('local[4]', "MLlib tests")
        self.ssc = StreamingContext(self.sc, 1.0)

    def tearDown(self):
        # Stop the streaming context first, then the SparkContext it wraps.
        self.ssc.stop(False)
        self.sc.stop()

    @staticmethod
    def _eventually(condition, timeout=30.0, catch_assertions=False):
        """
        Poll ``condition`` until it returns True, failing once ``timeout`` expires.

        :param condition: Zero-argument callable. Returning exactly ``True``
                          ends the wait successfully; any other return value
                          means "not yet" and is remembered for the error
                          message. It may be invoked before any streaming
                          result exists.
        :param timeout: Number of seconds to keep polling. Default 30 seconds.
        :param catch_assertions: If False (default), an AssertionError raised
                                 by ``condition`` propagates immediately. If
                                 True, it is remembered and polling continues;
                                 the last one is re-raised on timeout.
        :raises AssertionError: when the timeout elapses without success.
        """
        began = time()
        outcome = None
        while time() - began < timeout:
            try:
                outcome = condition()
            except AssertionError as err:
                if not catch_assertions:
                    raise
                outcome = err
            if outcome is True:
                return
            sleep(0.01)
        if not isinstance(outcome, AssertionError):
            raise AssertionError(
                "Test failed due to timeout after %g sec, with last condition returning: %s"
                % (timeout, outcome))
        raise outcome
开发者ID:JingchengDu,项目名称:spark,代码行数:48,代码来源:test_streaming_algorithms.py

示例5: __init__

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class Consumer:
    """Simple Spark Streaming consumer that appends Kafka messages to Cassandra."""

    def __init__(self, casshost, interval, zookeeper, topic):
        """Create the Spark, SQL and streaming contexts.

        :param casshost: Value for ``spark.cassandra.connection.host``.
        :param interval: Batch duration passed to StreamingContext.
        :param zookeeper: Zookeeper quorum handed to ``KafkaUtils.createStream``.
        :param topic: Kafka topic to consume (read with one receiver thread).
        """
        self.conf = SparkConf().setAppName("KafkaSpark").set("spark.cassandra.connection.host", casshost)
        self.sc = SparkContext(conf=self.conf)
        self.sqlContext = SQLContext(sparkContext=self.sc)
        self.ssc = StreamingContext(self.sc, batchDuration=interval)
        self.zookeeper = zookeeper
        self.topic = topic

    def check_and_write(self, x):
        """Append the rows of RDD ``x`` to table ``mykeyspace.test1``.

        A ValueError raised while converting or saving is reported on stdout
        and otherwise ignored, so a problematic batch does not end the job.
        """
        try:
            x.toDF().write.format("org.apache.spark.sql.cassandra").options(table="test1", keyspace="mykeyspace").save(mode="append")
        except ValueError:
            # Parenthesised form prints the same text on Python 2 and 3.
            print("No rdd found!")

    def consume(self):
        """Read the topic, turn each message into a row and write every batch.

        Blocks until the streaming context terminates.
        """
        messages = KafkaUtils.createStream(self.ssc, self.zookeeper, "spark-streaming-consumer", {self.topic: 1})
        lines = messages.map(lambda x: x[1])

        def to_row(line):
            # Decode the JSON payload once instead of once per field.
            record = json.loads(line)
            return {"data": record['data'], "time": record['time']}

        rows = lines.map(to_row)

        # Keep a one-argument lambda rather than passing the bound method:
        # foreachRDD decides how to call the function from its argument
        # count, and a bound method's count would include ``self``.
        rows.foreachRDD(lambda rdd: self.check_and_write(rdd))

        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Stop the streaming context and then the SparkContext.

        The SQLContext is left alone: it has no ``stop()`` method, so the
        previous attempt to call one raised AttributeError before anything
        else was shut down.
        """
        if self.ssc is not None:
            # The SparkContext is stopped explicitly just below.
            self.ssc.stop(stopSparkContext=False)
        if self.sc is not None:
            self.sc.stop()
开发者ID:jvovk,项目名称:kafka-python,代码行数:42,代码来源:kafka_consumer_oop.py

示例6: SparkContext

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
 To run this example use
    `$ bin/spark-submit examples/src/main/python/streaming/queue_stream.py
"""
import sys
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":

    sc = SparkContext(appName="PythonStreamingQueueStream")
    ssc = StreamingContext(sc, 1)

    # Build the queue of RDDs that feeds the QueueInputDStream: five RDDs,
    # each holding the integers 1..1000 split over 10 partitions.
    rddQueue = [
        ssc.sparkContext.parallelize(list(range(1, 1001)), 10)
        for _ in range(5)
    ]

    # Count, per batch, how many numbers share each remainder modulo 10
    # and print the result.
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    # Run for six seconds, then stop both contexts gracefully.
    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
开发者ID:0xqq,项目名称:spark,代码行数:31,代码来源:queue_stream.py

示例7: long

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
      return [{"time": datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S"), "orderId": long(s[1]), "clientId": long(s[2]), "symbol": s[3],
      "amount": int(s[4]), "price": float(s[5]), "buy": s[6] == "B"}]
  except Exception as err:
      print("Wrong line format (%s): " % line)
      return []

# Parse every incoming line; parseOrder returns a list (one dict, or empty on
# a malformed line), so flatMap flattens the results into a stream of dicts.
orders = filestream.flatMap(parseOrder)

from operator import add
# Count orders per 'buy' flag within each batch (1L is a Python 2 long literal).
numPerType = orders.map(lambda o: (o['buy'], 1L)).reduceByKey(add)

# Collapse each batch to a single partition and save it using the given
# path prefix and the "txt" suffix.
numPerType.repartition(1).saveAsTextFiles("/home/spark/ch06output/output", "txt")

ssc.start()

# NOTE(review): stop() follows start() with no wait in between; presumably
# these listing steps are meant to be entered one at a time in a shell --
# confirm before running the file as a script.
# False is passed as the first positional argument; `sc` is used again below.
ssc.stop(False)

# Read back every file produced by the streaming job.
allCounts = sc.textFile("/home/spark/ch06output/output*.txt")

# section 6.1.8
# Start over with a new StreamingContext (5 second batches) on the same sc.
from pyspark.streaming import StreamingContext
ssc = StreamingContext(sc, 5)
filestream = ssc.textFileStream("/home/spark/ch06input")

from datetime import datetime
def parseOrder(line):
  s = line.split(",")
  try:
      if s[6] != "B" and s[6] != "S":
        raise Exception('Wrong format')
      return [{"time": datetime.strptime(s[0], "%Y-%m-%d %H:%M:%S"), "orderId": long(s[1]), "clientId": long(s[2]), "symbol": s[3],
开发者ID:AkiraKane,项目名称:first-edition,代码行数:33,代码来源:ch06-listings.py

示例8:

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
# Raise the JVM root logger to ERROR so the word counts stay readable.
log4j = sc._jvm.org.apache.log4j
log4j.LogManager.getRootLogger().setLevel(log4j.Level.ERROR)

try:
    # Create a DStream to connect to the wordsocket.py random word server
    lines = ssc.socketTextStream("localhost", 6568)

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()
except Exception:
    # Still best effort: report the error type and carry on. Catching
    # Exception (not a bare except) lets Ctrl-C and SystemExit through, and
    # the %-formatted call prints the same text on Python 2 and 3.
    print("Unexpected error: %s" % (sys.exc_info()[0],))

# Start the computation
ssc.start()

# Let the computation run for 30 seconds, then shut down.
# (ssc.awaitTerminationOrTimeout(30) is the streaming-API alternative.)
import time
try:
    time.sleep(30)
finally:
    # Stop the context even if the wait is interrupted.
    ssc.stop()

开发者ID:mekasone,项目名称:biginsight-examples,代码行数:31,代码来源:sparkstreaming.py

示例9: ThunderStreamingContext

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class ThunderStreamingContext(object):
    """
    The streaming context is responsible for creating DStreams and RDDs based on data stored in a time-ordered
    database.
    """

    DATA_KEY = 'data'
    REGRESSOR_KEY = 'regressor'

    def __init__(self, tsc, sc, batch_time=5):
        """
        Wrap ``sc`` in a StreamingContext and initialise the streaming state.

        :param tsc: Stored as ``self._tsc``; not used further in this method.
        :param sc: SparkContext the StreamingContext is built on.
        :param batch_time: Batch interval passed to StreamingContext (default 5).
        """
        self._tsc, self._sc = tsc, sc
        self.ssc = StreamingContext(sc, batch_time)
        self.batch_time = batch_time

        # Defaults
        self.rows_per_partition = 5
        self._poll_time = 3

        self.dstream_loaders = []

        # Both stay None until loadConfig() has run successfully.
        self._hbase_manager = None
        self._feeder = None

    def loadConfig(self, filename=None):
        """
        Create the HBase manager and the feeder for an ETL configuration file.

        :param filename: Name of the ETL configuration file to load. When it is omitted or empty, the value
        of the ETL_CONFIG environment variable is used instead.
        :return: None. If no file name can be determined, a warning is logged and nothing is created.
        """
        config_path = filename or os.environ.get('ETL_CONFIG')
        if not config_path:
            warningLog("Could not load a configuration file (did you pass one in as an argument to thunder_streaming?).")
            return
        hbase = HBaseManager()
        # Both attributes are assigned only after the feeder was built.
        self._feeder = Feeder(config_path, hbase)
        self._hbase_manager = hbase

    def set_partition_size(self, rows_per_partition):
        """
        Sets the number of rows to include in each partition

        :param rows_per_partition: New value stored in ``self.rows_per_partition``.
        :return: None
        """
        self.rows_per_partition = rows_per_partition

    def start_feeder(self):
        if not self._feeder:
            warningLog("Cannot start until a streaming configuration file has been loaded.")
        self._feeder.start()

    def start_streaming(self):
        """Start the wrapped StreamingContext (``self.ssc``)."""
        self.ssc.start()

    def stop_streaming(self):
        """
        Stop every registered DStream loader, then the StreamingContext.

        ``stopSparkContext=False`` is passed, so only the streaming context is asked to stop.
        """
        for dstream_loader in self.dstream_loaders:
            dstream_loader.stop()
        self.ssc.stop(stopSparkContext=False)

    def _loadBytes(self, datasetId=DATA_KEY, minTime=0, maxTime=10):
        """
        Load the rows of ``datasetId`` for ``range(minTime, maxTime)`` into an RDD.

        The range is cut into chunks of ``self.rows_per_partition`` values with ``grouper``. Each chunk
        becomes one ``(first, last)`` element of a parallelized RDD and is expanded, through flatMap,
        into whatever ``HBaseManager.get_rows`` returns for those bounds.

        :param datasetId: Dataset identifier passed to ``get_rows``.
        :param minTime: First value of the time range.
        :param maxTime: End of the time range, as given to ``range``.
        :return: RDD of the elements returned by ``get_rows``.
        """
        def _lb(bounds):
            # Unpack inside the body: tuple parameters in a signature
            # (``lambda (first, last): ...``) are Python 2-only syntax.
            first, last = bounds
            # TODO optimize this
            manager = HBaseManager()
            manager.initialize()
            # NOTE(review): this used to query (minTime, maxTime) for every
            # chunk, so each chunk returned the whole range and rows were
            # duplicated once per chunk. The chunk's own bounds are used now
            # -- confirm get_rows treats them like (minTime, maxTime).
            keyed_byte_arrs = manager.get_rows(datasetId, first, last)
            if len(keyed_byte_arrs) == 0:
                return []
            return keyed_byte_arrs
        chunk_iter = grouper(range(minTime, maxTime), self.rows_per_partition)
        chunk_rdd = self._sc.parallelize([(group[0], group[-1] + 1) for group in chunk_iter])
        return chunk_rdd.flatMap(_lb)

    def loadImages(self, datasetId=DATA_KEY, dtype='uint16', dims=None, minTime=0, maxTime=10):
        """
        Load a dataset as ``Images``.

        Each ``(key, value)`` element produced by ``_loadBytes`` is decoded with ``np.fromstring`` using
        ``dtype`` and reshaped to ``dims``.

        :param datasetId: Dataset identifier forwarded to ``_loadBytes``.
        :param dtype: NumPy dtype used to decode each value.
        :param dims: Shape passed to ``reshape`` and to ``Images``.
        :param minTime: Start of the time range forwarded to ``_loadBytes``.
        :param maxTime: End of the time range forwarded to ``_loadBytes``.
        :return: ``Images`` built from the decoded RDD.
        """
        # Named ``raw`` rather than ``bytes`` so the builtin is not shadowed.
        raw = self._loadBytes(datasetId, minTime, maxTime)
        # Index the pair instead of using a tuple-parameter lambda, which is
        # a syntax error on Python 3.
        rdd = raw.map(lambda kv: (kv[0], np.fromstring(kv[1], dtype=dtype).reshape(dims)))
        return Images(rdd, dims=dims, dtype=dtype)

    def loadSeries(self, datasetId=DATA_KEY, dtype='uint16', minTime=0, maxTime=10):
        """
        Load a dataset as ``Series``.

        Every ``(key, value)`` element from ``_loadBytes`` is decoded with ``np.fromstring`` and split into
        ``(index, (key, element))`` records. Records are grouped by index, and each group is reduced to
        its elements ordered by sorting the ``(key, element)`` pairs.

        :param datasetId: Dataset identifier forwarded to ``_loadBytes``.
        :param dtype: NumPy dtype used to decode each value.
        :param minTime: Start of the time range forwarded to ``_loadBytes``.
        :param maxTime: End of the time range forwarded to ``_loadBytes``.
        :return: ``Series`` built from the grouped RDD.
        """
        # Named ``raw`` rather than ``bytes`` so the builtin is not shadowed.
        raw = self._loadBytes(datasetId, minTime, maxTime)

        # Plain functions with explicit unpacking replace the tuple-parameter
        # lambdas (Python 2-only syntax). The list comprehensions give real
        # lists on Python 3 too, where ``map`` would be a lazy iterator.
        def explode(record):
            key, blob = record
            return [(idx, (key, value))
                    for idx, value in enumerate(np.fromstring(blob, dtype=dtype))]

        def collapse(group):
            idx, keyed_values = group
            return idx, [value for _, value in sorted(keyed_values)]

        rdd = raw.flatMap(explode).groupByKey().map(collapse)
        return Series(rdd, dtype=dtype)

    def _loadBytesDStream(self, datasetId=DATA_KEY):
        """
        """
        jvm = self._sc._jvm
        java_import(jvm, "thunder_streaming.receivers.*")

        feeder_conf = self._feeder.conf
        ser = PairDeserializer(NoOpSerializer(), NoOpSerializer())

        try:
            # TODO: are there closure problems with this approach? (why do Jascha/KafkaUtils do it differently?)
            dstream = DStream(
                self.ssc._jssc.receiverStream(jvm.HBaseReceiver(
                    ListConverter().convert(feeder_conf.get_sequence_names(), jvm._gateway_client),
#.........这里部分代码省略.........
开发者ID:andrewosh,项目名称:TS2,代码行数:103,代码来源:context.py

示例10: PySparkStreamingTestCase

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 30  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        """Create the SparkContext shared by the tests of this class.

        The application is named after the class, default parallelism is set
        to 1 and checkpoints go to a new temporary directory.
        """
        conf = SparkConf().set("spark.default.parallelism", 1)
        cls.sc = SparkContext(appName=cls.__name__, conf=conf)
        checkpoint_dir = tempfile.mkdtemp()
        cls.sc.setCheckpointDir(checkpoint_dir)

    @classmethod
    def tearDownClass(cls):
        """Stop the shared SparkContext, then try a best-effort JVM-side stop."""
        cls.sc.stop()
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            jSparkContextOption = SparkContext._jvm.SparkContext.get()
            if jSparkContextOption.nonEmpty():
                jSparkContextOption.get().stop()
        except Exception:
            # Deliberately best effort, but no longer a bare ``except``:
            # KeyboardInterrupt and SystemExit must not be swallowed.
            pass

    def setUp(self):
        # A new StreamingContext for every test, built on the shared ``sc``
        # with the class-level batch ``duration``.
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        """Stop the test's StreamingContext and any streaming context still active in the JVM.

        ``False`` is passed to ``stop`` in both places, as before.
        """
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM just in case there has been some issues in Python API
        try:
            # Query the JVM StreamingContext, not SparkContext: the result is
            # treated as an optional streaming context and stopped with
            # stop(False). The old SparkContext lookup could never supply one.
            jStreamingContextOption = StreamingContext._jvm.StreamingContext.getActive()
            if jStreamingContextOption.nonEmpty():
                jStreamingContextOption.get().stop(False)
        except Exception:
            # Deliberately best effort, but no longer a bare ``except``:
            # KeyboardInterrupt and SystemExit must not be swallowed.
            pass

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Start the context and gather up to `n` elements from the stream's RDDs.

        :return: list of the elements taken so far once `n` were gathered or
                 the wait timed out.
        """
        taken = []

        def take(_, rdd):
            missing = n - len(taken)
            if rdd and missing > 0:
                taken.extend(rdd.take(missing))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(taken, n)
        return taken

    def _collect(self, dstream, n, block=True):
        """
        Gather the non-empty output of each RDD into a list, one entry per RDD.

        When `block` is false the output hook is registered and the list is
        returned at once, without starting the context. Otherwise the context
        is started and the call waits for `n` entries or the timeout.

        :return: list, which will have the collected items.
        """
        batches = []

        def get_output(_, rdd):
            # Ignore missing RDDs and anything arriving after the n-th batch.
            if not rdd or len(batches) >= n:
                return
            rows = rdd.collect()
            if rows:
                batches.append(rows)

        dstream.foreachRDD(get_output)

        if block:
            self.ssc.start()
            self.wait_for(batches, n)
        return batches

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
#.........这里部分代码省略.........
开发者ID:Brett-A,项目名称:spark,代码行数:103,代码来源:streamingutils.py

示例11: mode

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
            mode('append').options(table="raw_metrics", keyspace="metrics").save()
        readingsDataFrame.unpersist()
    except:
        pass

def main():
    """Stream readings from Kafka and hand every batch to ``process``.

    Relies on the module-level ``sc``, ``ssc``, ``zk_ip``, ``topic`` and
    ``decoder``. Blocks until the streaming context terminates.
    """
    # NOTE(review): the handle is never used below; presumably it is created
    # for its side effects -- confirm before removing.
    sqlContext = SQLContext(sc)

    zk_host = zk_ip + ":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions = {topic: 1}

    def to_row(reading):
        # One Row per decoded message; the timestamp and the value are
        # converted from their transported representation.
        return Row(
            device_id=reading["device_id"],
            metric_time=datetime.datetime.fromtimestamp(int(reading["metric_time"])),
            metric_name=reading["metric_name"],
            metric_value=float(reading["metric_value"]))

    # Create the Kafka stream and keep only the second item of each pair.
    kvs = KafkaUtils.createStream(ssc, zk_host, consumer_group, kafka_partitions,
                                  valueDecoder=decoder)
    readings = kvs.map(lambda x: x[1]).map(to_row)
    readings.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()

if __name__ == "__main__":
    # main() reads these two module-level names.
    sc = SparkContext(appName="ReadingWriter")
    ssc = StreamingContext(sc, 10)
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: stop the streaming context and the SparkContext gracefully.
        print("Gracefully stopping Spark Streaming Application")
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
开发者ID:vadlak,项目名称:energyiot,代码行数:32,代码来源:writemetrics.py

示例12: StreamingContextTests

# 需要导入模块: from pyspark.streaming import StreamingContext [as 别名]
# 或者: from pyspark.streaming.StreamingContext import stop [as 别名]
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        """Attach a queue stream with an output hook, without starting the context."""
        batches = [range(1, upper) for upper in range(101)]
        queue_stream = self.ssc.queueStream(batches)
        # block=False: only register the hook; the returned list is not needed.
        self._collect(queue_stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        # After ssc.stop(False) the SparkContext must still run jobs.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        partitions = self.sc.parallelize(range(5), 5).glom().collect()
        self.assertEqual(len(partitions), 5)

    def test_stop_multiple_times(self):
        # The test passes as long as the second stop() on the same context
        # does not raise; there is no explicit assertion.
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        # Three batches of growing length must come back unchanged, in order.
        batches = [list(range(size + 1)) for size in range(3)]
        collected = self._collect(self.ssc.queueStream(batches), 3)
        self.assertEqual(batches, collected)

    def test_text_file_stream(self):
        # Two files dropped into a watched directory must show up as two
        # batches of the integers 0..9.
        watch_dir = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        as_ints = self.ssc.textFileStream(watch_dir).map(int)
        result = self._collect(as_ints, 2, block=False)
        self.ssc.start()
        payload = ["%d\n" % i for i in range(10)]
        for name in ('a', 'b'):
            # The one second pause precedes each write, after the context started.
            time.sleep(1)
            with open(os.path.join(watch_dir, name), "w") as f:
                f.writelines(payload)
        self.wait_for(result, 2)
        expected = [list(range(10)), list(range(10))]
        self.assertEqual(expected, result)

    def test_binary_records_stream(self):
        # Two 10-byte files dropped into a watched directory must show up as
        # two batches, each holding one record with the bytes 0..9.
        watch_dir = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)

        def unpack_record(v):
            return struct.unpack("10b", bytes(v))

        records = self.ssc.binaryRecordsStream(watch_dir, 10).map(unpack_record)
        result = self._collect(records, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            # The one second pause precedes each write, after the context started.
            time.sleep(1)
            with open(os.path.join(watch_dir, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        first_records = [list(v[0]) for v in result]
        self.assertEqual([list(range(10)), list(range(10))], first_records)

    def test_union(self):
        # The union of two identical queue streams yields every batch twice over.
        batches = [list(range(size + 1)) for size in range(3)]
        left = self.ssc.queueStream(batches)
        right = self.ssc.queueStream(batches)
        result = self._collect(self.ssc.union(left, right), 3)
        expected = [batch * 2 for batch in batches]
        self.assertEqual(expected, result)

    def test_transform(self):
        # transform() over three streams: the function receives one RDD per
        # stream and the result keeps the order chosen in it (2, 3, 1).
        sources = [self.ssc.queueStream([[value]]) for value in (1, 2, 3)]

        def func(rdds):
            first, second, third = rdds
            return second.union(third).union(first)

        combined = self.ssc.transform(sources, func)

        self.assertEqual([2, 3, 1], self._take(combined, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        singles = self.ssc.queueStream([[1], [2], [3]])
        paired = singles.transform(lambda rdd: rdd.cartesian(rdd))
        expected = [(1, 1), (2, 2), (3, 3)]
        self.assertEqual(expected, self._take(paired, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
#.........这里部分代码省略.........
开发者ID:Brett-A,项目名称:spark,代码行数:103,代码来源:test_context.py


注:本文中的pyspark.streaming.StreamingContext.stop方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。