

Python StreamingContext.queueStream Method Code Examples

This article collects typical usage examples of the Python method pyspark.streaming.context.StreamingContext.queueStream, drawn from open-source projects. If you are wondering what StreamingContext.queueStream does and how to use it, the curated examples below should help. You can also explore further usage examples of the enclosing class, pyspark.streaming.context.StreamingContext.


Nine code examples of StreamingContext.queueStream are shown below, ordered by popularity by default.
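
Before the examples, a minimal, self-contained sketch of the method may help (a hedged illustration written for this article, not taken from the examples below; the batch interval and timeout are arbitrary). queueStream builds a DStream from a queue of RDDs, or from plain lists that it parallelizes for you, and its signature is queueStream(rdds, oneAtATime=True, default=None):

from pyspark import SparkContext
from pyspark.streaming.context import StreamingContext

sc = SparkContext(master="local[2]", appName="QueueStreamSketch")
ssc = StreamingContext(sc, 1)  # 1-second batch interval

# Each queued list is parallelized into an RDD; with the default
# oneAtATime=True, exactly one queued RDD is consumed per batch.
stream = ssc.queueStream([[1, 2, 3], [4, 5, 6]])
stream.pprint()

ssc.start()
ssc.awaitTerminationOrTimeout(5)  # let a few batches run
ssc.stop(stopSparkContext=True)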

Example 1: PySparkStreamingTestCase

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
class PySparkStreamingTestCase(unittest.TestCase):

    timeout = 10  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        class_name = cls.__name__
        conf = SparkConf().set("spark.default.parallelism", 1)
        cls.sc = SparkContext(appName=class_name, conf=conf)
        cls.sc.setCheckpointDir("/tmp")

    @classmethod
    def tearDownClass(cls):
        cls.sc.stop()
        # Clean up in the JVM in case there have been issues in the Python API
        jSparkContextOption = SparkContext._jvm.SparkContext.get()
        if jSparkContextOption.nonEmpty():
            jSparkContextOption.get().stop()

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        if self.ssc is not None:
            self.ssc.stop(False)
        # Clean up in the JVM in case there have been issues in the Python API
        jStreamingContextOption = StreamingContext._jvm.SparkContext.getActive()
        if jStreamingContextOption.nonEmpty():
            jStreamingContextOption.get().stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def _test_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists.
        @param func: wrapped function. This function should return PythonDStream object.
        @param expected: expected output for this testcase.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]
        input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None

        # Apply test function to stream.
        if input2:
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

        result = self._collect(stream, len(expected))
        if sort:
            self._sort_result_based_on_key(result)
            self._sort_result_based_on_key(expected)
#......... remainder of this code omitted .........
Author: anitatailor | Project: spark | Lines: 103 | Source: tests.py
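
As a hedged illustration of how this base class is consumed (the subclass and test names here are hypothetical; the pattern matches example 9 below), a test supplies input batches, the transformation under test, and the expected per-batch output to _test_func:

class MapOperationTest(PySparkStreamingTestCase):  # hypothetical subclass

    def test_map(self):
        input = [[1, 2], [3, 4]]             # two input batches
        expected = [["1", "2"], ["3", "4"]]  # expected output per batch

        def func(dstream):
            return dstream.map(str)

        self._test_func(input, func, expected)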

Example 2: setupFunc

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
def setupFunc():
    ssc = StreamingContext(self.sc, self.duration)
    ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
    self.setupCalled = True
    return ssc
Author: anitatailor | Project: spark | Lines: 7 | Source: tests.py
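
In the Spark test suite this closure is defined inside a test method (hence the references to self) and handed to StreamingContext.getActiveOrCreate, which calls it only when no active context exists. A hedged sketch of the surrounding call, with no checkpoint directory involved:

self.setupCalled = False
# The first argument is a checkpoint path; None means "do not recover
# from a checkpoint". setupFunc is invoked because nothing is active.
ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
self.assertTrue(self.setupCalled)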

Example 3: SparkContext

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

conf = SparkConf()  # conf was referenced but undefined in the truncated listing
sc = SparkContext(master="local[*]", appName="StreamingOps", conf=conf)
ssc = StreamingContext(sc, 1)

rdd1 = ssc.sparkContext.parallelize(["a", "b", "c", "c", "c"])
rdd2 = ssc.sparkContext.parallelize(["1,2,3,4,5"])
rdd3 = ssc.sparkContext.parallelize([("k1", "r1"), ("k2", "r2"), ("k3", "r3")])
rdd4 = ssc.sparkContext.parallelize([("k1", "s1"), ("k2", "s2")])
rdd5 = ssc.sparkContext.range(1, 6)

q1 = [rdd1]
q2 = [rdd2]
q3 = [rdd3]
q4 = [rdd4]
q5 = [rdd5]

ds1 = ssc.queueStream(q1, True)
ds2 = ssc.queueStream(q2, True)
ds3 = ssc.queueStream(q3, True)
ds4 = ssc.queueStream(q4, True)
ds5 = ssc.queueStream(q5, True)

# [How to run] Uncomment the example you want below and run it!!

# Section 6.3.1 (in Python it is pprint, not print)
# ds1.pprint()

# Section 6.3.2
# ds1.map(lambda v: (v, 1)).pprint()

# Section 6.3.3
# ds2.flatMap(lambda v: v.split(",")).pprint()
Author: oopchoi | Project: spark | Lines: 33 | Source: streaming_ops.py
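
The listing above is truncated (the source file is 33 lines), so the driver code that actually runs a selected example is not shown. A hedged sketch of a bounded run that could follow the uncommented line:

ssc.start()
ssc.awaitTerminationOrTimeout(10)  # let a few 1-second batches fire
ssc.stop(stopSparkContext=True)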

Example 4: StreamingContextTests

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1
    setupCalled = False

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        self.assertEqual([2, 3, 1], self._take(dstream, 3))

    def test_get_active(self):
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that getActive() returns the active context
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # Verify that getActive() returns None
        self.ssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

        # Verify that if the Java context is stopped, then getActive() returns None
        self.ssc = StreamingContext(self.sc, self.duration)
        self.ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertEqual(StreamingContext.getActive(), None)

    def test_get_active_or_create(self):
        # Test StreamingContext.getActiveOrCreate() without checkpoint data
        # See CheckpointTests for tests with checkpoint data
#......... remainder of this code omitted .........
Author: anitatailor | Project: spark | Lines: 103 | Source: tests.py
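
The truncated test_get_active_or_create pairs with the setupFunc closure from example 2. A hedged, simplified sketch of the pattern (the real suite covers more branches):

def test_get_active_or_create(self):
    self.setupCalled = False

    def setupFunc():
        ssc = StreamingContext(self.sc, self.duration)
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())
        self.setupCalled = True
        return ssc

    # With no active context and no checkpoint, setupFunc must be called.
    self.ssc.stop(False)
    self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
    self.assertTrue(self.setupCalled)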

Example 5: StreamingContextTests

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
class StreamingContextTests(PySparkStreamingTestCase):

    duration = 0.1

    def _add_input_stream(self):
        inputs = [range(1, x) for x in range(101)]
        stream = self.ssc.queueStream(inputs)
        self._collect(stream, 1, block=False)

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        self.ssc.stop(False)

    def test_queue_stream(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        result = self._collect(dstream, 3)
        self.assertEqual(input, result)

    def test_text_file_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream2 = self.ssc.textFileStream(d).map(int)
        result = self._collect(dstream2, 2, block=False)
        self.ssc.start()
        for name in ("a", "b"):
            time.sleep(1)
            with open(os.path.join(d, name), "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], result)

    def test_binary_records_stream(self):
        d = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        dstream = self.ssc.binaryRecordsStream(d, 10).map(lambda v: struct.unpack("10b", bytes(v)))
        result = self._collect(dstream, 2, block=False)
        self.ssc.start()
        for name in ("a", "b"):
            time.sleep(1)
            with open(os.path.join(d, name), "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(result, 2)
        self.assertEqual([list(range(10)), list(range(10))], [list(v[0]) for v in result])

    def test_union(self):
        input = [list(range(i + 1)) for i in range(3)]
        dstream = self.ssc.queueStream(input)
        dstream2 = self.ssc.queueStream(input)
        dstream3 = self.ssc.union(dstream, dstream2)
        result = self._collect(dstream3, 3)
        expected = [i * 2 for i in input]
        self.assertEqual(expected, result)

    def test_transform(self):
        dstream1 = self.ssc.queueStream([[1]])
        dstream2 = self.ssc.queueStream([[2]])
        dstream3 = self.ssc.queueStream([[3]])

        def func(rdds):
            rdd1, rdd2, rdd3 = rdds
            return rdd2.union(rdd3).union(rdd1)

        dstream = self.ssc.transform([dstream1, dstream2, dstream3], func)

        self.assertEqual([2, 3, 1], self._take(dstream, 3))
Author: LakeCarrot | Project: EC2_Initializing | Lines: 76 | Source: tests.py
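
The glom() assertion in test_stop_only_streaming_context is a compact liveness check on the shared SparkContext: glom() turns each partition into a list, so a five-partition RDD over range(5) collects to five one-element lists, and the job can only run if sc survived ssc.stop(False). In isolation (a hedged sketch, assuming a live SparkContext named sc):

rdd = sc.parallelize(range(5), 5)   # five partitions, one element each
assert rdd.glom().collect() == [[0], [1], [2], [3], [4]]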

Example 6: StreamingTestCase

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
class StreamingTestCase(SparkTestingBaseReuse):

    """Basic common test case for Spark Streaming tests. Provides a
    Spark Streaming context as well as some helper methods for creating
    streaming input and collecting streaming output.
    Modeled after PySparkStreamingTestCase."""

    timeout = 15  # seconds
    duration = .5

    @classmethod
    def setUpClass(cls):
        super(StreamingTestCase, cls).setUpClass()
        cls.sc.setCheckpointDir("/tmp")

    @classmethod
    def tearDownClass(cls):
        super(StreamingTestCase, cls).tearDownClass()

    @classmethod
    def _sort_result_based_on_key(cls, result):
        # A list comprehension materializes the result on Python 3,
        # where map() returns a lazy iterator.
        return [sorted(x) for x in result]

    def setUp(self):
        self.ssc = StreamingContext(self.sc, self.duration)

    def tearDown(self):
        self.ssc.stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _take(self, dstream, n):
        """
        Return the first `n` elements in the stream (will start and stop).
        """
        results = []

        def take(_, rdd):
            if rdd and len(results) < n:
                results.extend(rdd.take(n - len(results)))

        dstream.foreachRDD(take)

        self.ssc.start()
        self.wait_for(results, n)
        return results

    def _collect(self, dstream, n, block=True):
        """
        Collect each RDDs into the returned list.

        :return: list, which will have the collected items.
        """
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)

        if not block:
            return result

        self.ssc.start()
        self.wait_for(result, n)
        return result

    def run_func(self, input, func, expected, sort=False, input2=None):
        """
        @param input: dataset for the test. This should be list of lists
        or list of RDDs.
        @param input2: Optional second dataset for the test. If provided your
        func must take two PythonDStreams as input.
        @param func: wrapped function. This function should return
        PythonDStream.
        @param expected: expected output for this testcase.
        Warning: If output is longer than expected this will silently
        discard the additional output. TODO: fail when this happens.
        """
        if not isinstance(input[0], RDD):
            input = [self.sc.parallelize(d, 1) for d in input]
        input_stream = self.ssc.queueStream(input)
        if input2 and not isinstance(input2[0], RDD):
            input2 = [self.sc.parallelize(d, 1) for d in input2]

        # Apply test function to stream.
        if input2:
            input_stream2 = self.ssc.queueStream(input2)
            stream = func(input_stream, input_stream2)
        else:
            stream = func(input_stream)

#......... remainder of this code omitted .........
Author: daha | Project: spark-testing-base | Lines: 103 | Source: streamingtestcase.py
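
A hedged sketch of a test built on this class that exercises the two-stream path of run_func (the subclass name and data are illustrative):

class UnionOperationTest(StreamingTestCase):  # hypothetical subclass

    def test_union(self):
        input = [[1, 2]]
        input2 = [[3, 4]]
        expected = [[1, 2, 3, 4]]  # one batch: union of both streams

        def func(dstream1, dstream2):
            return dstream1.union(dstream2)

        self.run_func(input, func, expected, input2=input2)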

Example 7: SparkConf

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
# Section 6.2.3

from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="QueueSample", conf=conf)
ssc = StreamingContext(sc, 3)

rdd1 = sc.parallelize(["a", "b", "c"])
rdd2 = sc.parallelize(["c", "d", "e"])

queue = [rdd1, rdd2]

ds = ssc.queueStream(queue)

ds.pprint()

ssc.start()
ssc.awaitTermination()
Author: oopchoi | Project: spark | Lines: 22 | Source: queue_sample.py
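
awaitTermination() blocks until the context is stopped from outside. For a bounded demo run of the same script, a hedged alternative ending:

ssc.start()
# awaitTerminationOrTimeout returns True if the context stopped
# within the timeout, False otherwise.
if not ssc.awaitTerminationOrTimeout(10):
    ssc.stop(stopSparkContext=True, stopGraceFully=True)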

Example 8: SparkConf

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
# Section 6.1.1, Example 6-3
from pyspark import SparkContext, SparkConf
from pyspark.streaming.context import StreamingContext

conf = SparkConf()
sc = SparkContext(master="local[*]", appName="StreamingSample", conf=conf)
ssc = StreamingContext(sc, 3)

rdd1 = sc.parallelize(["Spark Streaming Sample ssc"])
rdd2 = sc.parallelize(["Spark Queue Spark API"])

inputQueue = [rdd1, rdd2]

lines = ssc.queueStream(inputQueue, True)
words = lines.flatMap(lambda v: v.split(" "))
words.countByValue().pprint()

ssc.start()
ssc.awaitTermination()
Author: oopchoi | Project: spark | Lines: 21 | Source: streaming_sample.py
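
The True passed to queueStream here is oneAtATime, so each queued RDD is consumed in its own 3-second batch and countByValue reports per-RDD word counts. With oneAtATime=False, a batch drains everything remaining in the queue at once (a hedged variant of the line above):

# All queued RDDs are consumed together, so "Spark" is counted
# across rdd1 and rdd2 in a single batch.
lines = ssc.queueStream(inputQueue, False)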

Example 9: BasicOperationTests

# Required import: from pyspark.streaming.context import StreamingContext [as alias]
# Or: from pyspark.streaming.context.StreamingContext import queueStream [as alias]
class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)
        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))
        expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
                    for x in input]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)
        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()
        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])

        time_vals = []

        def get_times(t, rdd):
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)
        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)
        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)
        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
#......... remainder of this code omitted .........
Author: ahnqirage | Project: spark | Lines: 103 | Source: tests.py
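
These cases build on the PySparkStreamingTestCase harness from example 1 and are ordinary unittest tests, so a single case can be run by dotted name; a hedged sketch, assuming the classes live in a local tests.py on the PYTHONPATH and pyspark is importable:

import unittest

# Load and run one streaming test case by its dotted name.
suite = unittest.defaultTestLoader.loadTestsFromName("tests.BasicOperationTests.test_map")
unittest.TextTestRunner(verbosity=2).run(suite)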


Note: The pyspark.streaming.context.StreamingContext.queueStream examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.