當前位置: 首頁>>代碼示例>>Python>>正文


Python KafkaUtils.createStream方法代碼示例

本文整理匯總了Python中pyspark.streaming.kafka.KafkaUtils.createStream方法的典型用法代碼示例。如果您正苦於以下問題:Python KafkaUtils.createStream方法的具體用法?Python KafkaUtils.createStream怎麼用?Python KafkaUtils.createStream使用的例子?那麼, 這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在pyspark.streaming.kafka.KafkaUtils的用法示例。


在下文中一共展示了KafkaUtils.createStream方法的7個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: bluecoat_parse

# 需要導入模塊: from pyspark.streaming.kafka import KafkaUtils [as 別名]
# 或者: from pyspark.streaming.kafka.KafkaUtils import createStream [as 別名]
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Consume bluecoat proxy logs from Kafka, parse them and hand them to `save_data`.

    The call blocks until the streaming context terminates.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic; also used as the Spark application name
                  and as the consumer group id passed to `KafkaUtils.createStream`
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers (converted with `int`)
    :param batch_size: batch size for Apache Spark streaming context (converted with `int`)
    """
    workers = int(num_of_workers)

    # Spark entry points; the application is named after the topic.
    sc = SparkContext(appName=topic)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    kafka_stream = KafkaUtils.createStream(
        ssc, zk, topic, {topic: workers},
        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    # Keep only element 1 of every record, then break it into individual lines.
    payloads = kafka_stream.map(lambda record: record[1])
    raw_lines = payloads.flatMap(lambda payload: payload.split("\n"))

    # Discard lines that `rex_date` does not match at their start.
    dated_lines = raw_lines.filter(lambda line: rex_date.match(line))

    # Trim line terminators, turn tabs into spaces and collapse double spaces once.
    clean_lines = dated_lines.map(
        lambda line: line.strip("\n").strip("\r").replace("\t", " ").replace("  ", " "))

    entries = clean_lines.map(lambda line: split_log_entry(line))
    proxy_data = entries.map(lambda entry: proxy_parser(entry))

    # Every micro-batch RDD is passed to `save_data` together with the Hive context.
    proxy_data.foreachRDD(lambda rdd: save_data(rdd, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
開發者ID:apache,項目名稱:incubator-spot,代碼行數:27,代碼來源:bluecoat.py

示例2: __init__

# 需要導入模塊: from pyspark.streaming.kafka import KafkaUtils [as 別名]
# 或者: from pyspark.streaming.kafka.KafkaUtils import createStream [as 別名]
def __init__(self, ssc, zkQuorum, groupId, topics):
        """Create the underlying Kafka DStream and keep it on the instance.

        Keys are passed through untouched; values are decoded with
        `common.serializer.deserialize`.

        :param ssc: streaming context handed to `KafkaUtils.createStream`
        :param zkQuorum: ZooKeeper quorum handed to `KafkaUtils.createStream`
        :param groupId: consumer group id handed to `KafkaUtils.createStream`
        :param topics: topics argument handed to `KafkaUtils.createStream`
        """
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        decoders = {
            'keyDecoder': lambda key: key,  # identity: keys stay as received
            'valueDecoder': deserialize,
        }
        self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics, **decoders)
開發者ID:apache,項目名稱:incubator-spot,代碼行數:8,代碼來源:streaming.py

示例3: __init__

# 需要導入模塊: from pyspark.streaming.kafka import KafkaUtils [as 別名]
# 或者: from pyspark.streaming.kafka.KafkaUtils import createStream [as 別名]
def __init__(self, ssc, zkQuorum, groupId, topics):
        """Build the Kafka-backed DStream that this object wraps.

        The stream is created with an identity key decoder and with
        `common.serializer.deserialize` as the value decoder.

        :param ssc: streaming context forwarded to `KafkaUtils.createStream`
        :param zkQuorum: ZooKeeper quorum forwarded to `KafkaUtils.createStream`
        :param groupId: consumer group id forwarded to `KafkaUtils.createStream`
        :param topics: topics argument forwarded to `KafkaUtils.createStream`
        """
        from common.serializer       import deserialize
        from pyspark.streaming.kafka import KafkaUtils

        stream_args = (ssc, zkQuorum, groupId, topics)
        self.__dstream = KafkaUtils.createStream(
            *stream_args,
            keyDecoder=lambda raw_key: raw_key,
            valueDecoder=deserialize)
開發者ID:apache,項目名稱:incubator-spot,代碼行數:8,代碼來源:streaming.py

示例4: test_kafka_stream

# 需要導入模塊: from pyspark.streaming.kafka import KafkaUtils [as 別名]
# 或者: from pyspark.streaming.kafka.KafkaUtils import createStream [as 別名]
def test_kafka_stream(self):
        """Check that data sent to a new topic is read back via KafkaUtils.createStream."""
        topic_name = self._randomTopic()
        expected = {"a": 3, "b": 5, "c": 10}

        kafka = self._kafkaTestUtils
        kafka.createTopic(topic_name)
        kafka.sendMessages(topic_name, expected)

        # NOTE(review): "smallest" presumably makes the consumer start from the
        # earliest offset so the messages sent above are seen -- confirm.
        kafka_params = {"auto.offset.reset": "smallest"}
        stream = KafkaUtils.createStream(
            self.ssc, kafka.zkAddress(), "test-streaming-consumer",
            {topic_name: 1}, kafka_params)

        self._validateStreamResult(expected, stream)
開發者ID:runawayhorse001,項目名稱:LearningApacheSpark,代碼行數:14,代碼來源:tests.py

示例5: _startContext

# 需要導入模塊: from pyspark.streaming.flume import FlumeUtils [as 別名]
# 或者: from pyspark.streaming.flume.FlumeUtils import createStream [as 別名]
def _startContext(self, n, compressed):
        """Start the StreamingContext on a Flume stream and return the result list.

        The returned list is filled by the stream callback after this method
        returns, and never grows beyond `n` events.

        :param n: maximum number of events to keep
        :param compressed: forwarded as `enableDecompression` to `FlumeUtils.createStream`
        :return: the (initially empty) list the callback appends events to
        """
        collected = []

        def _collect(_, rdd):
            # Events beyond the first n are dropped.
            for item in rdd.collect():
                if len(collected) < n:
                    collected.append(item)

        flume_stream = FlumeUtils.createStream(
            self.ssc, "localhost", self._utils.getTestPort(),
            enableDecompression=compressed)
        flume_stream.foreachRDD(_collect)

        self.ssc.start()
        return collected
開發者ID:runawayhorse001,項目名稱:LearningApacheSpark,代碼行數:15,代碼來源:tests.py

示例6: test_kinesis_stream_api

# 需要導入模塊: from pyspark.streaming.kinesis import KinesisUtils [as 別名]
# 或者: from pyspark.streaming.kinesis.KinesisUtils import createStream [as 別名]
def test_kinesis_stream_api(self):
        """Call KinesisUtils.createStream with and without explicit AWS keys.

        Only stream construction is exercised; nothing is asserted on the
        returned streams.
        """
        # Don't start the StreamingContext because we cannot test it in Jenkins
        shared_args = (
            self.ssc, "myAppNam", "mySparkStream",
            "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
            InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
        aws_keys = ("awsAccessKey", "awsSecretKey")

        stream_without_keys = KinesisUtils.createStream(*shared_args)
        stream_with_keys = KinesisUtils.createStream(*(shared_args + aws_keys))
開發者ID:runawayhorse001,項目名稱:LearningApacheSpark,代碼行數:13,代碼來源:tests.py

示例7: test_kinesis_stream

# 需要導入模塊: from pyspark.streaming.kinesis import KinesisUtils [as 別名]
# 或者: from pyspark.streaming.kinesis.KinesisUtils import createStream [as 別名]
def test_kinesis_stream(self):
        """Push data into a Kinesis stream and check it arrives through the DStream.

        When `are_kinesis_tests_enabled` is falsy, a note is written to stderr
        and the test returns without doing anything. Otherwise the numbers
        1..10 are pushed repeatedly (every 10 seconds, for up to 120 seconds)
        until their string forms have all been received. The streaming
        context is stopped and the stream and DynamoDB table are deleted in
        every case.
        """
        if not are_kinesis_tests_enabled:
            skip_note = "Skipped test_kinesis_stream (enable by setting environment variable %s=1"
            sys.stderr.write(skip_note % kinesis_test_environ_var)
            return

        import random
        import traceback

        app_name = "KinesisStreamTests-%d" % random.randint(0, 10000000)
        test_utils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
        try:
            test_utils.createStream()
            credentials = test_utils.getAWSCredentials()
            stream = KinesisUtils.createStream(
                self.ssc, app_name, test_utils.streamName(),
                test_utils.endpointUrl(), test_utils.regionName(),
                InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
                credentials.getAWSAccessKeyId(), credentials.getAWSSecretKey())

            received = []

            def _collect(_, rdd):
                # Accumulate every record of every batch.
                received.extend(rdd.collect())

            stream.foreachRDD(_collect)
            self.ssc.start()

            payload = list(range(1, 11))
            expected = set(str(number) for number in payload)
            started_at = time.time()
            # Keep re-sending until everything has shown up or 120 seconds elapse.
            while time.time() - started_at < 120:
                test_utils.pushData(payload)
                if expected == set(received):
                    break
                time.sleep(10)
            self.assertEqual(expected, set(received))
        except:
            # Print the traceback here, then let the failure propagate.
            traceback.print_exc()
            raise
        finally:
            # NOTE(review): the False argument presumably keeps the SparkContext
            # alive for other tests -- confirm against StreamingContext.stop.
            self.ssc.stop(False)
            test_utils.deleteStream()
            test_utils.deleteDynamoDBTable(app_name)


# Search jar in the project dir using the jar name_prefix for both sbt build and maven build because
# the artifact jars are in different directories. 
開發者ID:runawayhorse001,項目名稱:LearningApacheSpark,代碼行數:51,代碼來源:tests.py


注:本文中的pyspark.streaming.kafka.KafkaUtils.createStream方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。