This article collects typical usage examples of the Python method pyspark.streaming.kafka.KafkaUtils.createStream. If you have been wondering what exactly KafkaUtils.createStream does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the containing class, pyspark.streaming.kafka.KafkaUtils.
The following presents six code examples of KafkaUtils.createStream, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python code samples.
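Before the individual examples, here is a minimal, self-contained sketch of the receiver-based createStream API (available through Spark 2.x; the pyspark.streaming.kafka module was removed in Spark 3.0). The ZooKeeper address, consumer group, and topic name are placeholders, not values taken from the examples below:

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="kafka-createstream-demo")
ssc = StreamingContext(sc, 10)  # 10-second batches

# The topics dict maps each topic name to the number of consumer threads
# used by the receiver; the result is a DStream of (key, value) pairs,
# decoded as UTF-8 strings by default.
stream = KafkaUtils.createStream(ssc, "localhost:2181", "example-group",
                                 {"example-topic": 1})
stream.map(lambda kv: kv[1]).pprint()

ssc.start()
ssc.awaitTermination()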
Example 1: bluecoat_parse
# Required import: from pyspark.streaming.kafka import KafkaUtils [as alias]
# Or: from pyspark.streaming.kafka.KafkaUtils import createStream [as alias]
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder,
                                        valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
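The helpers rex_date, spot_decoder, split_log_entry, proxy_parser, and save_data come from the surrounding Apache Spot ingest module and are not shown here. As a rough illustration only (an assumption, not the actual Spot implementation), a decoder passed to keyDecoder/valueDecoder just turns the raw Kafka bytes into something the rest of the pipeline can handle, for example:

def spot_decoder(s):
    # Hypothetical stand-in: pass messages through unchanged, decoding
    # bytes as UTF-8 and ignoring malformed characters.
    if s is None:
        return None
    return s.decode("utf-8", errors="ignore") if isinstance(s, bytes) else s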
Example 2: __init__
# Required import: from pyspark.streaming.kafka import KafkaUtils [as alias]
# Or: from pyspark.streaming.kafka.KafkaUtils import createStream [as alias]
def __init__(self, ssc, zkQuorum, groupId, topics):
    from common.serializer import deserialize
    from pyspark.streaming.kafka import KafkaUtils

    self.__dstream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics,
                                             keyDecoder=lambda x: x,
                                             valueDecoder=deserialize)
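Note on the arguments: topics is a dict mapping each Kafka topic name to the number of consumer threads the receiver should use for it, and keyDecoder/valueDecoder override the default UTF-8 decoding (here the key is kept raw and the value is deserialized with the project's own deserialize function). A hypothetical call to this constructor, with placeholder ZooKeeper hosts, group id, and topic map, could look like:

consumer = KafkaDStreamSource(ssc, "zk1:2181,zk2:2181", "analytics-group", {"events": 2})  # class name assumed for illustration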
Example 3: test_kafka_stream
# Required import: from pyspark.streaming.kafka import KafkaUtils [as alias]
# Or: from pyspark.streaming.kafka.KafkaUtils import createStream [as alias]
def test_kafka_stream(self):
    """Test the Python Kafka stream API."""
    topic = self._randomTopic()
    sendData = {"a": 3, "b": 5, "c": 10}
    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                     "test-streaming-consumer", {topic: 1},
                                     {"auto.offset.reset": "smallest"})
    self._validateStreamResult(sendData, stream)
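Here the fifth argument is the kafkaParams dict; "auto.offset.reset": "smallest" makes the consumer start from the earliest available offset, so messages sent before the stream started are still received. _validateStreamResult is a helper of Spark's own test suite and is not shown; a rough, self-contained stand-in (an assumption for illustration, not the real utility) could count the received message values and compare them with what was sent:

import time

def _validateStreamResult(self, sendData, stream, timeout=60):
    # Collect (key, value) pairs as they arrive and count occurrences of each value.
    result = {}

    def get_output(_, rdd):
        for _key, value in rdd.collect():
            result[value] = result.get(value, 0) + 1

    stream.foreachRDD(get_output)
    self.ssc.start()

    start = time.time()
    while time.time() - start < timeout and result != sendData:
        time.sleep(1)
    self.assertEqual(sendData, result)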
Example 4: _startContext
# Required import: from pyspark.streaming.flume import FlumeUtils [as alias]
# Or: from pyspark.streaming.flume.FlumeUtils import createStream [as alias]
def _startContext(self, n, compressed):
    # Start the StreamingContext and also collect the result
    dstream = FlumeUtils.createStream(self.ssc, "localhost", self._utils.getTestPort(),
                                      enableDecompression=compressed)
    result = []

    def get_output(_, rdd):
        for event in rdd.collect():
            if len(result) < n:
                result.append(event)

    dstream.foreachRDD(get_output)
    self.ssc.start()
    return result
Example 5: test_kinesis_stream_api
# Required import: from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream [as alias]
# Or: from pyspark.streaming.kinesis.KinesisUtils import createStream [as alias]
def test_kinesis_stream_api(self):
    # Don't start the StreamingContext because we cannot test it in Jenkins
    kinesisStream1 = KinesisUtils.createStream(
        self.ssc, "myAppNam", "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
        InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2)
    kinesisStream2 = KinesisUtils.createStream(
        self.ssc, "myAppNam", "mySparkStream",
        "https://kinesis.us-west-2.amazonaws.com", "us-west-2",
        InitialPositionInStream.LATEST, 2, StorageLevel.MEMORY_AND_DISK_2,
        "awsAccessKey", "awsSecretKey")
Example 6: test_kinesis_stream
# Required import: from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream [as alias]
# Or: from pyspark.streaming.kinesis.KinesisUtils import createStream [as alias]
def test_kinesis_stream(self):
    if not are_kinesis_tests_enabled:
        sys.stderr.write(
            "Skipped test_kinesis_stream (enable by setting environment variable %s=1)"
            % kinesis_test_environ_var)
        return

    import random
    kinesisAppName = ("KinesisStreamTests-%d" % abs(random.randint(0, 10000000)))
    kinesisTestUtils = self.ssc._jvm.org.apache.spark.streaming.kinesis.KinesisTestUtils(2)
    try:
        kinesisTestUtils.createStream()
        aWSCredentials = kinesisTestUtils.getAWSCredentials()
        stream = KinesisUtils.createStream(
            self.ssc, kinesisAppName, kinesisTestUtils.streamName(),
            kinesisTestUtils.endpointUrl(), kinesisTestUtils.regionName(),
            InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_ONLY,
            aWSCredentials.getAWSAccessKeyId(), aWSCredentials.getAWSSecretKey())

        outputBuffer = []

        def get_output(_, rdd):
            for e in rdd.collect():
                outputBuffer.append(e)

        stream.foreachRDD(get_output)
        self.ssc.start()

        testData = [i for i in range(1, 11)]
        expectedOutput = set([str(i) for i in testData])
        start_time = time.time()
        while time.time() - start_time < 120:
            kinesisTestUtils.pushData(testData)
            if expectedOutput == set(outputBuffer):
                break
            time.sleep(10)
        self.assertEqual(expectedOutput, set(outputBuffer))
    except:
        import traceback
        traceback.print_exc()
        raise
    finally:
        self.ssc.stop(False)
        kinesisTestUtils.deleteStream()
        kinesisTestUtils.deleteDynamoDBTable(kinesisAppName)