This article collects typical usage examples of pyspark.streaming.StreamingContext in Python. If you have been wondering what streaming.StreamingContext is for, how to use it, or what real-world code that uses it looks like, the curated examples below should help. You can also read more about the enclosing module, pyspark.streaming.
The following shows 8 code examples of streaming.StreamingContext, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
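Before the individual examples, a minimal sketch of the typical StreamingContext lifecycle may help: create the context from a SparkContext with a batch interval, define DStream transformations, then start and await termination. The socket source, host/port, and word-count logic below are illustrative assumptions, not part of the examples that follow.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Create a StreamingContext with two local threads and a 1-second batch interval
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)

# Build a DStream from a TCP source (host and port are placeholders)
lines = ssc.socketTextStream("localhost", 9999)
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()             # start receiving and processing data
ssc.awaitTermination()  # block until the streaming job is stopped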
Example 1: main
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def main():
    # Adapted from https://github.com/apache/spark/tree/master/examples/src/main/python/streaming
    sc = SparkContext(appName='PythonStreamingQueue')
    ssc = StreamingContext(sc, 1)

    # Create the queue through which RDDs can be pushed to
    # a QueueInputDStream
    rddQueue = []
    for _ in range(5):
        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

    # Create the QueueInputDStream and use it to do some processing
    inputStream = ssc.queueStream(rddQueue)
    mappedStream = inputStream.map(lambda x: (x % 10, 1))
    reducedStream = mappedStream.reduceByKey(lambda a, b: a + b)
    reducedStream.pprint()

    ssc.start()
    time.sleep(6)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
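The snippet above is an excerpt, so its imports are not shown. A hedged sketch of what it assumes, plus a note on how queueStream consumes the queue:

# Imports the excerpt above assumes (a sketch, not shown in the source):
import time

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# queueStream(rdds, oneAtATime=True, default=None) pulls one RDD from the
# queue per batch interval by default; pass oneAtATime=False to process
# all queued RDDs in a single batch.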
Example 2: bluecoat_parse
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    """
    Parse and save bluecoat logs.

    :param zk: Apache ZooKeeper quorum
    :param topic: Apache Kafka topic (application name)
    :param db: Apache Hive database to save into
    :param db_table: table of `db` to save into
    :param num_of_workers: number of Apache Kafka workers
    :param batch_size: batch size for Apache Spark streaming context
    """
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
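This fragment comes from Apache Spot's ingest pipeline; spot_decoder, rex_date, split_log_entry, proxy_parser, and save_data are project-specific helpers that are not shown here. A sketch of the PySpark imports the fragment assumes (the receiver-based KafkaUtils.createStream API ships with Spark 1.x/2.x only):

from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils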
Example 3: create_streaming_context
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def create_streaming_context(spark_context, config):
    """
    Create a streaming context with a custom StreamingListener
    that will log every event.

    :param spark_context: Spark context
    :type spark_context: pyspark.SparkContext
    :param config: dict
    :return: Returns a new streaming context from the given context.
    :rtype: pyspark.streaming.StreamingContext
    """
    ssc = streaming.StreamingContext(
        spark_context,
        config["spark_config"]["streaming"]["batch_interval"])
    ssc.addStreamingListener(DriverStreamingListener)
    directory = os_path.expanduser("~/checkpointing")
    logger.info("Checkpointing to `{}`".format(directory))
    # Commented out to fix a crash occurring when
    # phase 1 is used. The reason for the crash is still unclear,
    # but Spark complains about the SSC being transferred
    # to workers.
    # ssc.checkpoint(directory)
    return ssc
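DriverStreamingListener is defined elsewhere in the project. As an illustration only, a listener registered via addStreamingListener can be built by subclassing pyspark.streaming.listener.StreamingListener; the sketch below is an assumption about its shape, not the project's actual implementation.

import logging

from pyspark.streaming.listener import StreamingListener

logger = logging.getLogger(__name__)

class LoggingStreamingListener(StreamingListener):
    """Illustrative listener that logs basic lifecycle events."""

    def onBatchCompleted(self, batchCompleted):
        # The event object is a Py4J proxy of the underlying Java event.
        logger.info("Batch completed: %s", batchCompleted)

    def onReceiverError(self, receiverError):
        logger.warning("Receiver error: %s", receiverError)

# ssc.addStreamingListener(LoggingStreamingListener())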
Example 4: streaming_listener
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def streaming_listener(**kwargs):
    '''
    Initialize the Spark job.
    '''
    Util.get_logger('SPOT.INGEST', kwargs.pop('log_level'))

    logger = logging.getLogger('SPOT.INGEST.COMMON.LISTENER')
    logger.info('Initializing Spark Streaming Listener...')

    dbtable = '{0}.{1}'.format(kwargs.pop('database'), kwargs['type'])
    topic = kwargs.pop('topic')

    sc = SparkContext(appName=kwargs['app_name'] or topic)
    logger.info('Connect to Spark Cluster as job "{0}" and broadcast variables on it.'
                .format(kwargs.pop('app_name') or topic))

    ssc = StreamingContext(sc, batchDuration=kwargs['batch_duration'])
    logger.info('Streaming data will be divided into batches of {0} seconds.'
                .format(kwargs.pop('batch_duration')))

    hsc = HiveContext(sc)
    logger.info('Read Hive\'s configuration to integrate with data stored in it.')

    import pipelines
    module = getattr(pipelines, kwargs.pop('type'))

    stream = module.StreamPipeline(ssc, kwargs.pop('zkquorum'),
                                   kwargs.pop('group_id') or topic,
                                   {topic: int(kwargs.pop('partitions'))})

    schema = stream.schema
    segtype = stream.segtype

    stream.dstream \
        .map(lambda x: module.StreamPipeline.parse(x)) \
        .filter(lambda x: bool(x)) \
        .foreachRDD(lambda x: store(x, hsc, dbtable, topic, schema, segtype))

    ssc.start()
    logger.info('Start the execution of the streams.')
    ssc.awaitTermination()
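This function is driven entirely by keyword arguments that it pops as it goes. A hypothetical invocation is sketched below; the argument values are illustrative assumptions, not taken from the source project.

streaming_listener(
    log_level='INFO',
    database='spotdb',            # Hive database
    type='proxy',                 # pipeline module name under `pipelines`
    topic='SPOT-INGEST-proxy',    # Kafka topic
    app_name='',                  # falls back to the topic name
    batch_duration=30,            # seconds per micro-batch
    zkquorum='zookeeper:2181',    # ZooKeeper quorum
    group_id='',                  # falls back to the topic name
    partitions=1,
)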
Example 5: setUp
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def setUp(self):
    self.sc = SparkContext('local[4]', "MLlib tests")
    self.ssc = StreamingContext(self.sc, 1.0)
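A sketch of the tearDown such a test case would normally pair with this setUp, so contexts are not leaked between tests (assumed here, not shown in the source):

def tearDown(self):
    self.ssc.stop(stopSparkContext=False, stopGraceFully=False)
    self.sc.stop()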
Example 6: main
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def main():
    """Run Spark Streaming"""
    conf = SparkConf()
    sc = SparkContext(appName='Ozymandias', conf=conf)
    sc.setLogLevel('WARN')

    with open(ROOT + 'channels.json', 'r') as f:
        channels = json.load(f)
    topics = [t['topic'] for t in channels['channels']]

    n_secs = 0.5
    ssc = StreamingContext(sc, n_secs)

    stream = KafkaUtils.createDirectStream(
        ssc, topics, {
            'bootstrap.servers': 'localhost:9092',
            'group.id': 'ozy-group',
            'fetch.message.max.bytes': '15728640',
            'auto.offset.reset': 'largest'})

    stream.map(
        deserializer
    ).map(
        image_detector
    ).foreachRDD(
        message_sender)

    ssc.start()
    ssc.awaitTermination()
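Here deserializer, image_detector, and message_sender are project functions not included in this excerpt. For illustration only: KafkaUtils.createDirectStream emits (key, value) tuples, so a deserializer usually starts by unpacking that pair; the JSON decoding below is a hypothetical assumption about the payload format, not the project's actual logic.

import json

def deserializer(record):
    key, value = record       # each record is a (key, value) tuple
    return json.loads(value)  # hypothetical: assumes JSON-encoded messages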
Example 7: isException
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def isException(machine, signal):
    # assumption: this should be parameterized or read dynamically from a data source
    exceptions = [(11, 19)]
    return (int(machine), signal) in exceptions

# Create a local StreamingContext with two working threads and a batch interval of 1 second
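The trailing comment refers to context creation that is not part of this excerpt. A sketch of what it describes (the application name is an assumption):

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "SignalMonitor")  # two worker threads
ssc = StreamingContext(sc, 1)                   # 1-second batch interval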
Example 8: streaming_context
# Required import: from pyspark import streaming [as alias]
# Or: from pyspark.streaming import StreamingContext [as alias]
def streaming_context(sc):
    return StreamingContext(sc, 1)
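This helper simply wraps an existing SparkContext with a one-second batch interval; in test suites it typically serves as a fixture. A minimal usage sketch, assuming a SparkContext named sc is already available:

ssc = streaming_context(sc)
# ... build DStreams and register output operations here ...
ssc.start()
ssc.stop(stopSparkContext=False, stopGraceFully=True)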