This article collects typical code examples of the Python method pyspark.streaming.StreamingContext.start. If you have been wondering what exactly StreamingContext.start does, how to use it, or where to find usage examples, the hand-picked samples below should help. You can also explore further usage examples of the containing class, pyspark.streaming.StreamingContext.
Fifteen code examples of StreamingContext.start are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
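Before the examples, here is the general pattern they all follow: create a StreamingContext from a SparkContext, declare the DStream transformations, call start() to launch the job, and block on awaitTermination(). A minimal network word-count sketch (host and port are placeholders):

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="NetworkWordCount")
ssc = StreamingContext(sc, 1)  # 1-second batch interval
lines = ssc.socketTextStream("localhost", 9999)  # placeholder host/port
counts = (lines.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))
counts.pprint()
ssc.start()             # start the computation
ssc.awaitTermination()  # block until the job is stopped or fails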
Example 1: ss_direct_kafka_bucket_counter
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and buckets messages by parsed time.

    WARNING: this function only works with Spark 1.4.0+.

    Args:
        brokers: the Kafka broker list to read the topic from
        topic: the Kafka topic to consume
        bucket_interval: the time interval in seconds (int) that the job will
            bucket counts into
        output_msg: factory that, given (sc, ssc), returns the callback passed to foreachRDD
        message_parse: maps a raw message to its bucket key
        valueDecoder: optional decoder for Kafka message values

    Returns:
        None
    """
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(
            ssc, [topic], {"metadata.broker.list": brokers}, valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(
            ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)
    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
Author: kelvinfann | Project: spark-streaming-kafka-bucket-counter | Lines: 34 | Source: spark-streaming-kafka-bucket-counter.py
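For context, a call to this helper might look like the sketch below. The broker address, topic, and the message_parse/output_msg callbacks are illustrative assumptions, not part of the original project.

def parse_minute(line):
    # Hypothetical parser: assume each message starts with an ISO-8601
    # timestamp and bucket by minute ("2016-01-01T12:34").
    return line[:16]

def make_printer(sc, ssc):
    # Returns the callback handed to foreachRDD above.
    def emit(time, rdd):
        for bucket, count in rdd.collect():
            print(bucket, count)
    return emit

ss_direct_kafka_bucket_counter("localhost:9092", "events", 60,
                               make_printer, parse_minute)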
Example 2: BaseStreamingTestCase
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
class BaseStreamingTestCase(unittest.TestCase):
    """ From https://github.com/apache/spark/blob/
    master/python/pyspark/streaming/tests.py """

    timeout = 10  # seconds
    duration = .5

    def setUp(self):
        self.ssc = StreamingContext(sc, self.duration)

    def tearDown(self):
        self.ssc.stop(False)

    def wait_for(self, result, n):
        start_time = time.time()
        while len(result) < n and time.time() - start_time < self.timeout:
            time.sleep(0.01)
        if len(result) < n:
            print("timeout after", self.timeout)

    def _collect(self, dstream, n):
        result = []

        def get_output(_, rdd):
            if rdd and len(result) < n:
                r = rdd.collect()
                if r:
                    result.append(r)

        dstream.foreachRDD(get_output)
        self.ssc.start()
        self.wait_for(result, n)
        return result
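A concrete test built on this base class is not part of the excerpt, but it would typically queue input RDDs and compare the collected output, roughly like this (sc is the module-level SparkContext the base class already relies on):

class MapTestCase(BaseStreamingTestCase):
    def test_map(self):
        # queueStream turns a list of RDDs into a DStream for testing.
        stream = self.ssc.queueStream([sc.parallelize([1, 2, 3])])
        doubled = stream.map(lambda x: x * 2)
        result = self._collect(doubled, 1)
        self.assertEqual(result, [[2, 4, 6]])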
Example 3: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def main():
    sym_dict = {}
    conf = SparkConf().setAppName("symbol stream")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, .1)
    lines = ssc.socketTextStream("localhost", 1337)

    def print_now():
        print(sym_dict)

    def predict(prices):
        print(prices)

    def add_to_dict(line):
        # Each line is expected to look like "SYMBOL,price,volume".
        symbol, price, volume = line.split(',')
        if symbol in sym_dict:
            print('made it here')
            sym_dict[symbol][0].append(price)
            sym_dict[symbol][1].append(volume)
            # Keep a sliding window of the last 10 prices/volumes per symbol.
            if len(sym_dict[symbol][0]) > 10:
                sym_dict[symbol][0].pop(0)
                sym_dict[symbol][1].pop(0)
            predict(sym_dict[symbol][0])
        else:
            sym_dict[symbol] = [[price], [volume]]

    # Note: add_to_dict and print_now are defined but not yet wired into the stream.
    # test = lines.map(lambda line: json.dumps(line))
    test = lines.map(lambda line: line)
    test.pprint()
    ssc.start()
    ssc.awaitTermination()
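As written, add_to_dict is never attached to the stream. One way to wire it in (an assumption about the author's intent, not part of the original) is to pull each micro-batch back to the driver, since sym_dict lives in the driver process and updates made inside rdd.foreach on the executors would not be visible there:

# Hypothetical wiring: process each micro-batch on the driver so sym_dict is updated.
lines.foreachRDD(lambda time, rdd: [add_to_dict(line) for line in rdd.collect()])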
Example 4: start
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "192.192.0.27:9092"
    topics = ['topic7']
    kafkaStreams_lines = KafkaUtils.createDirectStream(
        ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the Kafka message payload

    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)

    wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka")
    wordcounts.pprint()

    # Count the distribution of the generated random numbers.
    ssc.start()              # Start the computation
    ssc.awaitTermination()   # Wait for the computation to terminate
Example 5: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                                       '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                                          '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                             '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
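The process_generic callback is not included in this excerpt. A hypothetical stand-in, assuming pymongo for storage and requests for the REST signal (the database/collection names and payload shape are also assumptions), might look like:

import json
import pymongo
import requests

def process_generic(rdd, mongo_url, rest_url):
    logs = rdd.collect()
    if not logs:
        return
    # Store the raw log lines in MongoDB (database/collection names assumed).
    client = pymongo.MongoClient(mongo_url)
    client.sparkhara.log_lines.insert_many([{'message': line} for line in logs])
    # Signal the REST endpoint with the number of newly stored lines.
    requests.post(rest_url,
                  data=json.dumps({'count': len(logs)}),
                  headers={'Content-Type': 'application/json'})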
Example 6: start_spark
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)

    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)

    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
Example 7: start
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def start():
    sconf = SparkConf()
    sconf.set('spark.cores.max', 2)
    sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    brokers = "localhost:9092"
    topics = ['test']
    kafkaStreams_lines = KafkaUtils.createDirectStream(
        ssc, topics, kafkaParams={"metadata.broker.list": brokers})
    lines1 = kafkaStreams_lines.map(lambda x: x[1])  # note: the second element of the tuple is the Kafka message payload

    words = lines1.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordcounts = pairs.reduceByKey(lambda x, y: x + y)
    print(wordcounts)  # prints the DStream object itself, not its contents

    kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)

    wordcounts.pprint()
    # Count the distribution of the generated random numbers.
    ssc.start()              # Start the computation
    ssc.awaitTermination()   # Wait for the computation to terminate
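The storeOffsetRanges and printOffsetRanges helpers referenced above are not shown in this excerpt. The Spark Streaming + Kafka integration guide suggests a pattern along these lines, capturing offsetRanges() from the Kafka RDD inside transform:

offsetRanges = []

def storeOffsetRanges(rdd):
    # Record the Kafka offset ranges of this batch before further transformations.
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))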
Example 8: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def main():
    # The timeout argument is optional (see the handling below), so accept 3 or 4 args.
    if len(sys.argv) not in (3, 4):
        print("Usage: kafka_wordcount.py <zk> <topic> [timeout]",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]

    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout

    ssc.start()
    ssc.awaitTermination(**kwargs)
Example 9: bro_parse
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def bro_parse(zk, topic, db, db_table, num_of_workers):
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, 1)
    sqc = HiveContext(sc)

    # create a DStream for each topic partition
    topic_dstreams = [
        KafkaUtils.createStream(ssc, zk, app_name, {topic: 1},
                                keyDecoder=oni_decoder, valueDecoder=oni_decoder)
        for _ in range(wrks)
    ]
    tp_stream = ssc.union(*topic_dstreams)

    # Parallelism in Data Processing
    # processingDStream = tp_stream(wrks)

    # parse the RDD content
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save RDD into Hive
    proxy_logs.foreachRDD(lambda x: save_to_hive(x, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
Example 10: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def main():
    sc = SparkContext(appName="IntrusionDetector")
    ssc = StreamingContext(sc, batch_durations)

    kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker})
    kvs.foreachRDD(processRDD)

    ssc.start()
    ssc.awaitTermination()
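broker, input_topic, batch_durations, and processRDD are module-level names that this excerpt does not show. A minimal hypothetical stand-in, purely for illustration, could be:

broker = "localhost:9092"   # assumed Kafka broker address
input_topic = "netflow"     # assumed input topic name
batch_durations = 10        # assumed batch interval in seconds

def processRDD(time, rdd):
    # Placeholder for the real detection logic: just peek at a few records.
    for record in rdd.take(10):
        print(time, record)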
Example 11: kafka_spark_streaming_sql_main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # NOTE: interval_seconds is currently ignored; the batch interval is hard-coded to 10s.
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
Example 12: read_tweets
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def read_tweets():
    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # batch interval in seconds (tested with 60)
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
Example 13: invoke
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def invoke():
    # object to keep track of offsets
    ConfigInitializer.basic_config()

    # app name
    application_name = "mon_metrics_kafka"

    my_spark_conf = SparkConf().setAppName(application_name)
    spark_context = SparkContext(conf=my_spark_conf)

    # read at the configured interval
    spark_streaming_context = \
        StreamingContext(spark_context, cfg.CONF.service.stream_interval)

    kafka_stream = MonMetricsKafkaProcessor.get_kafka_stream(
        cfg.CONF.messaging.topic,
        spark_streaming_context)

    # transform to recordstore
    MonMetricsKafkaProcessor.transform_to_recordstore(kafka_stream)

    # catch interrupt, stop streaming context gracefully
    # signal.signal(signal.SIGINT, signal_handler)

    # start processing
    spark_streaming_context.start()

    # FIXME: stop spark context to relinquish resources
    # FIXME: specify cores, so as not to use all the resources on the cluster.
    # FIXME: HA deploy multiple masters, may be one on each control node

    try:
        # Wait for the Spark driver to "finish"
        spark_streaming_context.awaitTermination()
    except Exception as e:
        MonMetricsKafkaProcessor.log_debug(
            "Exception raised during Spark execution : " + str(e))
        # One exception that can occur here is the result of the saved
        # kafka offsets being obsolete/out of range. Delete the saved
        # offsets to improve the chance of success on the next execution.

        # TODO(someone) prevent deleting all offsets for an application,
        # but just the latest revision
        MonMetricsKafkaProcessor.log_debug(
            "Deleting saved offsets for chance of success on next execution")

        MonMetricsKafkaProcessor.reset_kafka_offsets(application_name)

        # delete pre hourly processor offsets
        if cfg.CONF.stage_processors.pre_hourly_processor_enabled:
            PreHourlyProcessor.reset_kafka_offsets()
Example 14: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(
            ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print(e)
    ssc.start()
    ssc.awaitTermination()
Example 15: sparkTask
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import start [as alias]
def sparkTask():
    from textblob import TextBlob
    import re
    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext()
    ssc = StreamingContext(sc, 1)

    quotes = ssc.socketTextStream("localhost", 9999)
    dataSentencesPolarity = quotes \
        .map(lambda x: TextBlob(re.sub('[^A-Za-z0-9 \.\']+', '', x))) \
        .map(lambda y: (str(y.upper())[:60], y.sentiment.polarity))
    dataSentencesPolarity.pprint()

    ssc.start()               # Start the computation
    ssc.awaitTermination(20)  # Wait for the computation to terminate