This article collects typical usage examples of the Python method pyspark.streaming.StreamingContext.getOrCreate. If you are looking for concrete examples of how StreamingContext.getOrCreate is used in practice, the curated code samples below should help. You can also learn more about its containing class, pyspark.streaming.StreamingContext.
The following shows 14 code examples of StreamingContext.getOrCreate, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
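All of the examples below share the same basic pattern: a setup function builds a fresh StreamingContext and registers a checkpoint directory, and StreamingContext.getOrCreate either recovers a context from that directory or falls back to the setup function. Here is a minimal sketch of that pattern; the checkpoint path, batch interval, and socket source are placeholders rather than values taken from any of the examples.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

CHECKPOINT_DIR = "/tmp/streaming-checkpoint"  # placeholder path

def create_context():
    # Runs only when no usable checkpoint exists in CHECKPOINT_DIR
    sc = SparkContext(appName="GetOrCreateSketch")
    ssc = StreamingContext(sc, 5)  # 5-second batch interval
    lines = ssc.socketTextStream("localhost", 9999)  # placeholder source
    lines.count().pprint()
    ssc.checkpoint(CHECKPOINT_DIR)  # required so the context can be recovered
    return ssc

# Recover from the checkpoint if present, otherwise call create_context()
ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, create_context)
ssc.start()
ssc.awaitTermination()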
Example 1: test_transform_function_serializer_failure
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def test_transform_function_serializer_failure(self):
    inputd = tempfile.mkdtemp()
    self.cpd = tempfile.mkdtemp("test_transform_function_serializer_failure")

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 0.5)

        # A function that cannot be serialized
        def process(time, rdd):
            sc.parallelize(range(1, 10))

        ssc.textFileStream(inputd).foreachRDD(process)
        return ssc

    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    try:
        self.ssc.start()
    except:
        import traceback
        failure = traceback.format_exc()
        self.assertTrue(
            "It appears that you are attempting to reference SparkContext" in failure)
        return

    self.fail("using SparkContext in process should fail because it's not Serializable")
Example 2: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def main(conf):
    ssc = None
    if conf.CHECK_POINT:
        ssc = StreamingContext.getOrCreate(conf.CHECK_POINT_PATH, lambda: createContext(conf))
    else:
        ssc = createContext(conf)
    ssc.start()
    ssc.awaitTermination()
    return ssc
Example 3: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)  # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
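Note that sc.broadcast returns a Broadcast handle, and the two calls in the example above discard it; the usual pattern keeps the handle and reads .value inside the transformations. A brief sketch follows, in which the DStream name and the lookup are placeholders and not part of the original example.

user_post_bc = sc.broadcast(batchUserPostDict)  # keep the Broadcast handle

# Executors read the broadcast copy through .value (placeholder DStream name)
posts = user_ids_dstream.map(lambda uid: user_post_bc.value.get(uid, []))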
Example 4: StreamingContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)
    # main split-combine-apply logic put here
    pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
    runningCounts = pairs.updateStateByKey(updateFunction)
    sortedCounts = runningCounts.transform(lambda rdd: rdd.sortBy(lambda (airport, freq): freq, ascending=False))
    sortedCounts.foreachRDD(process)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
ssc.start()
finish_delay = 0
while True:
    res = ssc.awaitTerminationOrTimeout(2)
    if (dfstream_num >= TOTAL_FILES):
        finish_delay += 1
    # stop logic
    if res or empty_count >= 10 or finish_delay >= 10:
        # stopped elsewhere
        print 'Finished processing all {} files, terminate the streaming job!!!!!!'.format(TOTAL_FILES)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
        break
Example 5: getDroppedWordsCounter
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    # Get or register the droppedWordsCounter Accumulator
    droppedWordsCounter = getDroppedWordsCounter(rdd.context)

    # Use blacklist to drop words and use droppedWordsCounter to count them
    def filterFunc(wordCount):
        if wordCount[0] in blacklist.value:
            droppedWordsCounter.add(wordCount[1])
            return False
        else:
            return True

    counts = "Counts at time %s %s" % (time, rdd.filter(filterFunc).collect())
    print(counts)
    print("Dropped %d word(s) totally" % droppedWordsCounter.value)
    print("Appending to " + os.path.abspath(outputPath))
    with open(outputPath, 'a') as f:
        f.write(counts + "\n")

wordCounts.foreachRDD(echo)
return ssc

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print("Usage: recoverable_network_wordcount.py <hostname> <port> "
              "<checkpoint-directory> <output-file>", file=sys.stderr)
        exit(-1)
    host, port, checkpoint, output = sys.argv[1:]
    ssc = StreamingContext.getOrCreate(checkpoint,
                                       lambda: createContext(host, int(port), output))
    ssc.start()
    ssc.awaitTermination()
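The snippet above relies on a getDroppedWordsCounter helper that is not shown. In the upstream Spark recoverable_network_wordcount example this is a lazily registered singleton accumulator; the sketch below follows that idea and is an approximation, not code copied from this snippet's source.

def getDroppedWordsCounter(sparkContext):
    # Register the accumulator once per driver process and reuse it afterwards,
    # so that recovering from a checkpoint does not create a second counter
    if 'droppedWordsCounter' not in globals():
        globals()['droppedWordsCounter'] = sparkContext.accumulator(0)
    return globals()['droppedWordsCounter']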
Example 6: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)
    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()
    ssc.checkpoint(checkpointDir)
    return ssc

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: direct_kafka_wordcount_checkpoint.py <broker_list> <topic> <checkpointDir>", file=sys.stderr)
        exit(-1)
    brokers, topic, checkpointDir = sys.argv[1:]
    ssc = StreamingContext.getOrCreate(checkpointDir,
                                       lambda: createContext(brokers, topic, checkpointDir))
    ssc.start()
    ssc.awaitTermination()
Example 7: file
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    ArrDelay = arrived_data.map(lambda m: ((m.AirlineID, airline_lookup.value[str(m.AirlineID)]), m.ArrDelay))
    # sum elements and number of elements, to store them in an intermediate file (k, (sum(v), len(v)))
    collectDelays = ArrDelay.map(lambda (key, value): (key, [value])).reduceByKey(add).map(lambda (key, values): (key, [sum(values), len(values)]))
    # debug
    collectDelays.pprint()
    # Saving data in hdfs
    collectDelays.saveAsTextFiles(OUTPUT_DIR)

# main function
if __name__ == "__main__":
    # Configure Spark. Create a new context or restore from checkpoint
    ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, functionToCreateContext)
    # get this spark context
    sc = ssc.sparkContext
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")
    # Create a Transformed DStream. Read Kafka from first offset
    # creating a stream
    # :param ssc: StreamingContext object
    # :param zkQuorum: Zookeeper quorum (hostname:port,hostname:port,..).
    # :param groupId: The group id for this consumer.
    # :param topics: Dict of (topic_name -> numPartitions) to consume.
    #                Each partition is consumed in its own thread.
    # :param kafkaParams: Additional params for Kafka
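The snippet ends before the call that these parameter comments describe. Based on the pyspark.streaming.kafka API, a matching call could look like the sketch below; the ZooKeeper quorum, group id, topic name, and Kafka params are invented placeholders, not values from the original project.

from pyspark.streaming.kafka import KafkaUtils

zkQuorum = "zk1:2181,zk2:2181"                   # placeholder quorum
groupId = "flight-delay-consumer"                # placeholder consumer group
topics = {"on_time_data": 1}                     # topic_name -> numPartitions
kafkaParams = {"auto.offset.reset": "smallest"}  # start from the first offset

kvs = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics, kafkaParams)
lines = kvs.map(lambda x: x[1])                  # take the message values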
Example 8: list
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    )
    rankingProfitability = topTenProfitableAreas.transform(
        lambda rdd: rdd.map(lambda x: ("ranking2", x)).groupByKey().map(lambda x: (x[0], list(x[1])))
    )
    # topTenWithDelays.map(lambda x: ('TOP-TEN-WITH-DELAYS', x)).pprint()
    status = rankingTripCount.updateStateByKey(updateFunc)
    status.pprint()
    status2 = rankingProfitability.updateStateByKey(updateFunc2)
    status2.pprint()
    return ssc

if __name__ == "__main__":
    if len(sys.argv) != 8:
        print(
            "Usage: spark_taxi.py <hostname> <port> " "<checkpoint-directory> <output-file1> <output-file2>",
            file=sys.stderr,
        )
        exit(-1)
    host, port, checkpoint, output1, output2, windowDuration, slideDuration = sys.argv[1:]
    ssc = StreamingContext.getOrCreate(
        checkpoint, lambda: createContext(host, int(port), output1, output2, int(windowDuration), int(slideDuration))
    )
    ssc.start()
    ssc.awaitTermination()
Example 9: updateFunction
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def updateFunction(new_values, last_sum):
    return sum(new_values) + (last_sum or 0)

def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc

ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)
ssc.start()
ssc.awaitTermination()
Example 10: int
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    # temperature is inside the body as an amqp value
    return int(data["body"]["section"])

def getMax(a, b):
    if (a > b):
        return a
    else:
        return b

def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")

    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")
    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)
    max.pprint()
    return ssc

ssc = StreamingContext.getOrCreate("/tmp/spark-streaming-amqp", createStreamingContext)
ssc.start()
ssc.awaitTermination()
Example 11: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
print "Creating new context"
sc = SparkContext(appName="StreamingWordCount")
ssc = StreamingContext(sc, 2)
lines = ssc.socketTextStream(host, port)
def countWords(newValues, lastSum):
if lastSum is None:
lastSum = 0
return sum(newValues, lastSum)
word_counts = lines.flatMap(lambda line: line.split(" "))\
.filter(lambda w: w.startswith("#"))\
.map(lambda word: (word, 1))\
.updateStateByKey(countWords)
word_counts.pprint()
return ssc
if __name__ == "__main__":
host, port, checkpoint_dir = sys.argv[1:]
print checkpoint_dir
ssc = StreamingContext.getOrCreate(checkpoint_dir,
lambda: create_context(host, int(port)))
ssc.start()
ssc.awaitTermination()
Example 12: updateFunction
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def updateFunction(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)

def partitionCount(datum):
    datatype = datum['dataunit'].keys()[0]
    if datatype.endswith('property'):
        return ('\\'.join((datatype,
                           datum['dataunit'][datatype]['property'].keys()[0])), 1)
    else:
        return (datatype, 1)

if __name__ == "__main__":
    ssc = StreamingContext.getOrCreate(checkpointDir, buildContext)
    zkQuorum = "localhost:2181"
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {KAFKA_TOPIC: 1})
    # data stream of data dictionaries
    ds = kvs.map(lambda data: ast.literal_eval(data[1]))
    ds.pprint()
    pprint(type(ds))
    # print type(ds)
    data.append(ds)
    if len(data) > 0 and len(data) % (n*13) == 0:
        path = "..\\speed\\"
        name = "speed" + str(len(data)/n*13) + ".json"
        with open(path+name, 'w') as outfile:
Example 13: threaded_function
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    # meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)
    ssc.checkpoint(checkpointDirectory)  # set checkpoint directory
    return ssc

def threaded_function(arg):
    from rpyc.utils.server import ThreadedServer
    t = ThreadedServer(MyService, port=18862, protocol_config={"allow_all_attrs": True, 'allow_pickle': True})
    t.start()

if __name__ == "__main__":
    from threading import Thread
    thread = Thread(target=threaded_function, args=(None,))
    thread.start()
    conf = SparkConf().setMaster("localhost")
    ssc = StreamingContext.getOrCreate(checkpointDirectory, functionToCreateContext)
    ssc.start()
    ssc.awaitTermination()
Example 14: test_get_or_create_and_get_active_or_create
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def test_get_or_create_and_get_active_or_create(self):
    inputd = tempfile.mkdtemp()
    outputd = tempfile.mkdtemp() + "/"

    def updater(vs, s):
        return sum(vs, s or 0)

    def setup():
        conf = SparkConf().set("spark.default.parallelism", 1)
        sc = SparkContext(conf=conf)
        ssc = StreamingContext(sc, 2)
        dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
        wc = dstream.updateStateByKey(updater)
        wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
        wc.checkpoint(2)
        self.setupCalled = True
        return ssc

    # Verify that getOrCreate() calls setup() in absence of checkpoint files
    self.cpd = tempfile.mkdtemp("test_streaming_cps")
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertTrue(self.setupCalled)

    self.ssc.start()

    def check_output(n):
        while not os.listdir(outputd):
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")
        time.sleep(1)  # make sure mtime is larger than the previous one

        with open(os.path.join(inputd, str(n)), 'w') as f:
            f.writelines(["%d\n" % i for i in range(10)])

        while True:
            if self.ssc.awaitTerminationOrTimeout(0.5):
                raise Exception("ssc stopped")

            p = os.path.join(outputd, max(os.listdir(outputd)))
            if '_SUCCESS' not in os.listdir(p):
                # not finished
                continue
            ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
            d = ordd.values().map(int).collect()
            if not d:
                continue
            self.assertEqual(10, len(d))
            s = set(d)
            self.assertEqual(1, len(s))
            m = s.pop()
            if n > m:
                continue
            self.assertEqual(n, m)
            break

    check_output(1)
    check_output(2)

    # Verify the getOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(3)

    # Verify that getOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify the getActiveOrCreate() recovers from checkpoint files
    self.ssc.stop(True, True)
    time.sleep(1)
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.ssc.start()
    check_output(4)

    # Verify that getActiveOrCreate() returns active context
    self.setupCalled = False
    self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc)
    self.assertFalse(self.setupCalled)

    # Verify that getActiveOrCreate() uses existing SparkContext
    self.ssc.stop(True, True)
    time.sleep(1)
    self.sc = SparkContext(conf=SparkConf())
    self.setupCalled = False
    self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
    self.assertFalse(self.setupCalled)
    self.assertTrue(self.ssc.sparkContext == self.sc)

    # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
    self.ssc.stop(True, True)
#......... the remaining lines of this example are omitted .........