

Python StreamingContext.getOrCreate Method Code Examples

This article collects typical usage examples of the Python method pyspark.streaming.StreamingContext.getOrCreate. If you are unsure what StreamingContext.getOrCreate does or how to call it, the curated code examples below should help. You can also browse further usage examples of the containing class, pyspark.streaming.StreamingContext.


The following shows 14 code examples of StreamingContext.getOrCreate, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
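Before the examples, here is a minimal sketch of the basic getOrCreate pattern they all share: pass a checkpoint directory plus a zero-argument setup function, and Spark rebuilds the StreamingContext from the checkpoint if one exists, otherwise it calls the setup function to create a fresh one. The checkpoint path, batch interval, and socket source below are placeholder assumptions, not taken from any of the examples that follow.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

CHECKPOINT_DIR = "/tmp/streaming-checkpoint"  # placeholder path

def create_context():
    # Called only when no usable checkpoint exists in CHECKPOINT_DIR
    sc = SparkContext(appName="GetOrCreateDemo")
    ssc = StreamingContext(sc, 5)  # 5-second batches (arbitrary choice)
    ssc.checkpoint(CHECKPOINT_DIR)  # enable checkpointing so recovery is possible
    lines = ssc.socketTextStream("localhost", 9999)  # placeholder source
    lines.count().pprint()
    return ssc

# Recover from the checkpoint if present, otherwise build a new context
ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, create_context)
ssc.start()
ssc.awaitTermination()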

Example 1: test_transform_function_serializer_failure

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    def test_transform_function_serializer_failure(self):
        inputd = tempfile.mkdtemp()
        self.cpd = tempfile.mkdtemp("test_transform_function_serializer_failure")

        def setup():
            conf = SparkConf().set("spark.default.parallelism", 1)
            sc = SparkContext(conf=conf)
            ssc = StreamingContext(sc, 0.5)

            # A function that cannot be serialized
            def process(time, rdd):
                sc.parallelize(range(1, 10))

            ssc.textFileStream(inputd).foreachRDD(process)
            return ssc

        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        try:
            self.ssc.start()
        except:
            import traceback
            failure = traceback.format_exc()
            self.assertTrue(
                "It appears that you are attempting to reference SparkContext" in failure)
            return

        self.fail("using SparkContext in process should fail because it's not Serializable")
Author: JingchengDu, Project: spark, Lines: 29, Source: test_dstream.py
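Example 1 shows the failure mode: the foreachRDD function closes over the driver-side SparkContext, which cannot be serialized when the transform function is checkpointed. For contrast, a working counterpart is sketched below; it reuses the inputd and ssc names from the test's setup() but is not part of the test itself, and it refers only to its arguments, reaching the context through rdd.context at call time.

def process(time, rdd):
    # rdd.context is resolved when the batch runs, so the serialized
    # closure does not capture the outer, non-serializable sc variable
    helper = rdd.context.parallelize(range(1, 10))
    print("batch %s -> helper count %d" % (time, helper.count()))

ssc.textFileStream(inputd).foreachRDD(process)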

Example 2: main

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def main(conf):
    ssc = None
    if conf.CHECK_POINT:
        ssc = StreamingContext.getOrCreate(conf.CHECK_POINT_PATH, lambda: createContext(conf))
    else:
        ssc = createContext(conf)
    ssc.start()
    ssc.awaitTermination()
    return ssc
Author: sekaiamber, Project: KSE-Sample, Lines: 11, Source: submit.py

Example 3: createContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
        def createContext():
            uBATCH_INTERVAL = 10
            sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
            sc.broadcast(batchUserPostDict)
            sc.broadcast(batchPostUserDict)
            #sc = SparkContext("local[*]", appName="StreamingKafka")
            # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
            ssc = StreamingContext(sc, uBATCH_INTERVAL)
            ssc.checkpoint(CHECKPOINT_DIR)   # set checkpoint directory in HDFS
            #ssc.checkpoint(10 * uBATCH_INTERVAL)
            return ssc

        ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
Author: lingding0, Project: HottestTopicOnReddit, Lines: 15, Source: hotred_stream.py

Example 4: StreamingContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
        ssc = StreamingContext(sc, STREAMING_INTERVAL)
        lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

        ssc.checkpoint(CHECKPOINT_DIR)

        # main split-combine-apply logic put here
        pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
        runningCounts = pairs.updateStateByKey(updateFunction)

        sortedCounts = runningCounts.transform(lambda rdd: rdd.sortBy(lambda (airport, freq): freq, ascending=False))
        sortedCounts.foreachRDD(process)

        return ssc


ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
ssc.start()
finish_delay = 0
while True:
    res = ssc.awaitTerminationOrTimeout(2)
    if (dfstream_num >= TOTAL_FILES):
        finish_delay += 1

    # stop logic
    if res or empty_count >= 10 or finish_delay >= 10:
        # stopped elsewhere
        print 'Finished processing all {} files, terminate the streaming job!!!!!!'.format(TOTAL_FILES)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
        break

Author: paullo0106, Project: cloud_computing_capstone, Lines: 31, Source: streaming_consumer.py

Example 5: getDroppedWordsCounter

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
        # Get or register the droppedWordsCounter Accumulator
        droppedWordsCounter = getDroppedWordsCounter(rdd.context)

        # Use blacklist to drop words and use droppedWordsCounter to count them
        def filterFunc(wordCount):
            if wordCount[0] in blacklist.value:
                droppedWordsCounter.add(wordCount[1])
                return False
            else:
                return True

        counts = "Counts at time %s %s" % (time, rdd.filter(filterFunc).collect())
        print(counts)
        print("Dropped %d word(s) totally" % droppedWordsCounter.value)
        print("Appending to " + os.path.abspath(outputPath))
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    wordCounts.foreachRDD(echo)
    return ssc

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print("Usage: recoverable_network_wordcount.py <hostname> <port> "
              "<checkpoint-directory> <output-file>", file=sys.stderr)
        exit(-1)
    host, port, checkpoint, output = sys.argv[1:]
    ssc = StreamingContext.getOrCreate(checkpoint,
                                       lambda: createContext(host, int(port), output))
    ssc.start()
    ssc.awaitTermination()
Author: yuantuo, Project: pysparkexample, Lines: 33, Source: recoverableStream.py
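Example 5 calls getDroppedWordsCounter(rdd.context) and reads blacklist.value, but both helpers are defined outside the excerpt. A sketch of how such lazily-registered, driver-side singletons are commonly written follows; the helper names mirror the excerpt, while the blacklist contents and the globals-based caching are assumptions.

# Assumed helpers: create the Broadcast/Accumulator once per driver so they
# can be re-obtained after the context is recovered from a checkpoint.
def getWordBlacklist(sparkContext):
    if 'wordBlacklist' not in globals():
        globals()['wordBlacklist'] = sparkContext.broadcast(["a", "b", "c"])  # assumed contents
    return globals()['wordBlacklist']

def getDroppedWordsCounter(sparkContext):
    if 'droppedWordsCounter' not in globals():
        globals()['droppedWordsCounter'] = sparkContext.accumulator(0)
    return globals()['droppedWordsCounter']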

Example 6: createContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint

    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a+b)

    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()
    ssc.checkpoint(checkpointDir)
    return ssc

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: direct_kafka_wordcount_checkpoint.py <broker_list> <topic> <checkpointDir>", file=sys.stderr)
        exit(-1)

    brokers, topic, checkpointDir = sys.argv[1:]

    ssc = StreamingContext.getOrCreate(checkpointDir,
                                       lambda: createContext(brokers, topic, checkpointDir))
    ssc.start()
    ssc.awaitTermination()

Author: obaidcuet, Project: spark, Lines: 31, Source: direct_kafka_wordcount_with_checkpoint.py

Example 7: file

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    ArrDelay = arrived_data.map(lambda m: ((m.AirlineID, airline_lookup.value[str(m.AirlineID)]), m.ArrDelay))
    
    # sum elements and number of elements, to store them in a intermediate file (k, (sum(v), len(v)))
    collectDelays = ArrDelay.map(lambda (key, value): (key, [value])).reduceByKey(add).map(lambda (key, values): (key, [sum(values), len(values)]))
    
    #debug
    collectDelays.pprint()
    
    # Saving data in hdfs
    collectDelays.saveAsTextFiles(OUTPUT_DIR)
    

#main function
if __name__ == "__main__":
    # Configure Spark. Create a new context or restore from checkpoint
    ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, functionToCreateContext)
    
    # get this spark context
    sc = ssc.sparkContext
    
    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")

    # Create a Transformed DStream. Read Kafka from first offset
    # creating a stream
    # :param ssc:  StreamingContext object
    # :param zkQuorum:  Zookeeper quorum (hostname:port,hostname:port,..).
    # :param groupId:  The group id for this consumer.
    # :param topics:  Dict of (topic_name -> numPartitions) to consume.
    #                 Each partition is consumed in its own thread.
    # :param kafkaParams: Additional params for Kafka
Author: bunop, Project: ccc-capstone, Lines: 33, Source: top10_airlines.py
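The comment block at the end of Example 7 documents the parameters of KafkaUtils.createStream, but the excerpt stops before the call itself. A sketch of what that call typically looks like is shown below; the Zookeeper quorum, consumer group id, and topic name are placeholder assumptions.

from pyspark.streaming.kafka import KafkaUtils

# Placeholder connection details; the real values are not shown in the excerpt
zkQuorum = "zk-host1:2181,zk-host2:2181"
groupId = "top10-airlines-consumer"
topics = {"ontime": 1}  # topic_name -> number of partitions (one thread each)

kafkaStream = KafkaUtils.createStream(ssc, zkQuorum, groupId, topics)
lines = kafkaStream.map(lambda kv: kv[1])  # keep only the message values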

Example 8: list

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    )

    rankingProfitability = topTenProfitableAreas.transform(
        lambda rdd: rdd.map(lambda x: ("ranking2", x)).groupByKey().map(lambda x: (x[0], list(x[1])))
    )

    #    topTenWithDelays.map(lambda x: ('TOP-TEN-WITH-DELAYS', x)).pprint()

    status = rankingTripCount.updateStateByKey(updateFunc)
    status.pprint()
    status2 = rankingProfitability.updateStateByKey(updateFunc2)
    status2.pprint()

    return ssc


if __name__ == "__main__":
    if len(sys.argv) != 8:
        print(
            "Usage: spark_taxi.py <hostname> <port> " "<checkpoint-directory> <output-file1> <output-file2>",
            file=sys.stderr,
        )
        exit(-1)

    host, port, checkpoint, output1, output2, windowDuration, slideDuration = sys.argv[1:]
    ssc = StreamingContext.getOrCreate(
        checkpoint, lambda: createContext(host, int(port), output1, output2, int(windowDuration), int(slideDuration))
    )
    ssc.start()
    ssc.awaitTermination()
Author: ykwim, Project: scep, Lines: 32, Source: spark_taxi.py

Example 9: updateFunction

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def updateFunction(new_values, last_sum):
    return sum(new_values) + (last_sum or 0)


def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).updateStateByKey(updateFunction)
    counts.pprint()
    return ssc


ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)


ssc.start()
ssc.awaitTermination()
Author: bithu30, Project: myRepo, Lines: 28, Source: streamingWordCountWithState.py

Example 10: int

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    # temperature is inside the body as an amqp value
    return int(data["body"]["section"])

def getMax(a,b):
    if (a > b):
        return a
    else:
        return b

def createStreamingContext():
    conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature")
    conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")

    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/spark-streaming-amqp")

    receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature")

    temperature = receiveStream.map(getTemperature)
    max = temperature.reduceByWindow(getMax, None, 5, 5)

    max.pprint()

    return ssc

ssc = StreamingContext.getOrCreate("/tmp/spark-streaming-amqp", createStreamingContext)

ssc.start()
ssc.awaitTermination()
Author: eformat, Project: dstream-amqp, Lines: 32, Source: amqp_temperature.py

Example 11: SparkContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    print "Creating new context"
    sc = SparkContext(appName="StreamingWordCount")
    ssc = StreamingContext(sc, 2)

    lines = ssc.socketTextStream(host, port)

    def countWords(newValues, lastSum):
      if lastSum is None:
        lastSum = 0
      return sum(newValues, lastSum)  

    word_counts = lines.flatMap(lambda line: line.split(" "))\
                  .filter(lambda w: w.startswith("#"))\
                  .map(lambda word: (word, 1))\
                  .updateStateByKey(countWords)

    word_counts.pprint()

    return ssc


if __name__ == "__main__":

    host, port, checkpoint_dir = sys.argv[1:]

    print checkpoint_dir
    ssc = StreamingContext.getOrCreate(checkpoint_dir,
                                       lambda: create_context(host, int(port)))    
    ssc.start()
    ssc.awaitTermination()
Author: faameem, Project: apache, Lines: 32, Source: streamingtwitter-getorcreate.py

Example 12: updateFunction

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
def updateFunction(newValues, runningCount):
    if runningCount is None:
       runningCount = 0
    return sum(newValues, runningCount)

def partitionCount(datum):
    datatype = datum['dataunit'].keys()[0]
    if datatype.endswith('property'):
        return ('\\'.join((datatype,
            datum['dataunit'][datatype]['property'].keys()[0])), 1)
    else:
        return (datatype, 1)
	
if __name__ == "__main__":
	ssc = StreamingContext.getOrCreate(checkpointDir, buildContext)

	zkQuorum = "localhost:2181"
	kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {KAFKA_TOPIC: 1})
	#data stream of data dictionaries
	ds = kvs.map(lambda data: ast.literal_eval(data[1]))

	ds.pprint()
	pprint(type(ds))
	# print type(ds)
	
	data.append(ds)
	if len(data) > 0 and len(data) % (n*13) == 0:
		path = "..\\speed\\"
		name = "speed" + str(len(data)/n*13) + ".json"
		with open (path+name, 'w') as outfile:
Author: balajikvijayan, Project: DataEngineering, Lines: 32, Source: speed_consumer.py

Example 13: threaded_function

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    
    
    #meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)
    
    ssc.checkpoint(checkpointDirectory)   # set checkpoint directory
    return ssc


    
def threaded_function(arg):
    from rpyc.utils.server import ThreadedServer
    t = ThreadedServer(MyService, port = 18862, protocol_config = {"allow_all_attrs" : True, 'allow_pickle' : True})
    t.start()

if __name__ == "__main__":
    from threading import Thread
    
    thread = Thread(target = threaded_function, args=(None,))
    thread.start()

    conf = SparkConf().setMaster("localhost")
    ssc = StreamingContext.getOrCreate(checkpointDirectory, functionToCreateContext)
    ssc.start()
    ssc.awaitTermination()


    

Author: sklaw, Project: spark_project, Lines: 28, Source: stream_handler_v1.py

Example 14: test_get_or_create_and_get_active_or_create

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import getOrCreate [as alias]
    def test_get_or_create_and_get_active_or_create(self):
        inputd = tempfile.mkdtemp()
        outputd = tempfile.mkdtemp() + "/"

        def updater(vs, s):
            return sum(vs, s or 0)

        def setup():
            conf = SparkConf().set("spark.default.parallelism", 1)
            sc = SparkContext(conf=conf)
            ssc = StreamingContext(sc, 2)
            dstream = ssc.textFileStream(inputd).map(lambda x: (x, 1))
            wc = dstream.updateStateByKey(updater)
            wc.map(lambda x: "%s,%d" % x).saveAsTextFiles(outputd + "test")
            wc.checkpoint(2)
            self.setupCalled = True
            return ssc

        # Verify that getOrCreate() calls setup() in absence of checkpoint files
        self.cpd = tempfile.mkdtemp("test_streaming_cps")
        self.setupCalled = False
        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        self.assertTrue(self.setupCalled)

        self.ssc.start()

        def check_output(n):
            while not os.listdir(outputd):
                if self.ssc.awaitTerminationOrTimeout(0.5):
                    raise Exception("ssc stopped")
            time.sleep(1)  # make sure mtime is larger than the previous one
            with open(os.path.join(inputd, str(n)), 'w') as f:
                f.writelines(["%d\n" % i for i in range(10)])

            while True:
                if self.ssc.awaitTerminationOrTimeout(0.5):
                    raise Exception("ssc stopped")
                p = os.path.join(outputd, max(os.listdir(outputd)))
                if '_SUCCESS' not in os.listdir(p):
                    # not finished
                    continue
                ordd = self.ssc.sparkContext.textFile(p).map(lambda line: line.split(","))
                d = ordd.values().map(int).collect()
                if not d:
                    continue
                self.assertEqual(10, len(d))
                s = set(d)
                self.assertEqual(1, len(s))
                m = s.pop()
                if n > m:
                    continue
                self.assertEqual(n, m)
                break

        check_output(1)
        check_output(2)

        # Verify the getOrCreate() recovers from checkpoint files
        self.ssc.stop(True, True)
        time.sleep(1)
        self.setupCalled = False
        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.ssc.start()
        check_output(3)

        # Verify that getOrCreate() uses existing SparkContext
        self.ssc.stop(True, True)
        time.sleep(1)
        self.sc = SparkContext(conf=SparkConf())
        self.setupCalled = False
        self.ssc = StreamingContext.getOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.assertTrue(self.ssc.sparkContext == self.sc)

        # Verify the getActiveOrCreate() recovers from checkpoint files
        self.ssc.stop(True, True)
        time.sleep(1)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.ssc.start()
        check_output(4)

        # Verify that getActiveOrCreate() returns active context
        self.setupCalled = False
        self.assertEqual(StreamingContext.getActiveOrCreate(self.cpd, setup), self.ssc)
        self.assertFalse(self.setupCalled)

        # Verify that getActiveOrCreate() uses existing SparkContext
        self.ssc.stop(True, True)
        time.sleep(1)
        self.sc = SparkContext(conf=SparkConf())
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(self.cpd, setup)
        self.assertFalse(self.setupCalled)
        self.assertTrue(self.ssc.sparkContext == self.sc)

        # Verify that getActiveOrCreate() calls setup() in absence of checkpoint files
        self.ssc.stop(True, True)
#......... part of the code is omitted here .........
Author: JingchengDu, Project: spark, Lines: 103, Source: test_dstream.py


Note: The pyspark.streaming.StreamingContext.getOrCreate examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the License of the corresponding project. Do not reproduce without permission.