

Python StreamingContext.socketTextStream Method Code Examples

This article collects typical usage examples of the pyspark.streaming.StreamingContext.socketTextStream method in Python. If you are wondering how StreamingContext.socketTextStream is used in practice, or are looking for concrete examples of calling it, the curated code samples below should help. You can also browse further usage examples of pyspark.streaming.StreamingContext, the class this method belongs to.


Below are 15 code examples of the StreamingContext.socketTextStream method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
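Before the examples, here is a minimal, self-contained sketch of the typical socketTextStream pattern: a hypothetical word count. The host "localhost", port 9999, local[2] master, application name, and 1-second batch interval are illustrative assumptions, not taken from any of the examples below; a text source such as nc -lk 9999 is assumed to be running locally.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Local SparkContext with two worker threads and a 1-second batch interval
sc = SparkContext("local[2]", appName="SocketTextStreamSketch")
ssc = StreamingContext(sc, 1)

# Receive newline-delimited text from localhost:9999 (e.g. fed by nc -lk 9999)
lines = ssc.socketTextStream("localhost", 9999)

# Count the words in each batch and print the first ten results
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()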

Example 1: main

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                        '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
Developer: mattf, Project: sparkhara-sources, Lines of code: 37, Source: caravan_master.py

Example 2: createStreamingContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def createStreamingContext():

    # Create a local StreamingContext with two working thread and batch interval of 1 second
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20*60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20*60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
Developer: ecesena, Project: spark-tutorial, Lines of code: 35, Source: app.py

Example 3: main

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    sym_dict = {}
    conf = SparkConf().setAppName("symbol stream")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, .1)

    lines = ssc.socketTextStream("localhost", 1337)
    
    def print_now():
        print sym_dict

    def predict(prices):
        print prices

    def add_to_dict(line):
        symbol, price, volume = line.split(',') 
        if symbol in sym_dict:
            print 'made it here'
            sym_dict[symbol][0].append(price)
            sym_dict[symbol][1].append(volume)
            # keep only the 10 most recent price/volume entries for this symbol
            if len(sym_dict[symbol][0]) > 10:
                sym_dict[symbol][0].pop(0)
                sym_dict[symbol][1].pop(0)
                predict(sym_dict[symbol][0])
        else:
            sym_dict[symbol] = [[price],[volume]]
    
    
    #test = lines.map(lambda line: json.dumps(line)) 
    test = lines.map(lambda line: line)
    test.pprint()
    ssc.start()
    ssc.awaitTermination()
Developer: redame, Project: quote_streaming, Lines of code: 35, Source: spark_stream.py

Example 4: createContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def createContext(host, port, outputPath):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    print "Creating new context"
    if os.path.exists(outputPath):
        os.remove(outputPath)
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 120)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, port)
    print '\n\n\nconnectionMade\n\n\n'
    addresses = lines.map(splitLine)
    transcationsum = addresses.map(lambda x: (x[0], (1, x[1]))).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    def echo(time, rdd):
        counts = "Counts at time %s %s" % (time, rdd.collect())
        print counts
        print "Appending to " + os.path.abspath(outputPath)
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    transcationsum.foreachRDD(echo)
    return ssc
Developer: samchorlton, Project: python, Lines of code: 27, Source: bitcoin_ip_count.py

Example 5: start_spark_streaming

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def start_spark_streaming():
    question_db = QuestionDatabase(QB_QUESTION_DB)
    features = {name: instantiate_feature(name, question_db) for name in FEATURE_NAMES}

    sc = create_sc()
    b_features = sc.broadcast(features)
    ssc = StreamingContext(sc, 5)

    ssc.socketTextStream('localhost', 9999) \
        .repartition(QB_STREAMING_CORES - 1) \
        .flatMap(lambda line: generate_guesses(line, b_features)) \
        .map(lambda sg: evaluate_features(sg, b_features)) \
        .foreachRDD(score_and_save)

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
Developer: cequencer, Project: qb, Lines of code: 19, Source: streaming.py

Example 6: sparkTask

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def sparkTask():
    from textblob import TextBlob
    import re    
    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext
    sc = SparkContext()
    ssc = StreamingContext(sc, 1)
    quotes = ssc.socketTextStream("localhost", 9999)
    dataSentencesPolarity = quotes.map(lambda x: TextBlob(re.sub('[^A-Za-z0-9 \.\']+', '',x))).map(lambda y: (str(y.upper())[:60], y.sentiment.polarity))
    dataSentencesPolarity.pprint()
    ssc.start()             # Start the computation
    ssc.awaitTermination(20)  # Wait for the computation to terminate    
Developer: sh19871122, Project: TM_2014-2015S2, Lines of code: 14, Source: twitterTestStreaming.py

Example 7: __init__

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
class xStreamProcessor:
    ip = socket.gethostbyname(socket.gethostname())
    port = 9999
    dstream = None
    sc = None
    ssc = None

    #def __init__(self,ip=None,port=None,spark_master = 'spark://localhost:7077'):
    def __init__(self,ip=None,port=None,spark_master = 'mesos://10.0.2.85:5050'):
        if ip is not None:
            self.ip = ip
        if port is not None:
            self.port = port
        self.sc = SparkContext(master=spark_master,appName='StreamProcessor')
        self.ssc = StreamingContext(self.sc, 1)
        #self.ssc.checkpoint(directory=None)
        hiveContext = HiveContext(self.sc)
        hiveContext.sql('DROP TABLE IF EXISTS default.tweet_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.tweet_stream (ip STRING, port STRING, date_time STRING, user STRING, msg STRING)')

        hiveContext.sql('DROP TABLE IF EXISTS default.email_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_stream (ip STRING, port STRING, date_time STRING, \
        fr STRING,to STRING, subject STRING, content STRING, subject_sentiment INT, content_sentiment INT, \
        subject_power INT, content_power INT,  subject_topic INT, content_topic INT, fraud_score DOUBLE)')

        hiveContext.sql('DROP TABLE IF EXISTS default.email_graph')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_graph (fr STRING,to STRING, dt STRING)')

        hiveContext.sql('DROP TABLE IF EXISTS default.trans_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.trans_stream (ip STRING,port STRING, date_time STRING, user STRING, amount DOUBLE, \
        big_trans INT, is_in_odd_day INT, is_at_odd_time INT)')

        self.dstream = self.ssc.socketTextStream(self.ip, self.port)


        self.process_stream()

        self.ssc.start()
        self.ssc.awaitTermination()

    def process_stream(self):
        parts = self.dstream.flatMap(lambda line: line.split("|"))
        words = parts.map(lambda p: p[3])
        pairs = words.map(lambda word: (word, 1))
        wordCounts = pairs.reduceByKey(lambda x, y: x + y)

        # Print the first ten elements of each RDD generated in this DStream to the console
        wordCounts.pprint()
Developer: jingfengli, Project: hunting_criminals_demo, Lines of code: 50, Source: StreamProcessor.py

Example 8: functionToCreateContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def functionToCreateContext():
    sc = SparkContext("local[*]", "streaming_part")
    sc.setLogLevel("ERROR")
    
    ssc = StreamingContext(sc, 5)
    
    data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999)
    data_from_camera_mechine = ssc.socketTextStream("localhost", 9998)
    
    
    #meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)
    
    ssc.checkpoint(checkpointDirectory)   # set checkpoint directory
    return ssc
Developer: sklaw, Project: spark_project, Lines of code: 18, Source: stream_handler_v1.py

Example 9: main

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():

    master = 'local[2]'
    app_name = 'reduce_demo1'

    # print(range(0, 3))

    sc = SparkContext(master, app_name)
    ssc = StreamingContext(sc, 15)

    host = 'localhost'
    port = 9999
    stream = ssc.socketTextStream(host, port)
    stream.foreachRDD(fun_union_in_dstream_foreachRDD)

    ssc.start()
    ssc.awaitTermination()
Developer: tsingfu, Project: xuetangx-streaming-app, Lines of code: 19, Source: test_streaming_reduce_demo1.py

Example 10: main

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                        'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model',
                        help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()
    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)
    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url,
                                                 rest_url, som))

    ssc.start()
    ssc.awaitTermination()
Developer: sparkhara, Project: whirlwind-caravan, Lines of code: 47, Source: whirlwind_caravan.py

Example 11: consumer

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def consumer():
    def process(time, rdd):
        global words
        words += Counter(dict(rdd.collect()))

    sc = SparkContext(appName='graaftel')
    ssc = StreamingContext(sc, 5)

    lines = ssc.socketTextStream(os.getenv('PRODUCER_SERVICE_HOST', 'localhost'),
                                 int(os.getenv('PRODUCER_SERVICE_PORT', 8080)))
    counts = lines.flatMap(lambda line: line.encode('ascii', 'ignore').lower().split()) \
                  .map(lambda word: word.translate(None, string.punctuation)) \
                  .filter(lambda word: word not in stop_words) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(add)
    counts.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
Developer: sparkhara, Project: graaftel, Lines of code: 21, Source: app.py

Example 12: create_context

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def create_context(host, port):
    print "Creating new context"
    sc = SparkContext(appName="StreamingWordCount")
    ssc = StreamingContext(sc, 2)

    lines = ssc.socketTextStream(host, port)

    def countWords(newValues, lastSum):
      if lastSum is None:
        lastSum = 0
      return sum(newValues, lastSum)  

    word_counts = lines.flatMap(lambda line: line.split(" "))\
                  .filter(lambda w: w.startswith("#"))\
                  .map(lambda word: (word, 1))\
                  .updateStateByKey(countWords)
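    # updateStateByKey keeps a running total per hashtag across batches; stateful operations
    # require checkpointing (ssc.checkpoint), which this snippet leaves to the surrounding code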

    word_counts.pprint()

    return ssc
Developer: faameem, Project: apache, Lines of code: 22, Source: streamingtwitter-getorcreate.py

Example 13: createContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def createContext(host, port, outputPath):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    print("Creating new context")
    if os.path.exists(outputPath):
        os.remove(outputPath)
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, port)
    words = lines.flatMap(lambda line: line.split(" "))
    wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

    def echo(time, rdd):
        # Get or register the blacklist Broadcast
        blacklist = getWordBlacklist(rdd.context)
        # Get or register the droppedWordsCounter Accumulator
        droppedWordsCounter = getDroppedWordsCounter(rdd.context)

        # Use blacklist to drop words and use droppedWordsCounter to count them
        def filterFunc(wordCount):
            if wordCount[0] in blacklist.value:
                droppedWordsCounter.add(wordCount[1])
                return False
            else:
                return True

        counts = "Counts at time %s %s" % (time, rdd.filter(filterFunc).collect())
        print(counts)
        print("Dropped %d word(s) totally" % droppedWordsCounter.value)
        print("Appending to " + os.path.abspath(outputPath))
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    wordCounts.foreachRDD(echo)
    return ssc
Developer: yuantuo, Project: pysparkexample, Lines of code: 40, Source: recoverableStream.py

Example 14: StreamingContext

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
ssc = StreamingContext(sc, 5)

def add_tuples(a, b):
    return tuple(sum(p) for p in zip(a,b))

def calc_values(sumx,sumy,sumxy,sumxsquare,n):
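    # least-squares fit over the batch sums: beta = cov(x, y) / var(x), alpha = mean(y) - beta * mean(x)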
    xmean=sumx/n
    ymean=sumy/n
    xymean=sumxy/n
    xsquaremean=sumxsquare/n
    beta=(xymean-(xmean*ymean))/(xsquaremean-(xmean*xmean))
    alpha=ymean-(beta*xmean)    
    return (alpha,beta)

def stream_function(points):
    points_float=points.map(lambda (x,y):(float(x),float(y)))    
    points_mean = points_float.map(lambda (x,y): (x, y, (x*y), (x*x), 1)).reduce(add_tuples)
    meanvalues=calc_values(*points_mean)
    alpha=meanvalues[0]
    beta=meanvalues[1]
    rdd = sc.parallelize([(alpha,beta)], numSlices=1)
    rdd.saveAsTextFile(output + '/' + datetime.datetime.now().isoformat().replace(':', '-'))
    

lines = ssc.socketTextStream("cmpt732.csil.sfu.ca", inputs)

points = lines.map(lambda line: line.split())
points.foreachRDD(stream_function)
ssc.start()
ssc.awaitTermination(timeout=300)
Developer: amitiwari, Project: DataFrameStreaming, Lines of code: 32, Source: read_stream.py

Example 15: globals

# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: sql_network_wordcount.py <hostname> <port> ", file=sys.stderr)
        exit(-1)
    host, port = sys.argv[1:]
    sc = SparkContext(appName="PythonSqlNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, int(port))
    words = lines.flatMap(lambda line: line.split(" "))

    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            # Get the singleton instance of SQLContext
            sqlContext = getSqlContextInstance(rdd.context)

            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(word=w))
            wordsDataFrame = sqlContext.createDataFrame(rowRdd)

            # Register as table
Developer: 91Phoenix, Project: spark, Lines of code: 33, Source: sql_network_wordcount.py


Note: The pyspark.streaming.StreamingContext.socketTextStream method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by many developers, and copyright of the source code remains with the original authors; please consult the corresponding project's License before distributing or using the code. Do not reproduce without permission.