Python StreamingContext.textFileStream Method Code Examples

This article collects typical code examples of the Python method pyspark.streaming.StreamingContext.textFileStream. If you are wondering what exactly StreamingContext.textFileStream does, how to use it, or what working examples of it look like, the curated samples below should help. You can also explore further usage examples of the class this method belongs to, pyspark.streaming.StreamingContext.


Fifteen code examples of the StreamingContext.textFileStream method are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python code examples.
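
Before diving into the examples, here is a minimal, self-contained sketch of the basic contract of textFileStream: it monitors a directory and processes only the files created in, or atomically moved into, that directory after the stream starts. The directory name below is a placeholder, not taken from any of the projects that follow.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "textFileStreamDemo")
ssc = StreamingContext(sc, 5)                             # 5-second batch interval

# Each new text file dropped into this directory becomes part of the next batch
lines = ssc.textFileStream("/tmp/textfilestream-demo")    # placeholder directory
lines.count().pprint()                                    # print the line count of every batch

ssc.start()
ssc.awaitTermination()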

Example 1: setup

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)

    # A function that cannot be serialized (it closes over the SparkContext)
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
Author: JingchengDu, Project: spark, Lines: 13, Source: test_dstream.py
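
A setup function of this shape is normally passed to StreamingContext.getOrCreate, which rebuilds the context from a checkpoint directory when one exists and calls the setup function otherwise. A minimal sketch of that pattern, assuming placeholder paths (the checkpoint and input directories below are not taken from the test above):

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

checkpoint_dir = "/tmp/streaming-checkpoint"          # placeholder path

def setup():
    sc = SparkContext(conf=SparkConf().set("spark.default.parallelism", 1))
    ssc = StreamingContext(sc, 0.5)
    ssc.checkpoint(checkpoint_dir)                    # enable checkpointing so recovery is possible
    ssc.textFileStream("/tmp/stream-input").pprint()  # placeholder input directory
    return ssc

# Recover from the checkpoint if present, otherwise build a fresh context via setup()
ssc = StreamingContext.getOrCreate(checkpoint_dir, setup)
ssc.start()
ssc.awaitTerminationOrTimeout(10)
ssc.stop()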

Example 2: createContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

    ssc.checkpoint(CHECKPOINT_DIR)

    # main split-combine-apply logic goes here
    pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
    runningCounts = pairs.updateStateByKey(updateFunction)

    # sort by running count, descending (tuple-unpacking lambdas are Python 2 only)
    sortedCounts = runningCounts.transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))
Author: paullo0106, Project: cloud_computing_capstone, Lines: 17, Source: streaming_consumer.py
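
The updateFunction passed to updateStateByKey is not included in this snippet. A typical state-update function for a running count looks like the following sketch; it is an assumption about the elided helper, not the project's actual code:

def updateFunction(newValues, runningCount):
    # newValues: the list of 1s seen for this key in the current batch
    # runningCount: the accumulated count from previous batches (None the first time the key is seen)
    return sum(newValues) + (runningCount or 0)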

Example 3: createContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))

    ssc.checkpoint(CHECKPOINT_DIR)

    # main split-combine-apply logic goes here
    # filter out the header and other invalid rows
    rdd = lines.map(lambda line: line.split(',')).filter(lambda words: len(words) > 56)

    # extract the first field (for filtering the header), Carrier, Orig, Dest, and the delay fields
    rdd2 = rdd.map(lambda x: (x[0], x[8], x[11], x[18], x[52], x[53], x[54], x[55], x[56])) \
              .map(lambda line: [str(w.replace('"', '')) for w in line]) \
              .filter(lambda row: row[0] != 'Year' and any(row[4:]))
    rdd2.pprint()

    # sum up the delay fields for each row
    sum_delay_rdd = rdd2.map(sum_delay)
    sum_delay_rdd.pprint()

    # accumulate delays for each (orig, dest, carrier) key
    combined_rdd = sum_delay_rdd.updateStateByKey(updateFunction)
    combined_rdd.pprint()

    # calculate the average delay per key (tuple-unpacking lambdas are Python 2 only, so index explicitly)
    avg_rdd = combined_rdd.transform(
        lambda rdd: rdd.map(lambda kv: ((kv[0][0], kv[0][1]), (kv[1][0] / float(kv[1][1]), kv[0][2]))))
    avg_rdd.pprint()

    # group by (orig, dest)
    avg_rdd_by_route = avg_rdd.groupByKey()

    # sort carriers by on-time performance for each (orig, dest) route and take the top 10
    route_sorted_carrier = avg_rdd_by_route.mapValues(lambda x: sorted(list(x))[:10])
    aa = route_sorted_carrier.flatMapValues(lambda x: x)

    aa.pprint()
    aa.foreachRDD(process)

    return ssc
Author: paullo0106, Project: cloud_computing_capstone, Lines: 42, Source: streaming2-3.py
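
The sum_delay, updateFunction, and process helpers are defined elsewhere in the project and are not part of this snippet. Based on how their outputs are used above (updateStateByKey keyed by a 3-tuple, then an average computed from a (total, count) pair), a sum_delay of roughly this shape would fit; this is a hypothetical reconstruction, not the project's actual code:

def sum_delay(row):
    # row = (Year, Carrier, Orig, Dest, delay1, ..., delay5), all as strings
    carrier, orig, dest = row[1], row[2], row[3]
    total_delay = sum(float(d) for d in row[4:] if d)   # skip empty delay fields
    # key by (orig, dest, carrier); value is (delay_sum, flight_count) for later averaging
    return ((orig, dest, carrier), (total_delay, 1))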

Example 4: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, 2)   # Create a streaming context with a batch interval of 2 seconds
ssc.checkpoint("checkpoint")

####### Example #######
# map() and reduceByKey() on DStream
# Run this example, and then copy the log file into the watched directory (see the note below)
directory = sys.argv[1]
print(directory)

# create DStream from text file
# Note: the spark streaming checks for any updates to this directory.
# So first, start this program, and then copy the log file logs/access_log.log to 'directory' location
log_data = ssc.textFileStream(directory)

# Parse each line using a utility class
access_log_dstream = log_data.map(ApacheAccessLog.parse_from_log_line).filter(lambda parsed_line: parsed_line is not None)

# map each ip with value 1. So the stream becomes (ip, 1)
ip_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, 1)) 

ip_count = ip_dstream.reduceByKey(lambda x,y: x+y)
ip_count.pprint(num = 30)



####### Example #######
# Join two DStreams
Author: WeiFoo, Project: CSC591_ADBI, Lines: 31, Source: stream2.py
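
The snippet ends right where the join example would begin. Joining two DStreams of key/value pairs uses DStream.join; here is a brief sketch assuming both streams are keyed by IP address (the second stream and its source directory are illustrative, not from the original file):

# Hypothetical second stream keyed by IP, e.g. (ip, country) pairs parsed from another directory
geo_dstream = ssc.textFileStream(sys.argv[2]) \
                 .map(lambda line: line.split(",")) \
                 .map(lambda fields: (fields[0], fields[1]))

# join() pairs up values that share a key within the same batch: (ip, (count, country))
ip_count_with_geo = ip_count.join(geo_dstream)
ip_count_with_geo.pprint()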

Example 5: SparkConf

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
To run
  ./pyspark.submit.sh spark-streaming-foreachRDD-and-foreachPartition.py
"""

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

from quiet_logs import quiet_logs

if __name__ == "__main__":
    conf = SparkConf().setAppName("Reading files from a directory")
    sc   = SparkContext(conf=conf)
    ssc  = StreamingContext(sc, 2)

    quiet_logs(sc)

    lines = ssc.textFileStream('./streamingData')

    # Split each line into words
    words = lines.flatMap(lambda line: line.split(" "))

    # Count each word in each batch
    pairs = words.map(lambda word: (word, 1))

    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    # Print the first ten elements of each RDD generated in this DStream to the console
    wordCounts.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
Author: danielsan, Project: Spark-Streaming-Examples, Lines: 33, Source: spark-streaming-reading-files-from-a-folder.py

Example 6: getText

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def getText(line):
  j = json.loads(line)
  return (j["created_at"], j["id"], j["text"])

def postToSolr(t, rdd):
  solr_url = "http://auisbigdatabox.cloudapp.net:8983/solr/project/update?commitWithin=5000"
  d = json.dumps(rdd.map(lambda x: x).collect())
  print(d)
  h = {'content-type': 'application/json'}
  r = requests.post(solr_url, data=d, headers=h)
  print(r.status_code)
  #r2 = requests.get("http://auisbigdatabox.cloudapp.net:8983/solr/admin/cores?action=RELOAD&core=project_shard1_replica1")
  #print(r2.status_code)


sc = SparkContext()

dwsent = dict(sc.textFile("/data/dictionary.tsv").map(lambda line: line.split("\t")) \
  .map(lambda line: (line[2], line[5])).collect())
shows = dict(sc.textFile("/data/shows.txt").map(lambda line: (line.lower().encode('ascii'), line)).collect())


ssc = StreamingContext(sc, 1)
ds = ssc.textFileStream("/project/rawdata")
ods = ds.window(30, 30).map(lambda line: getText(line)) \
  .map(lambda line: {"created_dt": line[0], "show_s": getShow(line[2]), "sentiment_s": getLineSentiment(line[2]), "text_t": line[2]})
ods.foreachRDD(postToSolr)
  
ssc.start()
ssc.awaitTermination()
Author: auimendoza, Project: coursework, Lines: 32, Source: readstream.py
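
The getShow and getLineSentiment helpers, which presumably use the shows and dwsent dictionaries loaded above, are not part of this snippet. A hypothetical reconstruction consistent with how they are called (both take the tweet text and return a string) might look like this; it is an illustration, not the project's actual code:

def getShow(text):
    # return the first known show name mentioned in the tweet, if any
    lowered = text.lower()
    for key, name in shows.items():
        if key in lowered:
            return name
    return "unknown"

def getLineSentiment(text):
    # tally dictionary sentiment labels for the words in the tweet
    words = text.lower().split()
    pos = sum(1 for w in words if dwsent.get(w) == "positive")
    neg = sum(1 for w in words if dwsent.get(w) == "negative")
    if pos > neg:
        return "positive"
    if neg > pos:
        return "negative"
    return "neutral"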

Example 7: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
# $example off$
from Functions import parsePoint, Nlabels2, makePredOVO

if __name__ == "__main__":

    sc = SparkContext('local[*]',appName="PythonLogisticRegressionWithLBFGSExample")
    ssc = StreamingContext(sc, 1)
    sqlContext = SQLContext(sc)

    labels = ['Politics','Finance','Sports','Sci&Tech','Entertainment','Crime']
    labels_num = [[0.0,1.0],[0.0,2.0],[0.0,3.0],[0.0,4.0],[0.0,5.0],[1.0,2.0],[1.0,3.0],[1.0,4.0],[1.0,5.0],
                  [2.0,3.0],[2.0,4.0],[2.0,5.0],[3.0,4.0],[3.0,5.0],[4.0,5.0]]
    data = ssc.textFileStream("/training/data/dir") #SPECIFY THE TRAINING DATA DIRECTORY HERE
    testData = ssc.textFileStream("/testing/data/dir") #SPECIFY THE TESTING DATA DIRECTORY HERE
    data = data.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
    testData = testData.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
    #Model details
    num_features = 300
    #model_name = "Models\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin"
    #model = Word2Vec.load_word2vec_format(model_name, binary=True)
    #model.init_sims(replace=True)
    # model_name = "../Models/ModelforStreaming300format_label" # Word2Vec Model
    # model = Word2Vec.load_word2vec_format(model_name)
    model_name = "../Models/ModelforStreaming300_label" # Word2Vec Model
    model = Word2Vec.load(model_name)
    index2word_set = set(model.index2word)
    f = lambda j: parsePoint(j, index2word_set,model, num_features)
    parsedData = data.map(f).cache()
Author: PirayaW, Project: twitter-news-classification-spark, Lines: 33, Source: StreamingLogReg.py
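
The snippet stops after parsing the training stream. With the streaming MLlib API imported above, training and prediction on DStreams typically follow the pattern sketched below; the step size, iteration count, and the assumption that parsePoint yields LabeledPoint objects are placeholders, not details from the original script:

# Continuously update the model on each incoming training batch
model = StreamingLinearRegressionWithSGD(stepSize=0.1, numIterations=50)
model.setInitialWeights([0.0] * num_features)
model.trainOn(parsedData)

# Score each incoming test batch; predictOnValues keeps the key (here, the true label)
parsedTest = testData.map(f)
predictions = model.predictOnValues(parsedTest.map(lambda lp: (lp.label, lp.features)))
predictions.pprint()

ssc.start()
ssc.awaitTermination()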

Example 8: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from __future__ import print_function
__author__ = 'abc'
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
if __name__ == "__main__":

    sc = SparkContext("local[2]",appName="PythonStreamingHDFSWordCount")
    ssc = StreamingContext(sc, 1)
    lines = ssc.textFileStream('/Volumes/work/data/kaggle/test/')
    lines.pprint()
    counts = lines.flatMap(lambda line: line.split(" "))\
                  .map(lambda x: (x, 1))\
                  .reduceByKey(lambda a, b: a+b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
Author: abhishek-ch, Project: evolveML, Lines: 18, Source: BasicRead.py

Example 9: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from blockchain import *

logFile = "logfile"  # Should be some file on your system
sc = SparkContext("local[1]", "Simple App")

logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

ssc = StreamingContext(sc, 1)

blocks = ssc.textFileStream("file:///Users/jeremyrubin/Desktop/sparkybitcoin/test")
# blocks = sc.wholeTextFiles("blocks")

blockJSONs = blocks.map(lambda c: Block.of_string(c, 0))


# blocks.pprint()

n_txs_by_n_inputs = (
    blockJSONs.flatMap(lambda f: f.txns.to_list()).map(lambda x: (x.tx_in_count, 1)).reduceByKey(lambda x, y: x + y)
)
n_txs_by_n_inputs.pprint()  # saveAsTextFiles("outputs/HE", "txt")
# print n_txs_by_n_inputs.collect()


ssc.start()  # Start the computation
Author: JeremyRubin, Project: BTCSpark, Lines: 33, Source: sparkstreamingblock.py

Example 10: globals

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']

#
# main program
#
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: pubnub_dir_streaming data_dir ", file=sys.stderr)
        exit(-1)
    data_dir = sys.argv[1]
    sc = SparkContext(appName="pubnub_dir_streaming_app")
    ssc = StreamingContext(sc, 30)

    devicesRDD = ssc.textFileStream(data_dir)

    # For each RDD of the DStream, insert every collected record into the configured databases
    def process(time, rdd):
        try:
            for item in rdd.collect():
                insert_into_dbs(["Cassandra"], item)
        except Exception as e:
            print(traceback.format_exc())
    #
    # process each RDD on the driver side
    #
    devicesRDD.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
Author: agilemobiledev, Project: examples-2, Lines: 32, Source: pubnub_dir_streaming.py

Example 11: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
# -*- coding:utf-8 -*-
from pyspark.context import SparkContext
from pyspark.streaming import StreamingContext
from pyspark import StorageLevel

sc = SparkContext("yarn-client", "Chap7-2")
ssc = StreamingContext(sc, 10)

lines = ssc.textFileStream("hdfs:///user/y_tadayasu/data/") 

words = lines.flatMap(lambda line: line.split(" ")).filter(lambda x:x)
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)

wordCounts.pprint()

ssc.start()             
ssc.awaitTermination() 
Author: tadayasu, Project: spark-sample-program, Lines: 20, Source: chap7-2.py

Example 12: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
    from pyspark import SparkContext, SQLContext
    from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.streaming import StreamingContext

    sc = SparkContext('local[*]', appName="PythonLogisticRegressionWithLBFGSExample")
    ssc = StreamingContext(sc, 30)
    sqlContext = SQLContext(sc)

    labels = ['Politics', 'Finance', 'Sports', 'Sci&Tech', 'Entertainment', 'Crime']
    labels_num = [[0.0, 1.0], [0.0, 2.0], [0.0, 3.0], [0.0, 4.0], [0.0, 5.0],
                  [1.0, 2.0], [1.0, 3.0], [1.0, 4.0], [1.0, 5.0],
                  [2.0, 3.0], [2.0, 4.0], [2.0, 5.0],
                  [3.0, 4.0], [3.0, 5.0],
                  [4.0, 5.0]]
    data = ssc.textFileStream("streaming_data/")  # SPECIFY THE TRAINING DATA DIRECTORY HERE
    testData = ssc.textFileStream("streaming_test_data/")  # SPECIFY THE TESTING DATA DIRECTORY HERE
    unlabelledData = ssc.textFileStream("streaming_unlabelled_data/")
    signal = ssc.textFileStream("streaming_signal/")
    data = data.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
    testData = testData.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
    unlabelledData = unlabelledData.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))

    clear = False
    num_features = 300
    googleModel = False      # change model here
    if googleModel:
        # model_name = "Models\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin"
        model_name = "Models/GoogleNews-vectors-negative300.bin"
        word2vec_model = Word2Vec.load_word2vec_format(model_name, binary=True)
        # model.init_sims(replace=True)
Author: PirayaW, Project: twitter-news-classification-spark, Lines: 33, Source: streamingModel.py

Example 13: SparkContext

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = (SparkConf()
         .setMaster("local[4]")
         .setAppName("MyApp")
         .set("spark.executor.memory", "1g")
         .set('spark.local.dir', './target/tmp'))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc, 10)

words = ssc.textFileStream("./gen/data/nums")
#print('count', words.count())

root = words.map(lambda line: (int(line), 1))

# DStreams are written out with saveAsTextFiles(prefix[, suffix]), one output directory per batch
root.saveAsTextFiles('./target/result3')

#root.reduceByKey(lambda a, b: a + b) \
#  .saveAsTextFiles('./target/result')

#root.reduceByKey(lambda a, b: a * b) \
#  .saveAsTextFiles('./target/result2')

# Note: the original script stops the context without ever calling ssc.start(),
# so no batches are actually processed.
ssc.stop()
Author: murer, Project: sandbox, Lines: 27, Source: count-nums-streamming.py

Example 14: VectorManager

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
if __name__ == "__main__":

    vm = VectorManager(None)
    vm.load()    
    dc = DataClassifier(None, None)
    dc.loadModel()
    
    sc = SparkContext(appName="PythonStreaming")
    ssc = StreamingContext(sc, 30)
    
    streamPath = 'hdfs://157.26.83.52/user/wdroz/stream'    
    #streamPath = '/tmp/stream'    
    '''
    Data each line is {text : 'bla bla bla', source : 'bla bla', date : '2015-30-03 08:53:00'}
    '''
    lines = ssc.textFileStream(streamPath)
    
    dico = lines.map(lambda x: literal_eval(x))
    
    lema = dico.map(lambda x: (x, lemmatize(TextBlob(x['text']))))
    
    vect = lema.mapValues(lambda x: vm.processList(x))
    
    pred = vect.mapValues(lambda x: dc.predict(x))
    
    pred.foreachRDD(printPrediction)
        
    
    ssc.start()
    ssc.awaitTermination()
Author: sh19871122, Project: TM_2014-2015S2, Lines: 32, Source: mainStream.py

Example 15: classify

# Module to import: from pyspark.streaming import StreamingContext [as alias]
# Or alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def classify(x):
    # x is an RDD of preprocessed feature rows; collect it and score each record individually
    sample = x.collect()
    for i in sample:
        answer = model.predict(i)
        print("===============================")
        print(answer)


if __name__ == "__main__":
    sc = SparkContext("local[2]", "streaming_gbts")

    # load the trained GBTs model
    model = parse_xiaodai_streaming.load_model(GradientBoostedTreesModel, path="$SPARK_HOME/gbts_xiaodai_1")

    # create a local StreamingContext with two working threads
    # and a batch interval of 1 second
    ssc = StreamingContext(sc, 1)

    data_process = parse_xiaodai_streaming.data_process
    file_stream = ssc.textFileStream("/data/mllib/streaming/")
    col_rm = [0, 1, 2, 4, 25, 26] + list(range(19, 24))
    testData = file_stream.map(lambda line: data_process(line, col_rm))

    testData.pprint()
    testData.foreachRDD(classify)
    # print(parse_xiaodai_streaming.evaluat_model(model, testData))

    ssc.start()
    ssc.awaitTermination(100)
Author: yidun55, Project: mllib, Lines: 31, Source: xiaodai_streaming_hdfs_gbts.py


Note: The pyspark.streaming.StreamingContext.textFileStream examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers; the source code is copyrighted by its original authors, and its distribution and use are governed by each project's own license. Do not reproduce this article without permission.