This article compiles typical usage examples of the Python method pyspark.streaming.StreamingContext.textFileStream. If you are wondering exactly what StreamingContext.textFileStream does in Python, or how to use it, the curated code examples below should help. You can also read further about the class it belongs to, pyspark.streaming.StreamingContext.
The following shows 15 code examples of StreamingContext.textFileStream, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
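Before the individual examples, here is a minimal, self-contained sketch of how textFileStream is typically wired into a word-count streaming job; the directory path, application name, and 5-second batch interval are illustrative placeholders rather than values taken from any example below.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Minimal sketch: watch a directory for new text files and count words per batch.
# "/tmp/stream-input" and the 5-second batch interval are illustrative placeholders.
sc = SparkContext("local[2]", appName="TextFileStreamSketch")
ssc = StreamingContext(sc, 5)

lines = ssc.textFileStream("/tmp/stream-input")   # only files added after start() are picked up
counts = (lines.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))
counts.pprint()

ssc.start()
ssc.awaitTermination()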
Example 1: setup
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def setup():
    conf = SparkConf().set("spark.default.parallelism", 1)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 0.5)

    # A function that cannot be serialized
    def process(time, rdd):
        sc.parallelize(range(1, 10))

    ssc.textFileStream(inputd).foreachRDD(process)
    return ssc
Example 2: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)
    # main split-combine-apply logic goes here
    pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
    runningCounts = pairs.updateStateByKey(updateFunction)
    sortedCounts = runningCounts.transform(
        lambda rdd: rdd.sortBy(lambda airport_freq: airport_freq[1], ascending=False))
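Example 2 relies on an updateFunction helper that the excerpt does not show. A minimal sketch of what such an updateStateByKey state-update function usually looks like for running counts (an assumption, not code from the original project):

# Hypothetical state-update helper for updateStateByKey (not part of the original excerpt).
# new_values: values that arrived for a key in the current batch; running_count: prior state, or None on the first batch.
def updateFunction(new_values, running_count):
    if running_count is None:
        running_count = 0
    return sum(new_values) + running_count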
Example 3: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)
    # main split-combine-apply logic goes here
    # filter out the header and other invalid rows
    rdd = lines.map(lambda line: line.split(',')).filter(lambda words: len(words) > 56)
    # extract the first field (for filtering the header), Carrier, Orig, Dest, and the delay fields
    rdd2 = rdd.map(lambda x: (x[0], x[8], x[11], x[18], x[52], x[53], x[54], x[55], x[56])) \
              .map(lambda line: [str(w.replace('"', '')) for w in line]) \
              .filter(lambda row: row[0] != 'Year' and any(row[4:]))
    rdd2.pprint()
    # sum up the delay fields for each row
    sum_delay_rdd = rdd2.map(sum_delay)
    sum_delay_rdd.pprint()
    # accumulate the delay for each (orig, dest, carrier) key
    combined_rdd = sum_delay_rdd.updateStateByKey(updateFunction)
    combined_rdd.pprint()
    # calculate the average delay
    avg_rdd = combined_rdd.transform(
        lambda rdd: rdd.map(lambda kv: ((kv[0][0], kv[0][1]), (kv[1][0] / float(kv[1][1]), kv[0][2]))))
    avg_rdd.pprint()
    # group by (orig, dest)
    avg_rdd_by_route = avg_rdd.groupByKey()
    # sort carriers by on-time performance for each (orig, dest) route and take the top 10
    route_sorted_carrier = avg_rdd_by_route.mapValues(lambda x: sorted(list(x))[:10])
    aa = route_sorted_carrier.flatMapValues(lambda x: x)
    aa.pprint()
    aa.foreachRDD(process)
    return ssc
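Example 3 additionally uses a sum_delay helper that is not shown. Judging from how its output is consumed (state values averaged as total/count, keys regrouped by (orig, dest) with the carrier carried along), a plausible sketch could look like the following; the exact field handling is an assumption:

# Hypothetical sum_delay helper (not included in the original excerpt).
# Assumed row layout from rdd2 above: [Year, Carrier, Orig, Dest, delay1 ... delay5]
def sum_delay(row):
    total = sum(float(d) for d in row[4:] if d)   # add up the non-empty delay fields
    # key by (Orig, Dest, Carrier); value is (total_delay, 1) so sums and counts can be accumulated
    return ((row[2], row[3], row[1]), (total, 1))

Under that assumption, Example 3's updateFunction would accumulate these (sum, count) pairs element-wise, unlike the simple running-count version sketched after Example 2.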
Example 4: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 2)  # Create a streaming context with a batch interval of 2 seconds
ssc.checkpoint("checkpoint")
####### Example . #######
# map() and reduceByKey() on DStream
# Run this example, and then copy the file to
directory = sys.argv[1]
print(directory)
# create DStream from text file
# Note: Spark Streaming checks this directory for any new files.
# So first start this program, then copy the log file logs/access_log.log to the 'directory' location.
log_data = ssc.textFileStream(directory)
# Parse each line using a utility class
access_log_dstream = log_data.map(ApacheAccessLog.parse_from_log_line).filter(lambda parsed_line: parsed_line is not None)
# map each ip with value 1. So the stream becomes (ip, 1)
ip_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, 1))
ip_count = ip_dstream.reduceByKey(lambda x,y: x+y)
ip_count.pprint(num = 30)
####### Example #######
# Join two DStreams
Example 5: SparkConf
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
To run
./pyspark.submit.sh spark-streaming-foreachRDD-and-foreachPartition.py
"""
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from quiet_logs import quiet_logs
if __name__ == "__main__":
conf = SparkConf().setAppName("Reading files from a directory")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 2)
quiet_logs(sc)
lines = ssc.textFileStream('./streamingData')
# Split each line into words
words = lines.flatMap(lambda line: line.split(" "))
# Count each word in each batch
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)
# Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.pprint()
ssc.start() # Start the computation
ssc.awaitTermination() # Wait for the computation to terminate
Author: danielsan | Project: Spark-Streaming-Examples | Lines: 33 | Source file: spark-streaming-reading-files-from-a-folder.py
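The quiet_logs helper imported in Example 5 is not shown. A common implementation simply raises the log4j log level through the SparkContext's JVM gateway, much like Example 9 below does inline; a sketch under that assumption:

# Hypothetical quiet_logs helper (not included in the excerpt); silences Spark's INFO/WARN chatter.
def quiet_logs(sc):
    logger = sc._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
    logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)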
Example 6: getText
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def getText(line):
    j = json.loads(line)
    return (j["created_at"], j["id"], j["text"])

def postToSolr(t, rdd):
    solr_url = "http://auisbigdatabox.cloudapp.net:8983/solr/project/update?commitWithin=5000"
    d = json.dumps(rdd.map(lambda x: x).collect())
    print(d)
    h = {'content-type': 'application/json'}
    r = requests.post(solr_url, data=d, headers=h)
    print(r.status_code)
    # r2 = requests.get("http://auisbigdatabox.cloudapp.net:8983/solr/admin/cores?action=RELOAD&core=project_shard1_replica1")
    # print(r2.status_code)

sc = SparkContext()
dwsent = dict(sc.textFile("/data/dictionary.tsv").map(lambda line: line.split("\t")) \
              .map(lambda line: (line[2], line[5])).collect())
shows = dict(sc.textFile("/data/shows.txt").map(lambda line: (line.lower().encode('ascii'), line)).collect())
ssc = StreamingContext(sc, 1)
ds = ssc.textFileStream("/project/rawdata")
ods = ds.window(30, 30).map(lambda line: getText(line)) \
        .map(lambda line: {"created_dt": line[0], "show_s": getShow(line[2]),
                           "sentiment_s": getLineSentiment(line[2]), "text_t": line[2]})
ods.foreachRDD(postToSolr)
ssc.start()
ssc.awaitTermination()
Example 7: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD
# $example off$
from Functions import parsePoint, Nlabels2, makePredOVO

if __name__ == "__main__":
    sc = SparkContext('local[*]', appName="PythonLogisticRegressionWithLBFGSExample")
    ssc = StreamingContext(sc, 1)
    sqlContext = SQLContext(sc)
    labels = ['Politics', 'Finance', 'Sports', 'Sci&Tech', 'Entertainment', 'Crime']
    labels_num = [[0.0, 1.0], [0.0, 2.0], [0.0, 3.0], [0.0, 4.0], [0.0, 5.0],
                  [1.0, 2.0], [1.0, 3.0], [1.0, 4.0], [1.0, 5.0],
                  [2.0, 3.0], [2.0, 4.0], [2.0, 5.0], [3.0, 4.0], [3.0, 5.0], [4.0, 5.0]]
    data = ssc.textFileStream("/training/data/dir")      # SPECIFY THE TRAINING DATA DIRECTORY HERE
    testData = ssc.textFileStream("/testing/data/dir")   # SPECIFY THE TESTING DATA DIRECTORY HERE
    data = data.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
    testData = testData.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
    # Model details
    num_features = 300
    # model_name = "Models\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin"
    # model = Word2Vec.load_word2vec_format(model_name, binary=True)
    # model.init_sims(replace=True)
    # model_name = "../Models/ModelforStreaming300format_label"  # Word2Vec Model
    # model = Word2Vec.load_word2vec_format(model_name)
    model_name = "../Models/ModelforStreaming300_label"  # Word2Vec Model
    model = Word2Vec.load(model_name)
    index2word_set = set(model.index2word)
    f = lambda j: parsePoint(j, index2word_set, model, num_features)
    parsedData = data.map(f).cache()
Example 8: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from __future__ import print_function
__author__ = 'abc'

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext("local[2]", appName="PythonStreamingHDFSWordCount")
    ssc = StreamingContext(sc, 1)
    lines = ssc.textFileStream('/Volumes/work/data/kaggle/test/')
    lines.pprint()
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
Example 9: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from blockchain import *
logFile = "logfile" # Should be some file on your system
sc = SparkContext("local[1]", "Simple App")
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
ssc = StreamingContext(sc, 1)
blocks = ssc.textFileStream("file:///Users/jeremyrubin/Desktop/sparkybitcoin/test")
# blocks = sc.wholeTextFiles("blocks")
blockJSONs = blocks.map(lambda c: Block.of_string(c, 0))
# blocks.pprint()
n_txs_by_n_inputs = (
blockJSONs.flatMap(lambda f: f.txns.to_list()).map(lambda x: (x.tx_in_count, 1)).reduceByKey(lambda x, y: x + y)
)
n_txs_by_n_inputs.pprint() # saveAsTextFiles("outputs/HE", "txt")
# print n_txs_by_n_inputs.collect()
ssc.start() # Start the computation
Example 10: globals
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']
#
# main program
#
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: pubnub_dir_streaming data_dir ", file=sys.stderr)
exit(-1)
data_dir = sys.argv[1]
sc = SparkContext(appName="pubnub_dir_streaming_app")
ssc = StreamingContext(sc, 30)
devicesRDD = ssc.textFileStream(data_dir)
# Convert RDDs of the JSON DStream to DataFrame and run SQL query
def process(time, rdd):
try:
for item in rdd.collect():
insert_into_dbs(["Cassandra"], item)
except Exception as e:
print(traceback.format_exc())
#
# process each RRD on the dirver side
#
devicesRDD.foreachRDD(process)
ssc.start()
ssc.awaitTermination()
Example 11: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
# -*- coding:utf-8 -*-
from pyspark.context import SparkContext
from pyspark.streaming import StreamingContext
from pyspark import StorageLevel
sc = SparkContext("yarn-client", "Chap7-2")
ssc = StreamingContext(sc, 10)
lines = ssc.textFileStream("hdfs:///user/y_tadayasu/data/")
words = lines.flatMap(lambda line: line.split(" ")).filter(lambda x:x)
pairs = words.map(lambda word: (word, 1))
wordCounts = pairs.reduceByKey(lambda x, y: x + y)
wordCounts.pprint()
ssc.start()
ssc.awaitTermination()
Example 12: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from pyspark import SparkContext, SQLContext
from pyspark.mllib.classification import StreamingLogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.streaming import StreamingContext
sc = SparkContext('local[*]', appName="PythonLogisticRegressionWithLBFGSExample")
ssc = StreamingContext(sc, 30)
sqlContext = SQLContext(sc)
labels = ['Politics', 'Finance', 'Sports', 'Sci&Tech', 'Entertainment', 'Crime']
labels_num = [[0.0, 1.0], [0.0, 2.0], [0.0, 3.0], [0.0, 4.0], [0.0, 5.0],
[1.0, 2.0], [1.0, 3.0], [1.0, 4.0], [1.0, 5.0],
[2.0, 3.0], [2.0, 4.0], [2.0, 5.0],
[3.0, 4.0], [3.0, 5.0],
[4.0, 5.0]]
data = ssc.textFileStream("streaming_data/") # SPECIFY THE TRAINING DATA DIRECTORY HERE
testData = ssc.textFileStream("streaming_test_data/") # SPECIFY THE TESTING DATA DIRECTORY HERE
unlabelledData = ssc.textFileStream("streaming_unlabelled_data/")
signal = ssc.textFileStream("streaming_signal/")
data = data.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
testData = testData.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
unlabelledData = unlabelledData.mapPartitions(lambda x: csv.reader(x, delimiter='`', quotechar='|'))
clear = False
num_features = 300
googleModel = False # change model here
if googleModel:
    # model_name = "Models\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin"
    model_name = "Models/GoogleNews-vectors-negative300.bin"
    word2vec_model = Word2Vec.load_word2vec_format(model_name, binary=True)
    # model.init_sims(replace=True)
Example 13: SparkContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
conf = (SparkConf()
        .setMaster("local[4]")
        .setAppName("MyApp")
        .set("spark.executor.memory", "1g")
        .set('spark.local.dir', './target/tmp'))
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)
words = ssc.textFileStream("./gen/data/nums")
#print 'count', words.count()
root = words.map(lambda line: (int(line), 1))
root.saveAsTextFiles('./target/result3')   # DStream output is written with saveAsTextFiles
# root.reduceByKey(lambda a, b: a + b) \
#     .saveAsTextFiles('./target/result')
# root.reduceByKey(lambda a, b: a * b) \
#     .saveAsTextFiles('./target/result2')
ssc.stop()
Example 14: VectorManager
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
if __name__ == "__main__":
vm = VectorManager(None)
vm.load()
dc = DataClassifier(None, None)
dc.loadModel()
sc = SparkContext(appName="PythonStreaming")
ssc = StreamingContext(sc, 30)
streamPath = 'hdfs://157.26.83.52/user/wdroz/stream'
#streamPath = '/tmp/stream'
'''
Data each line is {text : 'bla bla bla', source : 'bla bla', date : '2015-30-03 08:53:00'}
'''
lines = ssc.textFileStream(streamPath)
dico = lines.map(lambda x: literal_eval(x))
lema = dico.map(lambda x: (x, lemmatize(TextBlob(x['text']))))
vect = lema.mapValues(lambda x: vm.processList(x))
pred = vect.mapValues(lambda x: dc.predict(x))
pred.foreachRDD(printPrediction)
ssc.start()
ssc.awaitTermination()
Example 15: classify
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Alternatively: from pyspark.streaming.StreamingContext import textFileStream [as alias]
def classify(x):
    sample = x.take(x.count())
    for i in sample:
        answer = model.predict(i)   # predict each collected row
        print("===============================")
        print(answer)
if __name__ == "__main__":
sc = SparkContext("local[2]", "streaming_gbts")
# 加载GBTs模型
model = parse_xiaodai_streaming.load_model(GradientBoostedTreesModel, path="$SPARK_HOME/gbts_xiaodai_1")
# create a local streamingcontext with two
# working thread and batch interval of 1 second
ssc = StreamingContext(sc, 1)
data_process = parse_xiaodai_streaming.data_process
file_stream = ssc.textFileStream("/data/mllib/streaming/")
col_rm = [0, 1, 2, 4, 25, 26] + range(19, 24)
testData = file_stream.map(lambda line: data_process(line, col_rm))
testData.pprint()
testData.foreachRDD(classify)
# print parse_xiaodai_streaming.evaluat_model(model, testData)
ssc.start()
ssc.awaitTermination(100)