This article collects typical usage examples of the pyspark.streaming.StreamingContext.socketTextStream method in Python. If you have been wondering what StreamingContext.socketTextStream does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of its containing class, pyspark.streaming.StreamingContext.
The following 15 code examples of StreamingContext.socketTextStream are shown, ordered by popularity.
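For orientation before the individual examples: most of them follow the same pattern of creating a SparkContext and a StreamingContext with a batch interval, opening a TCP text stream with socketTextStream, defining DStream transformations, then starting the context and awaiting termination. Below is a minimal, self-contained sketch of that pattern; the host, port, and word-count logic are illustrative and not taken from any particular example.

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "SocketTextStreamSketch")
ssc = StreamingContext(sc, 1)  # 1-second batch interval

# Read newline-delimited text from a TCP socket (e.g. fed by `nc -lk 9999`)
lines = ssc.socketTextStream("localhost", 9999)

# Count words in each batch and print the first few results
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
counts.pprint()

ssc.start()
ssc.awaitTermination()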
Example 1: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket to attach for streaming text data '
                        '(default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    args = parser.parse_args()

    mongo_url = args.mongo
    rest_url = args.rest

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url, rest_url))

    ssc.start()
    ssc.awaitTermination()
Example 2: createStreamingContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def createStreamingContext():
    # Create a StreamingContext on the cluster master with a batch interval of 2 seconds
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20*60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20*60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
Example 3: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    sym_dict = {}
    conf = SparkConf().setAppName("symbol stream")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, .1)
    lines = ssc.socketTextStream("localhost", 1337)

    def print_now():
        print sym_dict

    def predict(prices):
        print prices

    def add_to_dict(line):
        symbol, price, volume = line.split(',')
        if symbol in sym_dict:
            print 'made it here'
            sym_dict[symbol][0].append(price)
            sym_dict[symbol][1].append(volume)
            # keep only the most recent 10 prices/volumes per symbol
            if len(sym_dict[symbol][0]) > 10:
                sym_dict[symbol][0].pop(0)
                sym_dict[symbol][1].pop(0)
                predict(sym_dict[symbol][0])
        else:
            sym_dict[symbol] = [[price], [volume]]

    #test = lines.map(lambda line: json.dumps(line))
    test = lines.map(lambda line: line)
    test.pprint()

    ssc.start()
    ssc.awaitTermination()
Example 4: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def createContext(host, port, outputPath):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the checkpoint
    print "Creating new context"
    if os.path.exists(outputPath):
        os.remove(outputPath)
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 120)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, port)
    print '\n\n\nconnectionMade\n\n\n'
    addresses = lines.map(splitLine)
    transcationsum = addresses.map(lambda x: (x[0], (1, x[1]))).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    def echo(time, rdd):
        counts = "Counts at time %s %s" % (time, rdd.collect())
        print counts
        print "Appending to " + os.path.abspath(outputPath)
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    transcationsum.foreachRDD(echo)
    return ssc
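A createContext function of this shape is normally passed to StreamingContext.getOrCreate, which rebuilds the context from a checkpoint directory on restart (which is why the comment above notes that the print only appears when a new context is actually created). A minimal sketch of that pairing, assuming a checkpoint path of your choosing; the host, port, and output path arguments are illustrative:

from pyspark.streaming import StreamingContext

checkpoint_dir = "/tmp/checkpoint"  # assumed path; any durable location works

def setup():
    ssc = createContext("localhost", 9999, "/tmp/output.txt")  # illustrative arguments
    ssc.checkpoint(checkpoint_dir)  # enable checkpointing so recovery is possible
    return ssc

# Restores the context from the checkpoint if one exists, otherwise calls setup()
ssc = StreamingContext.getOrCreate(checkpoint_dir, setup)
ssc.start()
ssc.awaitTermination()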
Example 5: start_spark_streaming
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def start_spark_streaming():
    question_db = QuestionDatabase(QB_QUESTION_DB)
    features = {name: instantiate_feature(name, question_db) for name in FEATURE_NAMES}

    sc = create_sc()
    b_features = sc.broadcast(features)
    ssc = StreamingContext(sc, 5)

    ssc.socketTextStream('localhost', 9999) \
        .repartition(QB_STREAMING_CORES - 1) \
        .flatMap(lambda line: generate_guesses(line, b_features)) \
        .map(lambda sg: evaluate_features(sg, b_features)) \
        .foreachRDD(score_and_save)

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
Example 6: sparkTask
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def sparkTask():
    from textblob import TextBlob
    import re
    from pyspark import SparkContext
    from pyspark.streaming import StreamingContext

    sc = SparkContext()
    ssc = StreamingContext(sc, 1)

    quotes = ssc.socketTextStream("localhost", 9999)
    dataSentencesPolarity = quotes.map(lambda x: TextBlob(re.sub('[^A-Za-z0-9 \.\']+', '', x))) \
                                  .map(lambda y: (str(y.upper())[:60], y.sentiment.polarity))
    dataSentencesPolarity.pprint()

    ssc.start()               # Start the computation
    ssc.awaitTermination(20)  # Wait for the computation to terminate
Example 7: __init__
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
class xStreamProcessor:
    ip = socket.gethostbyname(socket.gethostname())
    port = 9999
    dstream = None
    sc = None
    ssc = None

    #def __init__(self, ip=None, port=None, spark_master='spark://localhost:7077'):
    def __init__(self, ip=None, port=None, spark_master='mesos://10.0.2.85:5050'):
        if ip is not None:
            self.ip = ip
        if port is not None:
            self.port = port

        self.sc = SparkContext(master=spark_master, appName='StreamProcessor')
        self.ssc = StreamingContext(self.sc, 1)
        #self.ssc.checkpoint(directory=None)

        hiveContext = HiveContext(self.sc)
        hiveContext.sql('DROP TABLE IF EXISTS default.tweet_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.tweet_stream (ip STRING, port STRING, date_time STRING, user STRING, msg STRING)')
        hiveContext.sql('DROP TABLE IF EXISTS default.email_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_stream (ip STRING, port STRING, date_time STRING, \
            fr STRING, to STRING, subject STRING, content STRING, subject_sentiment INT, content_sentiment INT, \
            subject_power INT, content_power INT, subject_topic INT, content_topic INT, fraud_score DOUBLE)')
        hiveContext.sql('DROP TABLE IF EXISTS default.email_graph')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.email_graph (fr STRING, to STRING, dt STRING)')
        hiveContext.sql('DROP TABLE IF EXISTS default.trans_stream')
        hiveContext.sql('CREATE TABLE IF NOT EXISTS default.trans_stream (ip STRING, port STRING, date_time STRING, user STRING, amount DOUBLE, \
            big_trans INT, is_in_odd_day INT, is_at_odd_time INT)')

        self.dstream = self.ssc.socketTextStream(self.ip, self.port)
        self.process_stream()

        self.ssc.start()
        self.ssc.awaitTermination()

    def process_stream(self):
        parts = self.dstream.flatMap(lambda line: line.split("|"))
        words = parts.map(lambda p: p[3])
        pairs = words.map(lambda word: (word, 1))
        wordCounts = pairs.reduceByKey(lambda x, y: x + y)

        # Print the first ten elements of each RDD generated in this DStream to the console
        wordCounts.pprint()
Example 8: functionToCreateContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def functionToCreateContext():
    sc = SparkContext("local[*]", "streaming_part")
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 5)

    data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999)
    data_from_camera_mechine = ssc.socketTextStream("localhost", 9998)

    # meat
    data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler)
    data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler)

    ssc.checkpoint(checkpointDirectory)  # set checkpoint directory
    return ssc
Example 9: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    master = 'local[2]'
    app_name = 'reduce_demo1'
    # print(range(0, 3))

    sc = SparkContext(master, app_name)
    ssc = StreamingContext(sc, 15)

    host = 'localhost'
    port = 9999
    stream = ssc.socketTextStream(host, port)
    stream.foreachRDD(fun_union_in_dstream_foreachRDD)

    ssc.start()
    ssc.awaitTermination()
Example 10: main
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def main():
    parser = argparse.ArgumentParser(
        description='process some log messages, storing them and signaling '
                    'a rest server')
    parser.add_argument('--mongo', help='the mongodb url',
                        required=True)
    parser.add_argument('--rest', help='the rest endpoint to signal',
                        required=True)
    parser.add_argument('--port', help='the port to receive from '
                        '(default: 1984)',
                        default=1984, type=int)
    parser.add_argument('--appname', help='the name of the spark application '
                        '(default: SparkharaLogCounter)',
                        default='SparkharaLogCounter')
    parser.add_argument('--master',
                        help='the master url for the spark cluster')
    parser.add_argument('--socket',
                        help='the socket ip address to attach for streaming '
                             'text data (default: caravan-pathfinder)',
                        default='caravan-pathfinder')
    parser.add_argument('--model',
                        help='the serialized model to use',
                        default='model.json')
    args = parser.parse_args()

    mongo_url = args.mongo
    rest_url = args.rest
    model = args.model

    sconf = SparkConf().setAppName(args.appname)
    if args.master:
        sconf.setMaster(args.master)
    sc = SparkContext(conf=sconf)
    ssc = StreamingContext(sc, 1)

    somv = fromJSON(model)
    som = sc.broadcast(somv)

    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)

    lines = ssc.socketTextStream(args.socket, args.port)
    lines.foreachRDD(lambda rdd: process_generic(rdd, mongo_url,
                                                 rest_url, som))

    ssc.start()
    ssc.awaitTermination()
Example 11: consumer
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def consumer():
    def process(time, rdd):
        global words
        words += Counter(dict(rdd.collect()))

    sc = SparkContext(appName='graaftel')
    ssc = StreamingContext(sc, 5)

    lines = ssc.socketTextStream(os.getenv('PRODUCER_SERVICE_HOST', 'localhost'),
                                 int(os.getenv('PRODUCER_SERVICE_PORT', 8080)))
    counts = lines.flatMap(lambda line: line.encode('ascii', 'ignore').lower().split()) \
                  .map(lambda word: word.translate(None, string.punctuation)) \
                  .filter(lambda word: word not in stop_words) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(add)
    counts.foreachRDD(process)

    ssc.start()
    ssc.awaitTermination()
Example 12: create_context
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def create_context(host, port):
    print "Creating new context"
    sc = SparkContext(appName="StreamingWordCount")
    ssc = StreamingContext(sc, 2)
    lines = ssc.socketTextStream(host, port)

    def countWords(newValues, lastSum):
        if lastSum is None:
            lastSum = 0
        return sum(newValues, lastSum)

    word_counts = lines.flatMap(lambda line: line.split(" "))\
                       .filter(lambda w: w.startswith("#"))\
                       .map(lambda word: (word, 1))\
                       .updateStateByKey(countWords)

    word_counts.pprint()
    return ssc
Example 13: createContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def createContext(host, port, outputPath):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the checkpoint
    print("Creating new context")
    if os.path.exists(outputPath):
        os.remove(outputPath)
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, port)
    words = lines.flatMap(lambda line: line.split(" "))
    wordCounts = words.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)

    def echo(time, rdd):
        # Get or register the blacklist Broadcast
        blacklist = getWordBlacklist(rdd.context)
        # Get or register the droppedWordsCounter Accumulator
        droppedWordsCounter = getDroppedWordsCounter(rdd.context)

        # Use blacklist to drop words and use droppedWordsCounter to count them
        def filterFunc(wordCount):
            if wordCount[0] in blacklist.value:
                droppedWordsCounter.add(wordCount[1])
                return False
            else:
                return True

        counts = "Counts at time %s %s" % (time, rdd.filter(filterFunc).collect())
        print(counts)
        print("Dropped %d word(s) totally" % droppedWordsCounter.value)
        print("Appending to " + os.path.abspath(outputPath))
        with open(outputPath, 'a') as f:
            f.write(counts + "\n")

    wordCounts.foreachRDD(echo)
    return ssc
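getWordBlacklist and getDroppedWordsCounter are not part of this excerpt. In the upstream Spark recoverable-word-count example this snippet resembles, they lazily create singleton Broadcast and Accumulator instances so that echo can reuse them across batches; a rough sketch under that assumption (the blacklist contents are illustrative):

def getWordBlacklist(sparkContext):
    # Create the broadcast blacklist once and reuse it on later batches
    if 'wordBlacklist' not in globals():
        globals()['wordBlacklist'] = sparkContext.broadcast(["a", "b", "c"])
    return globals()['wordBlacklist']

def getDroppedWordsCounter(sparkContext):
    # Create the dropped-word accumulator once and reuse it on later batches
    if 'droppedWordsCounter' not in globals():
        globals()['droppedWordsCounter'] = sparkContext.accumulator(0)
    return globals()['droppedWordsCounter']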
Example 14: StreamingContext
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
ssc = StreamingContext(sc, 5)

def add_tuples(a, b):
    return tuple(sum(p) for p in zip(a, b))

def calc_values(sumx, sumy, sumxy, sumxsquare, n):
    xmean = sumx / n
    ymean = sumy / n
    xymean = sumxy / n
    xsquaremean = sumxsquare / n
    beta = (xymean - (xmean * ymean)) / (xsquaremean - (xmean * xmean))
    alpha = ymean - (beta * xmean)
    return (alpha, beta)

def stream_function(points):
    points_float = points.map(lambda (x, y): (float(x), float(y)))
    points_mean = points_float.map(lambda (x, y): (x, y, (x * y), (x * x), 1)).reduce(add_tuples)
    meanvalues = calc_values(*points_mean)
    alpha = meanvalues[0]
    beta = meanvalues[1]
    rdd = sc.parallelize([(alpha, beta)], numSlices=1)
    rdd.saveAsTextFile(output + '/' + datetime.datetime.now().isoformat().replace(':', '-'))

lines = ssc.socketTextStream("cmpt732.csil.sfu.ca", inputs)
points = lines.map(lambda line: line.split())
points.foreachRDD(stream_function)

ssc.start()
ssc.awaitTermination(timeout=300)
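For reference, calc_values above is the closed-form least-squares fit for simple linear regression over each batch: beta = (mean(x*y) - mean(x)*mean(y)) / (mean(x^2) - mean(x)^2) and alpha = mean(y) - beta*mean(x), computed from the running sums (sum x, sum y, sum xy, sum x^2, n) that add_tuples accumulates in the reduce step.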
Example 15: globals
# Required import: from pyspark.streaming import StreamingContext [as alias]
# Or: from pyspark.streaming.StreamingContext import socketTextStream [as alias]
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: sql_network_wordcount.py <hostname> <port> ", file=sys.stderr)
        exit(-1)
    host, port = sys.argv[1:]
    sc = SparkContext(appName="PythonSqlNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    # Create a socket stream on target ip:port and count the
    # words in input stream of \n delimited text (eg. generated by 'nc')
    lines = ssc.socketTextStream(host, int(port))
    words = lines.flatMap(lambda line: line.split(" "))

    # Convert RDDs of the words DStream to DataFrame and run SQL query
    def process(time, rdd):
        print("========= %s =========" % str(time))
        try:
            # Get the singleton instance of SQLContext
            sqlContext = getSqlContextInstance(rdd.context)

            # Convert RDD[String] to RDD[Row] to DataFrame
            rowRdd = rdd.map(lambda w: Row(word=w))
            wordsDataFrame = sqlContext.createDataFrame(rowRdd)

            # Register as table