This article collects typical usage examples of the Python method pyspark.sql.SQLContext.sql. If you are wondering how SQLContext.sql is actually used, what it does, or what real code that calls it looks like, the curated examples here should help. You can also browse further usage examples for the containing class, pyspark.sql.SQLContext.
The sections below show 15 code examples of SQLContext.sql, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
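Every example below follows the same basic pattern: obtain a DataFrame, register it as a temporary table, and query it with SQLContext.sql. The following is a minimal sketch of that pattern; the file name and column names are placeholders, not taken from any of the examples:
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext(appName="sqlcontext-sql-demo")
sqlContext = SQLContext(sc)
df = sqlContext.read.json("people.json")   # any DataFrame source works here
df.registerTempTable("people")             # expose the DataFrame to SQL
adults = sqlContext.sql("SELECT name, age FROM people WHERE age >= 18")
adults.show()
sc.stop()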
Example 1: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
reviews_parquet = sys.argv[1]
metadata_parquet = sys.argv[2]
users_ascores_file = sys.argv[3]
products_ascores_file = sys.argv[4]
conf = SparkConf().setAppName('Amazon Cassandra Injector').setMaster("local").set("spark.cassandra.connection.host", "localhost")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
sqlContext.read.parquet(reviews_parquet).registerTempTable('amazon_reviews')
reviews = sqlContext.sql("""SELECT * FROM amazon_reviews""").rdd.cache()
reviews_by_reviewer = reviews.map(process_review).map(lambda j: (j["reviewerid"], j))
users_ascores = sc.textFile(users_ascores_file).map(ast.literal_eval).map(lambda (r_id, score, histo): (r_id, (score, histo)))
reviews_joined = reviews_by_reviewer.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_review(j, score))
# join with meth2_users_ascores. join on reviewerid -> ascore is reviewer ascore
reviews_joined.saveToCassandra("amzdb", "reviews")
# reviewers need their alternative score
reviewers = reviews.map(process_reviewer).map(lambda j: (j["reviewerid"], j))
# join with meth2_user_ascores. Get ascore and overall_histogram
reviewers_joined = reviewers.join(users_ascores).map(lambda (reviewerid, (j, (score, histo))): fillin_reviewer(j, score, histo))
reviewers_joined.saveToCassandra("amzdb", "reviewers")
# products need their overall score/histogram, and adjusted score/histogram
sqlContext.read.parquet(metadata_parquet).registerTempTable('amazon_metadata')
products = sqlContext.sql("""SELECT * FROM amazon_metadata""").rdd.map(process_product).map(lambda j: (j["asin"], j))
# join with meth2_product_ascores
products_ascores = sc.textFile(products_ascores_file).map(ast.literal_eval).map(lambda (asin, o_s, a_s, o_h, a_h, n): (asin, (o_s, o_h, a_s, a_h)))
products_joined = products.join(products_ascores).map(lambda (asin, (j, (o_s, o_h, a_s, a_h))): fillin_product(j, o_s, o_h, a_s, a_h))
products_joined.saveToCassandra("amzdb", "products")
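Note that the tuple-unpacking lambdas in Example 1 (for instance lambda (reviewerid, (j, (score, histo))): ...) are Python 2 only syntax. A sketch of how the users join could be rewritten for Python 3, assuming the same fillin_review helper:
# Python 3 compatible rewrite of the users join (a sketch, not the original code):
users_ascores = (sc.textFile(users_ascores_file)
                   .map(ast.literal_eval)
                   .map(lambda t: (t[0], (t[1], t[2]))))   # (reviewerid, (score, histo))
reviews_joined = (reviews_by_reviewer.join(users_ascores)
                    .map(lambda kv: fillin_review(kv[1][0], kv[1][1][0])))   # fillin_review(j, score)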
Example 2: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(sc):
path = "events"
#text_file = sc.textFile(path)
sqlContext = SQLContext(sc)
events = sqlContext.jsonFile(path)
events = events.select(events["events.event"]).flatMap(lambda p: p.event)
events = events.map(lambda p: Row(
id=p.id,\
title=p.title, \
lat=p.latitude, \
long=p.longitude, \
postal_code=p.postal_code, \
start_time=datetime.strptime(p.start_time, "%Y-%m-%d %H:%M:%S"), \
stop_time=p.stop_time))
events_df = sqlContext.createDataFrame(events)
events_df.registerTempTable("events")
sqlContext.registerFunction("to_hour", lambda x: x.hour)
sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
e = sqlContext.sql("""select title, str_date(start_time) as event_date,
to_hour(start_time) as hour, postal_code from events
where postal_code is not null and start_time is not null""")
e.registerTempTable("events_filtered")  # register the filtered rows so the next query can read from events_filtered
events_grouped = sqlContext.sql("""select event_date, hour, postal_code,
count(*) from events_filtered group by event_date,hour,postal_code order by postal_code,hour""")
grouped_csv = events_grouped.map(toCSV)
grouped_csv.saveAsTextFile('events_cluster')
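The toCSV helper used above (and again in Example 5) is not shown in the excerpt. A minimal hypothetical version, assuming plain comma-joined fields with no quoting or escaping, might look like this:
# Hypothetical toCSV helper; the real one may quote or escape fields differently
def toCSV(row):
    return ",".join(str(field) for field in row)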
Example 3: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
sqlContext = SQLContext(sc)
data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(","))
rows = data.filter(lambda x: x[0] != 'SYMBOL')
df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip())))
symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
schemaSymbols = sqlContext.inferSchema(symbols)
schemaSymbols.registerTempTable("symbols")
trades = sqlContext.sql("""SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from
symbols group by symbol, time""")
trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
schemaTrades = sqlContext.inferSchema(trades)
schemaTrades.registerTempTable("trades")
# remove limit after test
syms = sqlContext.sql("SELECT distinct symbol from trades")
syms = syms.collect()
df_dict = {}
print type(syms)
for sym in syms:
sym = sym.symbol.strip()
print sym
sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))
sym_data = sym_data.collect()
print len(sym_data)
sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])
# Predictive model did not like original volume values, so use rescaled value
sym_df['volume10k'] = np.round(sym_df['volume'] / 10000, 3)
for i in range(1,11):
sym_df['price_t-'+str(i)] = sym_df['price'].shift(i)
for i in range(1,11):
#sym_df['volume_t-'+str(i)] = sym_df['volume'].shift(i)
sym_df['volume10k_t-'+str(i)] = sym_df['volume10k'].shift(i)
# add labels for price and volume
sym_df['price_label'] = sym_df['price'].shift(-1)
sym_df['volume_label'] = sym_df['volume'].shift(-1)
sym_df['price_dir_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
sym_df['volume_dir_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)
sym_df = sym_df.dropna()
df_dict[sym] = sym_df
print sym_df
train(sym,sym_df)
# print for testing
print len(df_dict)
print df_dict.keys()
print type(df_dict[sym])
Example 4: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
conf = SparkConf().setAppName('symbols').set("spark.storage.blockManagerSlaveTimeoutMs", 60000)
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
data = sc.textFile("hdfs://spark1:9000/user/convert_out/ct_20110218.csv", 200).map(lambda line: line.split(",")).cache()
rows = data.filter(lambda x: x[0] != 'SYMBOL')
df = rows.map(lambda p: (p[0].strip(), transform_time(p[1].strip(), p[2].strip()), float(p[3].strip()), float(p[4].strip())))
#df = df.filter(lambda x: x[1] != 0)
symbols = df.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
schemaSymbols = sqlContext.inferSchema(symbols)
schemaSymbols.registerTempTable("symbols")
trades = sqlContext.sql("""SELECT symbol, time, sum(price*volume)/sum(volume) as price, sum(volume) as volume from
symbols group by symbol, time""")
trades = trades.map(lambda x: Row(symbol=x[0], time=x[1], price=x[2], volume=x[3]))
schemaTrades = sqlContext.inferSchema(trades)
schemaTrades.registerTempTable("trades")
# remove limit after test
syms = sqlContext.sql("SELECT distinct symbol from trades")
syms = syms.collect()
df_dict = {}
print type(syms)
for sym in syms:
sym = sym.symbol.strip()
print sym
sym_data = sqlContext.sql("SELECT symbol, time, price, volume FROM trades WHERE symbol = '{}' ORDER BY symbol, time".format(sym))
sym_data = sym_data.collect()
print len(sym_data)
sym_df = pd.DataFrame(sym_data, columns=['symbol', 'time', 'price', 'volume'])
for i in range(1,11):
sym_df['price_t-'+str(i)] = sym_df['price'].shift(i)
for i in range(1,11):
sym_df['volume_t-'+str(i)] = sym_df['volume'].shift(i)
# add labels for price and volume
sym_df['price_label'] = sym_df['price'].shift(-1)
sym_df['volume_label'] = sym_df['volume'].shift(-1)
sym_df['price_label'] = np.where(sym_df.price_label > sym_df.price, 1, 0)
sym_df['volume_label'] = np.where(sym_df.volume_label > sym_df.volume, 1, 0)
sym_df = sym_df.dropna()
df_dict[sym] = sym_df
print sym_df
# print for testing
print len(df_dict)
print df_dict.keys()
print type(df_dict[sym])
sc.stop()
Example 5: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(sc):
sqlContext = SQLContext(sc)
taxiFile = sc.textFile("taxizip/taxizipaa.csv")
header = taxiFile.first()
taxiHeader = taxiFile.filter(lambda l: "vendor_id" in l)
taxiNoHeader = taxiFile.subtract(taxiHeader)
taxi_temp = taxiNoHeader.map(lambda k: k.split(","))
taxi_rdd = taxi_temp.map(lambda p: Row(vendor_id=p[0],
pickup_datetime=datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"),
dropoff_datetime=datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S"),
passenger_count=int(p[3] if p[3]!="" else 0),
trip_distance=float(p[4] if p[4]!="" else 0),
pickup_longitude=float(p[5] if p[5]!="" else 0) ,
pickup_latitude=float(p[6] if p[6]!="" else 0),
rate_code=p[7],
store_and_fwd_flag=p[8],
dropoff_longitude=float(p[9] if p[9]!="" else 0),
dropoff_latitude=float(p[10] if p[10]!="" else 0),
payment_type=p[11],
fare_amount=float(p[12] if p[12]!="" else 0),
surcharge=float(p[13] if p[13]!="" else 0),
mta_tax=float(p[14] if p[14]!="" else 0),
tip_amount=float(p[15] if p[15]!="" else 0),
tolls_amount=float(p[16] if p[16]!="" else 0),
total_amount=float(p[17] if p[17]!="" else 0),
zipcode=p[18]))
taxi_df = sqlContext.createDataFrame(taxi_rdd)
taxi_df.registerTempTable("taxi")
sqlContext.registerFunction("to_hour", lambda x: x.hour)
sqlContext.registerFunction("str_date", lambda x: str(x.month) + "-" + str(x.day) + "-" + str(x.year))
th = sqlContext.sql("SELECT to_hour(dropoff_datetime) as hour, dropoff_datetime as trip_date, dropoff_longitude as lng,dropoff_latitude as lat,zipcode FROM taxi where dropoff_longitude!=0 and dropoff_latitude!=0")
th.registerTempTable("taxi_hr")
sqlContext.cacheTable("taxi_hr")
grouped_taxi = sqlContext.sql("select hour, zipcode,str_date(trip_date), count(*) as c from taxi_hr group by hour,zipcode,str_date(trip_date) order by c desc")
grouped_taxi.show(100)
#save this intermediate result to a file as csv
grouped_csv = grouped_taxi.map(toCSV)
grouped_csv.saveAsTextFile('results')
grouped_taxi.registerTempTable("taxi_grouped")
sqlContext.cacheTable("taxi_grouped")
Example 6: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(sc):
sqlContext = SQLContext(sc)
tasteProfileRdd = sc.textFile("userTaste/*")
songRdd = sc.textFile("songsDict/*")
# Load a text file and convert each line to a Row.
tasteProfile = tasteProfileRdd.filter(lambda l:len(l) > 0)
parsedSplits = tasteProfile.map(lambda l: l.split('\t'))
userTaste = parsedSplits.map(lambda p: Row(userId=p[0], songId=p[1], playCount=p[2]))
individualSong = songRdd.map(lambda l:l.split('|'))
songData = individualSong.map(lambda s: Row(songId=s[0],featureSet=s[1]))
# Infer the schema, and register the DataFrame as a table.
schemaUserTaste = sqlContext.inferSchema(userTaste)
schemaUserTaste.registerTempTable("userTaste")
schemaSongData = sqlContext.inferSchema(songData)
schemaSongData.registerTempTable("songData")
test2 = sqlContext.sql("select * from songData limit 5")
songIds = test2.map(lambda p: "songIds: " + p.songId)  # was s.songId, which is undefined in this scope
#test1 = sqlContext.sql("SELECT distinct * FROM userTaste limit 5")
#songIds = test1.map(lambda p: "songIds: " + p.songId)
for i in songIds.collect():
print i
Example 7: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(argv):
Conf = (SparkConf().setAppName("SimpleGraph"))
sc = SparkContext(conf=Conf)
sqlContext = SQLContext(sc)
dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+".parquet"
sqlContext.read.parquet(dirPath).registerTempTable("comments")  # registerTempTable returns None, so the result is not assigned
# This is where the magic happens
# SQL self join to join users who have interacted with one another
df = sqlContext.sql("""
SELECT t1.subreddit as Subreddit,
t1.id as OrigId , t2.id as RespId,
t1.author AS OrigAuth, t2.author AS RespAuth,
t1.score AS OrigScore, t2.score AS RespScore,
t1.ups AS OrigUps, t2.ups AS RespUps,
t1.downs AS OrigDowns, t2.downs AS RespDowns,
t1.controversiality AS OrigControv, t2.controversiality AS RespControv
FROM comments t1 INNER JOIN comments t2 ON CONCAT("t1_",t1.id) = t2.parent_id where t1.author!='[deleted]' and t2.author!='[deleted]'
""")
# Write the result back to Parquet: it compresses the data and is fast to read back
df.write.parquet("hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/data/"+argv[1]+"-selfjoin.parquet")
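The same self-join can also be expressed with the DataFrame API instead of raw SQL. The following is only a rough sketch of the core join, assuming the same comments table; it is not part of the original project:
# Rough DataFrame-API equivalent of the SQL self-join above (sketch only)
from pyspark.sql.functions import concat, lit, col
t1 = sqlContext.table("comments").alias("t1")
t2 = sqlContext.table("comments").alias("t2")
pairs = (t1.join(t2, concat(lit("t1_"), col("t1.id")) == col("t2.parent_id"))
           .where((col("t1.author") != "[deleted]") & (col("t2.author") != "[deleted]"))
           .select(col("t1.subreddit").alias("Subreddit"),
                   col("t1.author").alias("OrigAuth"),
                   col("t2.author").alias("RespAuth")))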
Example 8: Spark_MapReduce_Parents
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def Spark_MapReduce_Parents(keyword, tokensofprevlevel, graphcache):
#tokensofprevlevelkeyword=tokensofprevlevel
#tokensofprevlevelkeyword.append(keyword)
md5hashparents = hashlib.md5(keyword).hexdigest()
#md5hashparents = keyword
md5hashparents = md5hashparents + "$parents"
picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
asfer_pickle_string_dump(keyword,picklef_keyword)
picklef_keyword.close()
cachevalue=graphcache.get(md5hashparents)
if cachevalue:
print "Spark_MapReduce_Parents(): hash = ", md5hashparents, "; returning from cache"
return cachevalue
else:
#picklelock.acquire()
spcon = SparkContext("local[2]","Spark_MapReduce_Parents")
#picklef_keyword=open("RecursiveGlossOverlap_MapReduce_Parents_Persisted.txt","w")
#asfer_pickle_string_dump(keyword,picklef_keyword)
#picklef_keyword.close()
paralleldata = spcon.parallelize(tokensofprevlevel).cache()
#k=paralleldata.map(lambda keyword: mapFunction_Parents(keyword,tokensofprevlevel)).reduceByKey(reduceFunction_Parents)
k=paralleldata.map(mapFunction_Parents).reduceByKey(reduceFunction_Parents)
sqlContext=SQLContext(spcon)
parents_schema=sqlContext.createDataFrame(k.collect())
parents_schema.registerTempTable("Interview_RecursiveGlossOverlap_Parents")
query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap_Parents")
dict_query_results=dict(query_results.collect())
#print "Spark_MapReduce_Parents() - SparkSQL DataFrame query results:"
#picklelock.release()
graphcache.set(md5hashparents,dict_query_results[1])
spcon.stop()
print "graphcache_mapreduce_parents updated:", graphcache
return dict_query_results[1]
Author: shrinivaasanka, Project: asfer-github-code, Lines: 37, Source: InterviewAlgorithmWithIntrinisicMerit_SparkMapReducer.py
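Since the SQL query in this example only converts the reduced (key, value) pairs back into a Python dict, the same result could be obtained straight from the RDD without registering a table. This one-liner is a simplification, not the author's code:
# Equivalent to dict(query_results.collect()) without going through SparkSQL
dict_query_results = k.collectAsMap()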
Example 9: log_mapreducer
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def log_mapreducer(logfilename, pattern, filt="None"):
spcon=SparkContext()
if filt == "None":
input=open(logfilename,'r')
paralleldata=spcon.parallelize(input.readlines())
patternlines=paralleldata.filter(lambda patternline: pattern in patternline)
print "pattern lines",patternlines.collect()
matches=patternlines.map(mapFunction).reduceByKey(reduceFunction)
else:
input=spcon.textFile(logfilename)
matches=input.flatMap(lambda line:line.split()).filter(lambda line: filt in line).map(mapFunction).reduceByKey(reduceFunction)
matches_collected=matches.collect()
print "matches_collected:",matches_collected
if len(matches_collected) > 0:
sqlContext=SQLContext(spcon)
bytes_stream_schema=sqlContext.createDataFrame(matches_collected)
bytes_stream_schema.registerTempTable("USBWWAN_bytes_stream")
query_results=sqlContext.sql("SELECT * FROM USBWWAN_bytes_stream")
dict_query_results=dict(query_results.collect())
print "----------------------------------------------------------------------------------"
print "log_mapreducer(): pattern [",pattern,"] in [",logfilename,"] for filter [",filt,"]"
print "----------------------------------------------------------------------------------"
dict_matches=dict(matches_collected)
sorted_dict_matches = sorted(dict_matches.items(),key=operator.itemgetter(1), reverse=True)
print "pattern matching lines:",sorted_dict_matches
print "----------------------------------------------------------------------------------"
print "SparkSQL DataFrame query results:"
print "----------------------------------------------------------------------------------"
pprint.pprint(dict_query_results)
print "----------------------------------------------------------------------------------"
print "Cardinality of Stream Dataset:"
print "----------------------------------------------------------------------------------"
print len(dict_query_results)
spcon.stop()
return sorted_dict_matches
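mapFunction and reduceFunction are defined elsewhere in the source file. For a per-line count they would typically look like the following; this is an assumption about their shape, not the project's actual definitions:
# Hypothetical word-count style map/reduce helpers assumed by Example 9
def mapFunction(line):
    return (line, 1)

def reduceFunction(a, b):
    return a + b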
Example 10: Spark_MapReduce
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def Spark_MapReduce(level, wordsatthislevel, graphcache):
freqterms1_local=wordsatthislevel
md5hash = hashlib.md5(",".join(wordsatthislevel)).hexdigest()
#md5hash = ",".join(wordsatthislevel)
cachevalue=graphcache.get(md5hash)
if cachevalue:
print "Spark_MapReduce(): hash = ", md5hash, "; returning from cache"
return cachevalue
else:
spcon=SparkContext("local[2]","Spark_MapReduce")
print "Spark_MapReduce(): wordsatthislevel:",wordsatthislevel
paralleldata=spcon.parallelize(wordsatthislevel).cache()
#k=paralleldata.map(lambda wordsatthislevel: mapFunction(wordsatthislevel)).reduceByKey(reduceFunction)
k=paralleldata.map(mapFunction2).reduceByKey(reduceFunction)
#k=paralleldata.map(mapFunction).reduceByKey(reduceFunction)
#dict_k=k.collect()
#s = sorted(dict_k.items(),key=operator.itemgetter(1), reverse=True)
#print "Spark MapReduce results:"
#print s
############################
sqlContext=SQLContext(spcon)
recursiveglossoverlap_schema=sqlContext.createDataFrame(k.collect())
recursiveglossoverlap_schema.registerTempTable("Interview_RecursiveGlossOverlap")
query_results=sqlContext.sql("SELECT * FROM Interview_RecursiveGlossOverlap")
dict_query_results=dict(query_results.collect())
#print "Spark_MapReduce() - SparkSQL DataFrame query results:"
#print dict_query_results[1]
graphcache.set(md5hash, dict_query_results[1])
print "graphcache_mapreduce updated:", graphcache
spcon.stop()
return dict_query_results[1]
Author: shrinivaasanka, Project: asfer-github-code, Lines: 34, Source: InterviewAlgorithmWithIntrinisicMerit_SparkMapReducer.py
Example 11: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main():
# Setting the cluster configuration parameters
conf = SparkConf()
conf.setMaster("spark://localhost:7077")
conf.setAppName("Tweet App")
conf.set("spark.executor.memory", "3g")
conf.set("spark.driver.memory", "4g")
# Creating a Spark Context with conf file
sc = SparkContext(conf=conf)
# Creating and SQL context to perform SQL queries
sqlContext = SQLContext(sc)
# Define the data path
curr_path = os.path.dirname(os.path.abspath(__file__))
json_name = "out.json"
json_file_path = os.path.join(curr_path +
"/../Spark_Jobs/data/",
json_name)
parquet_file_path = createSQLContext(json_file_path, sqlContext)
print(parquet_file_path)
# Read from parquet file
parquetFile = sqlContext.read.parquet(parquet_file_path)
parquetFile.registerTempTable("tweets")
counter = sqlContext.sql("SELECT count(*) as cnt FROM tweets")
print("============= Count =================")
print("Count:: " + str(counter.collect()[0].cnt))
Example 12: run
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def run(self):
sc = SparkContext("local", "gender")
sqlContext = SQLContext(sc)
#StringType =(str, unicode)
_out = self.output().open('w')
#lines = sc.textFile("myUser.csv")
#fobj = self.input().open("r")
#lines = sc.textFile(fobj.name)
print(type(self.required_tasks['insert_source'].output()))
print(self.required_tasks['insert_source'])
#print(self.input()['insert_source'].input())
lines = sc.textFile("myUser.csv")
parts = lines.map(lambda l: l.split(","))
users = parts.map(lambda p: (p[0], p[1],p[2],p[3],p[4],p[5],p[6],p[7],
p[8],p[9],p[10],p[11],p[12],p[13],p[14],p[15],p[16],p[17],p[18],p[19]))
schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
print(schemaString)
_out.write(schemaString )
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
#schemaUser = sqlContext.createDataFrame(users, schema)
schemaUser = sqlContext.applySchema(users, schema)
schemaUser.registerTempTable("users")
results = sqlContext.sql("SELECT gender FROM users")
genders = results.map(lambda p : (p,1))
counts = genders.reduceByKey(lambda a, b: a + b) #.map(lambda t : ("Gender " + t(0) + " No " + t(1))).collect()
for name in counts.collect():
_out.write(str(name))
_out.close()
Example 13: main
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def main(dataFile, outputPath):
conf = SparkConf().setAppName("S3 Example").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
raw_text = sc.textFile(dataFile).persist(StorageLevel.MEMORY_AND_DISK)
csv_data = raw_text.map(lambda l: l.split(","))
row_data = csv_data.map(lambda p: dataIO.dataStruc(p))
interaction_df = sqlContext.createDataFrame(row_data)
# features.save_hdfs_parquet(interaction_df, outputPath)
dataIO.save_hdfs_parquet(interaction_df, outputPath)
interaction_df.registerTempTable("interactions")
tcp_interactions = sqlContext.sql( """
SELECT duration, dst_bytes, protocol_type FROM interactions WHERE protocol_type = 'tcp' AND duration > 1000 AND dst_bytes=0
""")
tcp_interactions.show()
features.print_tcp_interactions(tcp_interactions)
dataIO.print_from_dataio()
features.print_from_feature()
sc.stop()
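The TCP filter could equally be written with DataFrame methods instead of SQL. A sketch, assuming the same duration, dst_bytes and protocol_type columns on interaction_df:
# DataFrame-method equivalent of the SQL filter above (sketch only)
tcp_interactions = (interaction_df
                    .filter("protocol_type = 'tcp' AND duration > 1000 AND dst_bytes = 0")
                    .select("duration", "dst_bytes", "protocol_type"))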
Example 14: get_latest_data
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
def get_latest_data(self):
from pyspark.sql import SparkSession
import config
import pandas as pd
# initialise sparkContext
spark1 = SparkSession.builder \
.master(config.sp_master) \
.appName(config.sp_appname) \
.config('spark.executor.memory', config.sp_memory) \
.config("spark.cores.max", config.sp_cores) \
.getOrCreate()
sc = spark1.sparkContext
# using SQLContext to read parquet file
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from datetime import datetime
t1 = datetime.now()
df = sqlContext.read.parquet(config.proj_path+'/datas/appid_datapoint_parquet1')
# creating and querying fron the temporory table
df.registerTempTable('dummy')  # registerTempTable returns None, so don't assign its result
df1 = sqlContext.sql('select count(distinct application) as app_count, time_stamp, source from dummy group by source, time_stamp')
# data cleaning
self.p2_df = df1.toPandas()
dates_outlook = pd.to_datetime(pd.Series(self.p2_df.time_stamp),unit='ms')
self.p2_df.index = dates_outlook
self.p2_df['date'] = self.p2_df.index.date
self.p2_df = self.p2_df.sort_values(by='time_stamp')
t2 = datetime.now()
time_to_fetch = str(t2-t1)
Example 15: TestSQL
# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import sql [as alias]
class TestSQL(PySparkTestCase):
def setUp(self):
PySparkTestCase.setUp(self)
self.sqlCtx = SQLContext(self.sc)
def test_basic_functions(self):
rdd = self.sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}'])
srdd = self.sqlCtx.jsonRDD(rdd)
srdd.count()
srdd.collect()
srdd.schemaString()
srdd.schema()
# cache and checkpoint
self.assertFalse(srdd.is_cached)
srdd.persist(StorageLevel.MEMORY_ONLY_SER)
srdd.unpersist()
srdd.cache()
self.assertTrue(srdd.is_cached)
self.assertFalse(srdd.isCheckpointed())
self.assertEqual(None, srdd.getCheckpointFile())
srdd = srdd.coalesce(2, True)
srdd = srdd.repartition(3)
srdd = srdd.distinct()
srdd.intersection(srdd)
self.assertEqual(2, srdd.count())
srdd.registerTempTable("temp")
srdd = self.sqlCtx.sql("select foo from temp")
srdd.count()
srdd.collect()
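The jsonRDD, schemaString and SchemaRDD calls exercised by this test belong to the very early Spark 1.x API. A sketch of the same basic check against a current PySpark session (assuming a SparkSession named spark is available):
# Modern equivalent of the jsonRDD round-trip (sketch only)
df = spark.read.json(sc.parallelize(['{"foo":"bar"}', '{"foo":"baz"}']))
df.createOrReplaceTempView("temp")
assert spark.sql("select foo from temp").count() == 2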