This article collects typical usage examples of the pyspark.SQLContext.sql method in Python. If you are wondering how to use Python's SQLContext.sql, how it is called in practice, or what real-world examples look like, the curated code samples below should help. You can also explore further usage examples of the containing class, pyspark.SQLContext.
Below are 15 code examples of the SQLContext.sql method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
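All of the examples share the same basic pattern: create an SQLContext from a SparkContext, register a DataFrame as a temporary table, and query it with sql(). Here is a minimal sketch of that pattern before diving in; the file name and columns ('people.json', 'name', 'age') are placeholders, not taken from any example below.
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setAppName('sqlcontext_sql_demo')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# 'people.json' and the 'age' column are hypothetical placeholders
df = sqlContext.read.json('people.json')
df.registerTempTable('people')
adults = sqlContext.sql('SELECT name, age FROM people WHERE age >= 18')
adults.show()
sc.stop()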
Example 1: main
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
def main():
    conf = SparkConf().setAppName('artist_career')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    inputs = sys.argv[1]
    output = sys.argv[2]
    song_columns = ['SongNumber', 'SongID', 'AlbumID', 'AlbumName', 'ArtistID', 'ArtistLatitude',
                    'ArtistLocation', 'ArtistLongitude', 'ArtistName', 'Danceability', 'Duration',
                    'KeySignature', 'KeySignatureConfidence', 'Tempo', 'TimeSignature',
                    'TimeSignatureConfidence', 'Title', 'Year', 'Energy', 'ArtistFamiliarity',
                    'ArtistMbid', 'SongHotttnesss', 'Loudness', 'StartOfFadeOut', 'EndOfFadeIn',
                    'ModeConfidence']
    # Every column is read as a non-nullable string
    customSchema = StructType([StructField(c, StringType(), False) for c in song_columns])
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs, schema=customSchema)
    df.registerTempTable('artist_data')
    million_song = sqlContext.sql(
        "SELECT SongNumber, SongID, AlbumID, AlbumName, ArtistID, ArtistLatitude, ArtistLocation, ArtistLongitude, "
        "ArtistName, Danceability, Duration, KeySignature, KeySignatureConfidence, Tempo, TimeSignature, "
        "TimeSignatureConfidence, Title, Year, Energy, ArtistFamiliarity, ArtistMbid, SongHotttnesss, Loudness, "
        "StartOfFadeOut, EndOfFadeIn, ModeConfidence FROM artist_data WHERE Year != 0 AND ArtistFamiliarity != 'nan'")
    million_song.write.format('parquet').save(output)
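As a follow-up (not part of the original function), the Parquet directory written above can be read back and queried with the same pattern; a brief sketch, assuming the same sqlContext and output variables are in scope:
# Hypothetical continuation: reload the Parquet output and query it again
songs = sqlContext.read.parquet(output)
songs.registerTempTable('million_song')
sqlContext.sql("SELECT ArtistName, Year FROM million_song LIMIT 10").show()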
Example 2: __init__
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
class DataHandler:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath',
                          '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table,
                                                     driver="com.mysql.jdbc.Driver").load()
        return df
    def prepare_life_cycle(self, year, season):
        '''
        Prepare the life-cycle data.
        Pulls the daily AUM figures from t_CMMS_ASSLIB_ASSET and prepares:
            saum1 (last season's total AUM)
            saum2 (current season's total AUM)
            aum_now
            account_age (months)
            last_tr_date (days)
        :param year:
        :param season: 1, 2, 3, 4
        :return:
        '''
        # Work out the months of each season
        print('----------------------Life cycle - Start----------------------')
        print('Preparing life-cycle data...')
        print('Computing the months')
        if season == 1:
            # date1: months of the current season
            date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03']
            # date2: months of the previous season
            date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12']
        elif season == 4:
            date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12']
            date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-09']
        else:
            date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1),
                     str(year) + '-0' + str(3 * season)]
            date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4),
                     str(year) + '-0' + str(3 * season - 3)]
        print('Current season months (new):', date1)
        print('Previous season months (old):', date2)
        # Load the AUM table
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()
        # Stitch together the three monthly slices of each season
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))
        # Total AUM per season
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')
        # Outer join of the two seasons
        '''
        +-----------+---------+---------+
        |    CUST_NO|sum(AUM2)|sum(AUM1)|
        +-----------+---------+---------+
        |81005329523|     null|294844.59|
        |81011793167|     null|   365.20|
        |81015319088|     null|  9640.96|
        +-----------+---------+---------+
        '''
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')
        # Current AUM per customer (latest STAT_DAT first)
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')
        aum_now_sql = "select CUST_NO, first(AUM) as AUM_NOW from group_in group by CUST_NO"
        aum_now = self.sqlctx.sql(aum_now_sql)
        # Drop the cached temporary table
        self.sqlctx.dropTempTable('group_in')
        # Join everything together
        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')
        # ......... the rest of this example is omitted .........
Example 3: print_fun
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
import itertools

def print_fun(collect):
    for item in itertools.chain(collect):
        print "|Name: " + item[0], "|Value: " + str(item[1]), "|Attribute: " + item[2]

if __name__ == "__main__":
    conf = SparkConf().setAppName("json_ops").setMaster("local[3]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Parallelize the JSON strings directly into an RDD
    equipmentRDD = sc.parallelize(['{"name":"asset1","value":1,"attribute":"屬性1"}',
                                   '{"name":"asset2","value":2,"attribute":"屬性2"}',
                                   '{"name":"asset3","value":3,"attribute":"屬性3"}'])
    equipment = sqlContext.jsonRDD(equipmentRDD)
    equipment.registerTempTable("equipment")
    assets = sqlContext.sql("SELECT * FROM equipment as eq WHERE eq.value >= 1 AND eq.value <= 2")
    assets.show()
    # Map over the query result
    assetMap = assets.map(lambda asset: (asset.name, asset.value, asset.attribute)).foreachPartition(print_fun)
    sc.stop()
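A note on the API: SQLContext.jsonRDD has been deprecated since Spark 1.4; on newer versions the same DataFrame can be built through the read.json reader instead. A small sketch, assuming the same sqlContext and equipmentRDD as above:
# Equivalent of sqlContext.jsonRDD(equipmentRDD) on Spark 1.4+
equipment = sqlContext.read.json(equipmentRDD)
equipment.registerTempTable("equipment")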
Example 4: SparkContext
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString", "true"))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
#_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
_sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
lists = []
for r in rs:
    usermac = r[0]
    days = r[1]
    t = (usermac, days)
    lists.append(t)
    # logger.debug(t)
dao = MysqlDao()
_sql = "TRUNCATE TABLE user_days"
dao.insert(_sql)
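The snippet stops after truncating user_days; a plausible continuation would bulk-insert the collected (usermac, days) tuples. MysqlDao is project-specific and its API is not shown, so the sketch below uses plain pymysql with placeholder connection settings; it is an assumption, not the original code.
import pymysql  # assumption: a table user_days(usermac, days) exists in the 'stats' database

conn = pymysql.connect(host='127.0.0.1', user='root', password='', db='stats')
with conn.cursor() as cur:
    # bulk-insert the per-MAC day counts collected above
    cur.executemany("INSERT INTO user_days(usermac, days) VALUES (%s, %s)", lists)
conn.commit()
conn.close()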
Example 5: range
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
payload['data'] = []
if len(pub_data[key]['data']) > batch_size:
    total_len = len(pub_data[key]['data'])
    # push the data in chunks of batch_size
    for index in range(0, total_len, batch_size):
        if index + batch_size > total_len:
            payload['data'] = pub_data[key]['data'][index:total_len]
        else:
            payload['data'] = pub_data[key]['data'][index:index + batch_size]
        pushToRedis(payload)
else:
    payload['data'] = pub_data[key]['data']
    pushToRedis(payload)
# creating publisher-related data from the daily data
pub_dfp_df = sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(device_finger_print) as dfps , collect_list(c_dfp) as dfp_counts from (select publisher_id ,id5, device_finger_print ,count(device_finger_print) as c_dfp from testtable where id5!="" group by publisher_id,id5,device_finger_print) group by publisher_id,id5')
pub_ip_df = sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(ip) as ips , collect_list(c_ip) as ip_counts from (select publisher_id ,id5, ip ,count(ip) as c_ip from testtable where id5!="" group by publisher_id,id5,ip) group by publisher_id,id5')
pub_session_df = sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(sessionId) as sessions , collect_list(c_session) as session_counts from (select publisher_id ,id5, sessionId ,count(sessionId) as c_session from testtable where id5!="" group by publisher_id,id5,sessionId) group by publisher_id,id5')
pub_df = pub_dfp_df.join(pub_ip_df, ['publisher_id', 'user_identifier'], 'outer')
pub_df = pub_df.join(pub_session_df, ['publisher_id', 'user_identifier'], 'outer')
pub_daily_df = sqlContext.read.json(daily_fetch_path)
if len(pub_daily_df.columns) == 0:
    for dtype in pub_df.dtypes:
        pub_daily_df = pub_daily_df.withColumn(dtype[0], lit(None).cast(dtype[1]))
pub_daily_df = pub_daily_df.select(pub_df.columns)
pub_df = pub_df.union(pub_daily_df)
pub_daily_df = pub_daily_df.select(pub_df.columns)
Developer ID: ajaymshr42, Project: tutorialNotes, Lines of code: 33, Source file: pb_integrated_complete_timed_preprocessing_publisher.py
Example 6: print
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
print("------- ******* Task 1 ******* -------")
columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd',
'status']
rdd = sc.textFile(datadir + "prop-prices.csv")
df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
df.registerTempTable("properties")
df.persist()
# Task 2: let's do some basic analysis on the data.
# Find how many records we have per year, and print them out sorted by year.
print("------- ******* Task 2 ******* -------")
year_count = sqlContext.sql(
"""select year(date) as year, count(*) as count
from properties
group by year(date)
order by year(date)""").collect()
print(year_count)
# Task 3: Everyone knows that properties in London are expensive.
# Find the average property price by county,
# and print the top 10 most expensive counties
print("------- ******* Task 3 ******* -------")
county_prices = sqlContext.sql(
"""select county, avg(price) as avg_price
from properties
group by county
order by avg(price) desc
limit 10""").collect()
print(county_prices)
Example 7: SparkContext
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
import os
import sys
from pyspark import SQLContext
from pyspark import SparkContext
#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"
#os.environ["HIVE_HOME"] = "/opt/hive"
sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf( "spark.sql.shuffle.partitions", "1")
sql_context.sql(""" use fex_test """)
Example 8: SparkContext
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
sparkConf = (SparkConf()
.set(key="carbon.insert.batch.size", value="1000")
.set(key="spark.driver.allowMultipleContexts",value="true")
.set(key="spark.executor.extraJavaOptions", value="-Dwso2_custom_conf_dir=/home/supun/Downloads/wso2das-3.1.0/repository/conf"))
#Create a new SparkContext using the above SparkConf.
sparkCtx = SparkContext(conf=sparkConf)
# Check spark master.
print(sparkConf.get("spark.master"));
###################### Get data from DAS table #####################
sqlCtx = SQLContext(sparkCtx)
sqlCtx.sql('CREATE TEMPORARY TABLE table1 ' +
'USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider ' +
'OPTIONS (tenantId "-1234", tableName "IRIS_DATA_STREAM")')
df = sqlCtx.sql("SELECT * FROM table1");
##################### Prepare the data #####################
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
outputCol="features")
assembledDf = assembler.transform(df)
assembledDf.show()
transformedDf = assembledDf.select(col("class").alias("label"), col("features"))
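The excerpt stops after preparing the label/features columns. A natural next step, not shown in the original, would be to fit a Spark ML classifier on transformedDf; the sketch below is an assumption and first indexes the string class label, since LogisticRegression expects a numeric label column.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer

# Hypothetical continuation: index the string label, then train a simple model
indexed = StringIndexer(inputCol="label", outputCol="label_idx").fit(transformedDf).transform(transformedDf)
lr = LogisticRegression(featuresCol="features", labelCol="label_idx", maxIter=10)
model = lr.fit(indexed)
model.transform(indexed).select("label", "prediction").show(5)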
Example 9: main
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
def main():
    conf = SparkConf().setAppName('Scenario')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    read_parq = sqlContext.read.parquet(inputs).cache()
    read_parq.registerTempTable('data')
    scenario_1 = sqlContext.sql("""
        SELECT ArtistName, MAX(ArtistFamiliarity) AS MaxFamiliarity
        FROM data
        WHERE ArtistFamiliarity != 'nan' AND ArtistLocation IS NOT NULL
        GROUP BY ArtistName
        ORDER BY MAX(ArtistFamiliarity) DESC
        LIMIT 100
        """)
    # scenario_1.show()
    scenario_1.registerTempTable('scenario1')
    joined_artist = sqlContext.sql("""
        SELECT A.ArtistName, B.ArtistFamiliarity, B.SongID, B.SongHotttnesss, B.ArtistLocation, B.ArtistLatitude, B.ArtistLongitude
        FROM scenario1 A
        INNER JOIN data B ON A.ArtistName = B.ArtistName
        """)
    # joined_artist.show()
    joined_artist.registerTempTable('joined_artist')
    AvgFamiliarity = sqlContext.sql("""
        SELECT first(ArtistName) AS Artist_Name, MIN(ArtistFamiliarity) AS MinFamiliarity, MAX(ArtistFamiliarity) AS MaxFamiliarity, ArtistLocation AS Location, first(ArtistLatitude) AS Latitude, first(ArtistLongitude) AS Longitude
        FROM joined_artist
        WHERE ArtistFamiliarity != 'nan' AND ArtistLatitude <> '' AND ArtistLatitude IS NOT NULL
        GROUP BY ArtistLocation
        ORDER BY MAX(ArtistFamiliarity) DESC
        """)
    # AvgFamiliarity.show()
    AvgFamiliarity.rdd.map(tuple).coalesce(1).saveAsTextFile(output + '/scenario1')
    scenario_2 = sqlContext.sql("""
        SELECT ArtistName, MAX(SongHotttnesss) AS MaxHotttness
        FROM data
        WHERE SongHotttnesss != 'nan'
        GROUP BY ArtistName
        ORDER BY MAX(SongHotttnesss) DESC
        LIMIT 10
        """)
    scenario_2.registerTempTable('scenario2')
    joined_artist_hotness = sqlContext.sql("""
        SELECT B.Title, A.ArtistName, B.SongHotttnesss, B.ArtistFamiliarity
        FROM scenario2 A
        INNER JOIN data B ON A.ArtistName = B.ArtistName AND A.MaxHotttness = B.SongHotttnesss
        """)
    # joined_artist_hotness.show()
    joined_artist_hotness.rdd.map(tuple).coalesce(1).saveAsTextFile(output + '/scenario2')
    # Of a particular artist
    scenario_3 = sqlContext.sql("""
        SELECT ArtistName, Year, AVG(ArtistFamiliarity) AS AvgFamiliarity, COUNT(SongID) AS Total_Songs
        FROM data
        WHERE ArtistName = 'Britney Spears' AND Year != 0
        GROUP BY Year, ArtistName
        ORDER BY Year
        """)
    scenario_3.rdd.map(tuple).coalesce(1).saveAsTextFile(output + '/scenario3')
    scenario_4 = sqlContext.sql("""
        SELECT ArtistName, MAX(ArtistFamiliarity) - MIN(ArtistFamiliarity) AS Difference
        FROM data
        GROUP BY ArtistName
        """)
    scenario_4.registerTempTable('scenario4')
    top_diff = sqlContext.sql("""
        SELECT ArtistName, MAX(Difference) AS MAXDiff
        FROM scenario4
        GROUP BY ArtistName
        ORDER BY MAXDiff DESC
        LIMIT 10
        """)
Example 10: str
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
lines_with_keyword = textFile.filter(lambda line: "@TMobileHelp" in line)
print "Number of tweets with TMobileHelp: " + str(lines_with_keyword.count())
print lines_with_keyword.lookup("text")
#print lines_with_keyword
#schemaTweets = sqlContext.createDataFrame(lines_with_keyword)
#schemaTweets.registerTempTable("tweets")
#row = Row("text")
#lines_with_keyword.map(row).toDF()
#lines_with_keyword.printSchema()
#print tweets.take(5)
#print keyword_onlytext.take(5)
df = sqlContext.read.json(currentFile)
#df.printSchema()
#df_distinctlang = df.select("lang").distinct()
#df_lang = sql.read.json("TwitterLanguages.json")
#df_join =df_distinctlang.join(df_lang, df_distinctlang.lang == df_lang.code).drop(df_lang.code).drop(df_lang.status)
#df_join.show()
df.registerTempTable("df")
df_text = df.select("text")
#df_text.printSchema()
df_sql = sqlContext.sql("SELECT text FROM df WHERE text like '%TMobileHelp%'")
print df_sql.collect()
#df_sql.rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("file.csv")
#df.toPandas().to_csv('mycsv.csv')
Example 11: SparkConf
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
if __name__ == "__main__":
file_path = os.path.abspath("../doc/book.txt")
print file_path
conf = SparkConf().setAppName("schema_test").setMaster("local")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
lines = sc.textFile(file_path)
# 切分
parts = lines.map(lambda lines: lines.split(","))
# 隱射表間關係(定義表結構)
book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))
# 轉換成schema並注冊
schemaPeople = sqlContext.createDataFrame(book)
schemaPeople.registerTempTable("book")
# 定義sqk語句(查詢prize在50、60之間的書)
book = sqlContext.sql("SELECT * FROM book WHERE price > 50.0 AND price < 60 OR name LIKE '%Spark%'")
# 查詢結果進行隱射
bookMap = book.map(lambda books: (books.name, books.author, books.price, books.publish))
for book in bookMap.collect():
print "|Name: " + book[0], "|Author: " + book[1], "|Price: " + str(book[2]), "|Publish: " + book[3] + "|"
sc.stop()
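Examples 3, 11 and 13 call .map directly on a DataFrame, which only works on Spark 1.x; from Spark 2.0 onward a DataFrame no longer exposes map, and the equivalent is to go through the underlying RDD. A small sketch of the 2.x form of the mapping above:
    # Spark 2.x equivalent of the book.map(...) call above
    bookMap = book.rdd.map(lambda b: (b.name, b.author, b.price, b.publish))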
Example 12: dateformat
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
{'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766},
{'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766},
]).registerAsTable("adload")
_adPlayDF=sqlContext.createDataFrame([
{'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
{'uid': '2', 'adid': 'b','guuid':'aa','createtime':1450823569766},
{'uid': '4', 'adid': 'd','guuid':'bb','createtime':1450823268766},
]).registerAsTable("adplay")
_adClickDF =sqlContext.createDataFrame([
{'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823580766},
]).registerAsTable("adclick")
'''
sqlContext.registerFunction("dateformat", lambda x:longTime2str(x),StringType())
adLoadDf=sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
'from adload where createtime is not null and dateformat(createtime)=%s '
'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")
adPlayDf=sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")
# sqlContext.sql('select sum(pv) from radplay').foreach(printx)
adClick=sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")
_df=sqlContext.sql('select A.hosid,A.day,A.adid,A.pv,A.uv,B.pv,B.uv,C.pv,C.uv from radload A '
'left join radplay B on A.hosid=B.hosid and A.adid=B.adid '
Example 13: SparkConf
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
if __name__ == "__main__":
json_path = os.path.abspath("../doc/book.json")
txt_path = os.path.abspath("../doc/book.txt")
conf = SparkConf().setAppName("mul_sources").setMaster("local[2]")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# json讀取並隱射
json_df = sqlContext.read.json(json_path)
json_df.registerTempTable("json_book")
# txt讀取並隱射
lines = sc.textFile(txt_path)
parts = lines.map(lambda lines: lines.split(","))
book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))
schemaPeople = sqlContext.createDataFrame(book)
schemaPeople.registerTempTable("txt_book")
# sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook LEFT JOIN txt_book AS tbook ON tbook.name=jbook.name")
sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook , txt_book AS tbook "
"WHERE jbook.name=tbook.name ORDER BY tbook.price")
bookMap = sql_book.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))
for book in bookMap.collect():
print book[0], book[1], book[2], book[3], book[4]
sc.stop()
Example 14: SQLContext
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
sqlContext = SQLContext(sc)
# path to hillary/enron avro
enr = sqlContext.read.format(
"com.databricks.spark.avro").load(
"s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
"com.databricks.spark.avro").load(
"s3n://datasets-396316040607/hillary/*.avro").repartition(16)
# register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")
# register udf
sqlContext.registerFunction(
"getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)
# do the cosine similarity on the text, get the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
"e.contents e_mail, h.contents h_mail, "
"getCos(e.contents, h.contents) as cos_sim "
"from hillary as h join enron as e order by cos_sim "
"desc limit 1000")
# write back out to s3
out.save("s3n://datasets-396316040607/cos_sim/", format="json")
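get_cosine and text_to_vector are defined elsewhere in that project and are not shown here. For reference, this word-count cosine-similarity pattern is commonly implemented roughly as follows; this is a hypothetical sketch, not the original helpers.
import math
import re
from collections import Counter

WORD_RE = re.compile(r"\w+")

def text_to_vector(text):
    # Bag-of-words term counts
    return Counter(WORD_RE.findall(text or ""))

def get_cosine(vec1, vec2):
    # Cosine similarity between two Counter vectors
    common = set(vec1) & set(vec2)
    numerator = sum(vec1[w] * vec2[w] for w in common)
    denominator = math.sqrt(sum(v * v for v in vec1.values())) * \
                  math.sqrt(sum(v * v for v in vec2.values()))
    return numerator / denominator if denominator else 0.0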
Example 15: StructField
# Required import: from pyspark import SQLContext [as alias]
# Or alternatively: from pyspark.SQLContext import sql [as alias]
StructField('suppid', StringType(), True),
StructField('logtime', LongType(), True),
StructField('usermac', StringType(), True)
]
schema = StructType(fields)
rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup != None)
# rdd1.foreach(printx)
# sc.stop()
ret_df = sqlContext.createDataFrame(rdd1, schema)
ret_df.registerTempTable("loginflowlog_overall")
_sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \
"from loginflowlog_overall " \
"group by logtype"
rs_df = sqlContext.sql(_sql)
service = LoginflowlogMysqlService()
ret_overall_list = service.getRetOverall(rs_df.collect(), day)
_sql_delete = "delete from login_flow_global_count where date ='%s'" % day
_sql_insert = "insert into login_flow_global_count(date," \
"prelogin_num,prelogin_pnum,login_num,login_pnum," \
"login_click_num,login_click_pnum,forward_num,forward_pnum," \
"preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
"values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
logger.info(len(ret_overall_list))
# detail
rdd2 = rdd.map(convert_list).filter(lambda tup: tup != None) \
.map(convert_copy_logtype).values().flatMap(list)