

Python SQLContext.sql Method Code Examples

This article collects and summarizes typical usage examples of the pyspark.SQLContext.sql method in Python. If you are wondering what SQLContext.sql does, how to call it, or what real-world code using it looks like, the hand-picked examples below should help. You can also explore further usage examples of the class it belongs to, pyspark.SQLContext.


The sections below present 15 code examples of the SQLContext.sql method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
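Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: build an SQLContext from a SparkContext, register a DataFrame as a temporary table, and query it with SQLContext.sql. It uses the Spark 1.x API that the examples below are written against; the application name, table name and column names are made up for illustration only.

from pyspark import SparkConf, SparkContext, SQLContext

# Minimal usage sketch (Spark 1.x API); the table and column names are illustrative only.
conf = SparkConf().setAppName('sqlcontext_sql_demo').setMaster('local[2]')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Build a small DataFrame and expose it to SQL as a temporary table.
df = sqlContext.createDataFrame([('Alice', 34), ('Bob', 19)], ['name', 'age'])
df.registerTempTable('people')

# Run a SQL query against the temporary table and collect the result.
adults = sqlContext.sql("SELECT name, age FROM people WHERE age >= 21")
print(adults.collect())

sc.stop()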

Example 1: main

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
import sys

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType

def main():
    conf = SparkConf().setAppName('artist_career')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    inputs = sys.argv[1]
    output = sys.argv[2]

    # Every column of the Million Song data is read as a string.
    field_names = ['SongNumber', 'SongID', 'AlbumID', 'AlbumName', 'ArtistID', 'ArtistLatitude',
                   'ArtistLocation', 'ArtistLongitude', 'ArtistName', 'Danceability', 'Duration',
                   'KeySignature', 'KeySignatureConfidence', 'Tempo', 'TimeSignature',
                   'TimeSignatureConfidence', 'Title', 'Year', 'Energy', 'ArtistFamiliarity',
                   'ArtistMbid', 'SongHotttnesss', 'Loudness', 'StartOfFadeOut', 'EndOfFadeIn',
                   'ModeConfidence']
    customSchema = StructType([StructField(name, StringType(), False) for name in field_names])

    # Load the CSV input with the explicit schema.
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs, schema=customSchema)

    df.registerTempTable('artist_data')

    # Keep rows with a known year and a valid artist familiarity, then save as Parquet.
    million_song = sqlContext.sql(
        "SELECT {} FROM artist_data WHERE Year!=0 AND ArtistFamiliarity!='nan'".format(','.join(field_names)))
    million_song.write.format('parquet').save(output)
Author: BandeepSingh, Project: Million-Song-DataSet-Analysis, Lines: 17, Source: artist_parquet.py

Example 2: __init__

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
from pyspark import SparkConf, SparkContext, SQLContext

# MySQLHelper is a project-specific MySQL helper class; its import is not shown in this snippet.


class DataHandler:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)

        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df




    def prepare_life_cycle(self, year, season):
        '''
        Prepare life-cycle data.
        Daily AUM figures are taken from t_CMMS_ASSLIB_ASSET.

        Prepared fields:
        saum1 (last season sum aum)
        saum2 (current season sum aum)
        aum_now
        account_age (months)
        last_tr_date (days)

        :param year:
        :param season: 1,2,3,4
        :return:
        '''

        # Work out the season months
        print('---------------------- Life cycle - Start ----------------------')

        print('Preparing life-cycle data...')
        print('Computing season months')

        if season == 1:
            # date1: months of the current season
            date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03']

            # date2: months of the previous season
            date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12']

        elif season == 4:
            date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12']
            date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-09']

        else:
            date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1),
                     str(year) + '-0' + str(3 * season)]
            date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4),
                     str(year) + '-0' + str(3 * season - 3)]

        print('Current season months (new):', date1)
        print('Previous season months (old):', date2)

        # Load the AUM table
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()

        # Concatenate the three monthly snapshots of each season
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))

        # Compute each season's total AUM per customer
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')

        # Outer-join the two seasons
        '''
        +-----------+---------+---------+
        |    CUST_NO|sum(AUM2)|sum(AUM1)|
        +-----------+---------+---------+
        |81005329523|     null|294844.59|
        |81011793167|     null|   365.20|
        |81015319088|     null|  9640.96|
        +-----------+---------+---------+
        '''
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')

        # Pick out the current AUM
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')

        aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO"

        aum_now = self.sqlctx.sql(aum_now_sql)
        # Drop the temporary table
        self.sqlctx.dropTempTable('group_in')

        # Join with the current AUM
        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')
#......... remainder of this code omitted .........
Author: summer-apple, Project: spark, Lines: 103, Source: band_card.py

Example 3: print_fun

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
import itertools

def print_fun(collect):
    for item in itertools.chain(collect):
        print "|Name: " + item[0], "|Value: " + str(item[1]), "|Attribute: " + item[2]

if __name__ == "__main__":
    conf = SparkConf().setAppName("json_ops").setMaster("local[3]")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # Parallelize the JSON strings directly into an RDD
    equipmentRDD = sc.parallelize(['{"name":"asset1","value":1,"attribute":"属性1"}',
                                   '{"name":"asset2","value":2,"attribute":"属性2"}',
                                   '{"name":"asset3","value":3,"attribute":"属性3"}'])

    equipment = sqlContext.jsonRDD(equipmentRDD)
    equipment.registerTempTable("equipment")

    assets = sqlContext.sql("SELECT * FROM equipment as eq WHERE eq.value >= 1 AND eq.value <= 2")

    assets.show()

    # Map the query results and print each partition
    assetMap = assets.map(lambda asset: (asset.name, asset.value, asset.attribute)).foreachPartition(print_fun)

    sc.stop()
Author: imperio-wxm, Project: spark-by-python, Lines: 32, Source: json_ops.py

Example 4: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(appName)
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
    #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
    _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    lists = []
    for r in rs:
        usermac = r[0]
        days = r[1]
        t = (usermac,days)
        lists.append(t)
        #logger.debug(t)

    dao = MysqlDao()

    _sql = "TRUNCATE TABLE user_days"
    dao.insert(_sql)
Author: wangcunxin, Project: spark_py, Lines: 33, Source: login_days_main.py

Example 5: range

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
            payload['data']=[]
            if(len(pub_data[key]['data'])>batch_size):
                total_len=len(pub_data[key]['data'])
                for index in range(0,total_len/batch_size,batch_size):
                    if(index+batch_size>total_len):
                        payload['data']=pub_data[key]['data'][index:total_len-index]
                    else:
                        payload['data']=pub_data[key]['data'][index:index+batch_size]
                pushToRedis(payload)
            else:
                
                payload['data']=pub_data[key]['data']
                pushToRedis(payload)
                    
    # Creating publisher-related data from the daily data
    pub_dfp_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(device_finger_print) as dfps , collect_list(c_dfp) as dfp_counts from (select publisher_id ,id5, device_finger_print ,count(device_finger_print) as c_dfp  from testtable where id5!="" group by publisher_id,id5,device_finger_print)  group by publisher_id,id5')
    pub_ip_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(ip) as ips , collect_list(c_ip) as ip_counts from (select publisher_id ,id5, ip ,count(ip) as c_ip  from testtable where id5!="" group by publisher_id,id5,ip) group by publisher_id,id5')
    pub_session_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(sessionId) as sessions , collect_list(c_session) as session_counts from (select publisher_id ,id5, sessionId ,count(sessionId) as c_session  from testtable where id5!="" group by publisher_id,id5,sessionId) group by publisher_id,id5')
    pub_df=pub_dfp_df.join(pub_ip_df,['publisher_id','user_identifier'],'outer')
    pub_df=pub_df.join(pub_session_df,['publisher_id','user_identifier'],'outer')
    pub_daily_df=sqlContext.read.json(daily_fetch_path)

    if(len(pub_daily_df.columns)==0):
        for dtype in pub_df.dtypes:
            pub_daily_df=pub_daily_df.withColumn(dtype[0],lit(None).cast(dtype[1]))
            
    pub_daily_df=pub_daily_df.select(pub_df.columns)
    pub_df=pub_df.union(pub_daily_df)


    pub_daily_df=pub_daily_df.select(pub_df.columns)
Author: ajaymshr42, Project: tutorialNotes, Lines: 33, Source: pb_integrated_complete_timed_preprocessing_publisher.py

Example 6: print

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
               'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd',
               'status']

    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
    print("------- ******* Task 2 ******* -------")
    year_count = sqlContext.sql(
        """select   year(date) as year, count(*) as count
        from     properties
        group by year(date)
        order by year(date)""").collect()
    print(year_count)

    # Task 3: Everyone knows that properties in London are expensive.
    # Find the average property price by county,
    # and print the top 10 most expensive counties
    print("------- ******* Task 3 ******* -------")
    county_prices = sqlContext.sql(
        """select   county, avg(price) as avg_price
        from     properties
        group by county
        order by avg(price) desc
        limit    10""").collect()
    print(county_prices)
Author: eyalbenivri, Project: sdp-spark-workshop, Lines: 33, Source: lab4-solution.py

Example 7: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"

#os.environ["HIVE_HOME"] = "/opt/hive"


sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf("spark.sql.shuffle.partitions", "1")
sql_context.sql(""" use fex_test """)
Author: hongbin0908, Project: bintrade, Lines: 19, Source: sparklib.py

Example 8: SparkContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

sparkConf = (SparkConf()
             .set(key="carbon.insert.batch.size", value="1000")
             .set(key="spark.driver.allowMultipleContexts",value="true")
             .set(key="spark.executor.extraJavaOptions", value="-Dwso2_custom_conf_dir=/home/supun/Downloads/wso2das-3.1.0/repository/conf"))

#Create a new SparkContext using the above SparkConf.
sparkCtx = SparkContext(conf=sparkConf)

# Check spark master.
print(sparkConf.get("spark.master"));


###################### Get data from DAS table #####################
sqlCtx = SQLContext(sparkCtx)
sqlCtx.sql('CREATE TEMPORARY TABLE table1 ' +
           'USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider ' +
           'OPTIONS (tenantId "-1234", tableName "IRIS_DATA_STREAM")')

df = sqlCtx.sql("SELECT * FROM table1");



##################### Prepare the data #####################
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
                            outputCol="features")

assembledDf = assembler.transform(df)

assembledDf.show()

transformedDf = assembledDf.select(col("class").alias("label"), col("features"))
Author: SupunS, Project: play-ground, Lines: 33, Source: PySpark-Sample.py

Example 9: main

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
def main():
     conf = SparkConf().setAppName('Scenario')
     sc = SparkContext(conf=conf)
     assert sc.version >= '1.5.1'
     sqlContext = SQLContext(sc)
     read_parq=sqlContext.read.parquet(inputs).cache()

     read_parq.registerTempTable('data')
     scenario_1=sqlContext.sql("""
     SELECT ArtistName , MAX(ArtistFamiliarity) AS MaxFamiliarity
     FROM data
     where ArtistFamiliarity!='nan' and ArtistLocation is NOT NULL
     GROUP BY ArtistName
     ORDER BY MAX(ArtistFamiliarity) DESC
     LIMIT 100
     """)
     #scenario_1.show()
     scenario_1.registerTempTable('scenario1')

     joined_artist=sqlContext.sql("""
     SELECT A.ArtistName , B.ArtistFamiliarity,B.SongID,B.SongHotttnesss,B.ArtistLocation,B.ArtistLatitude,B.ArtistLongitude
     FROM scenario1 A
     INNER JOIN data B ON A.ArtistName=B.ArtistName
     """)

     #joined_artist.show()
     joined_artist.registerTempTable('joined_artist')

     AvgFamiliarity=sqlContext.sql("""
     SELECT first(ArtistName) AS Artist_Name, MIN(ArtistFamiliarity) AS MinFamiliarity,MAX(ArtistFamiliarity) AS MaxFamiliarity,ArtistLocation AS Location, first(ArtistLatitude) AS Latitude, first(ArtistLongitude) AS Longitude
     FROM joined_artist
     where ArtistFamiliarity!='nan' and ArtistLatitude <> '' and ArtistLatitude is not NULL
     GROUP BY ArtistLocation
     ORDER BY MAX(ArtistFamiliarity) DESC
     """)


     #AvgFamiliarity.show()

     AvgFamiliarity.rdd.map(tuple).coalesce(1).saveAsTextFile(output+'/scenario1')


     scenario_2=sqlContext.sql("""
     SELECT ArtistName , MAX(SongHotttnesss) as MaxHotttness
     FROM data
     where SongHotttnesss!='nan'
     GROUP BY ArtistName
     ORDER BY MAX(SongHotttnesss) DESC
     LIMIT 10
     """)
     scenario_2.registerTempTable('scenario2')

     joined_artist_hotness=sqlContext.sql("""
     SELECT B.Title, A.ArtistName , B.SongHotttnesss,B.ArtistFamiliarity
     FROM scenario2 A
     INNER JOIN data B ON A.ArtistName=B.ArtistName and A.MaxHotttness=B.SongHotttnesss
     """)
     #joined_artist_hotness.show()
     joined_artist_hotness.rdd.map(tuple).coalesce(1).saveAsTextFile(output+'/scenario2')

     #Of a particular artist
     scenario_3=sqlContext.sql("""
     SELECT ArtistName , Year, AVG(ArtistFamiliarity) AS AvgFamiliarity,COUNT(SongID) AS Total_Songs
     FROM data
     where ArtistName='Britney Spears' and Year!=0
     GROUP BY Year,ArtistName
     ORDER BY Year
     """)


     scenario_3.rdd.map(tuple).coalesce(1).saveAsTextFile(output+'/scenario3')

     scenario_4=sqlContext.sql("""
     SELECT ArtistName , MAX(ArtistFamiliarity)-MIN(ArtistFamiliarity) AS Difference
     FROM data
     GROUP BY ArtistName
     """)
     scenario_4.registerTempTable('scenario4')

     top_diff=sqlContext.sql("""
     SELECT ArtistName , MAX(Difference) as MAXDiff
     FROM scenario4
     GROUP BY ArtistName
     ORDER BY MAXDiff DESC
     LIMIT 10
     """)
Author: BandeepSingh, Project: Million-Song-DataSet-Analysis, Lines: 88, Source: artist_familiarity.py

Example 10: str

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
lines_with_keyword = textFile.filter(lambda line: "@TMobileHelp" in line)

print "Number of tweets with TMobileHelp: " + str(lines_with_keyword.count())
print lines_with_keyword.lookup("text")
#print lines_with_keyword
#schemaTweets = sqlContext.createDataFrame(lines_with_keyword)
#schemaTweets.registerTempTable("tweets")
#row = Row("text")
#lines_with_keyword.map(row).toDF()
#lines_with_keyword.printSchema()
#print tweets.take(5)
#print keyword_onlytext.take(5)


df = sqlContext.read.json(currentFile)
#df.printSchema()
#df_distinctlang = df.select("lang").distinct()
#df_lang = sql.read.json("TwitterLanguages.json")
#df_join =df_distinctlang.join(df_lang, df_distinctlang.lang == df_lang.code).drop(df_lang.code).drop(df_lang.status)
#df_join.show()
df.registerTempTable("df")
df_text = df.select("text")
#df_text.printSchema()
df_sql = sqlContext.sql("SELECT text FROM df WHERE text like '%TMobileHelp%'")


print df_sql.collect()

#df_sql.rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("file.csv")
#df.toPandas().to_csv('mycsv.csv')
Author: KatieWoodard, Project: Spark, Lines: 32, Source: pysparkAnalyze.py

Example 11: SparkConf

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
import os

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import Row

if __name__ == "__main__":
    file_path = os.path.abspath("../doc/book.txt")
    print file_path

    conf = SparkConf().setAppName("schema_test").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    lines = sc.textFile(file_path)
    # Split each line on commas
    parts = lines.map(lambda lines: lines.split(","))

    # Map each record to a Row (define the table structure)
    book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))

    # Convert to a DataFrame and register it as a temporary table
    schemaPeople = sqlContext.createDataFrame(book)
    schemaPeople.registerTempTable("book")

    # Define the SQL query (books priced between 50 and 60, or whose name contains 'Spark')
    book = sqlContext.sql("SELECT * FROM book WHERE price > 50.0 AND price < 60 OR name LIKE '%Spark%'")

    # Map the query results
    bookMap = book.map(lambda books: (books.name, books.author, books.price, books.publish))

    for book in bookMap.collect():
        print "|Name: " + book[0], "|Author: " + book[1], "|Price: " + str(book[2]), "|Publish: " + book[3] + "|"

    sc.stop()
Author: imperio-wxm, Project: spark-by-python, Lines: 31, Source: schema_test.py

Example 12: dateformat

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
        {'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766},
        {'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766},
    ]).registerAsTable("adload")
    _adPlayDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
        {'uid': '2', 'adid': 'b','guuid':'aa','createtime':1450823569766},
        {'uid': '4', 'adid': 'd','guuid':'bb','createtime':1450823268766},
    ]).registerAsTable("adplay")
    _adClickDF =sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823580766},
    ]).registerAsTable("adclick")
    '''
    sqlContext.registerFunction("dateformat", lambda x:longTime2str(x),StringType())

    adLoadDf=sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                            'from adload where createtime is not null and dateformat(createtime)=%s '
                            'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")

    adPlayDf=sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                            'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                            'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")

    # sqlContext.sql('select sum(pv) from radplay').foreach(printx)
    adClick=sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                            'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                            'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")



    _df=sqlContext.sql('select A.hosid,A.day,A.adid,A.pv,A.uv,B.pv,B.uv,C.pv,C.uv from radload A '
                       'left join radplay B on A.hosid=B.hosid and A.adid=B.adid '
Author: wangcunxin, Project: spark_py, Lines: 33, Source: adcount.py

Example 13: SparkConf

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
import os

from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import Row

if __name__ == "__main__":
    json_path = os.path.abspath("../doc/book.json")
    txt_path = os.path.abspath("../doc/book.txt")

    conf = SparkConf().setAppName("mul_sources").setMaster("local[2]")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # Read the JSON file and register it as a temporary table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # Read the text file, map it to Rows, and register it as a temporary table
    lines = sc.textFile(txt_path)
    parts = lines.map(lambda lines: lines.split(","))
    book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))
    schemaPeople = sqlContext.createDataFrame(book)
    schemaPeople.registerTempTable("txt_book")

    # sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook LEFT JOIN txt_book AS tbook ON tbook.name=jbook.name")
    sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook , txt_book AS tbook "
                              "WHERE jbook.name=tbook.name ORDER BY tbook.price")

    bookMap = sql_book.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))

    for book in bookMap.collect():
        print book[0], book[1], book[2], book[3], book[4]

    sc.stop()
Author: imperio-wxm, Project: spark-by-python, Lines: 32, Source: mul_data_sources.py

Example 14: SQLContext

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
    sqlContext = SQLContext(sc)

    # path to hillary/enron avro
    enr = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
    hil = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

    # register tables
    sqlContext.registerDataFrameAsTable(hil, "hillary")
    sqlContext.registerDataFrameAsTable(enr, "enron")

    # register udf
    sqlContext.registerFunction(
        "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
    )

    # do the cosine similarity on the text, get the top 1000 matches
    out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                         "e.contents e_mail, h.contents h_mail, "
                         "getCos(e.contents, h.contents) as cos_sim "
                         "from hillary as h join enron as e order by cos_sim "
                         "desc limit 1000")

    # write back out to s3
    out.save("s3n://datasets-396316040607/cos_sim/", format="json")
Author: JasonSanchez, Project: email-like-enron, Lines: 31, Source: cross_product.py

Example 15: StructField

# Required import: from pyspark import SQLContext [as alias]
# Or: from pyspark.SQLContext import sql [as alias]
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True)
    ]
    schema = StructType(fields)

    rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup != None)
    # rdd1.foreach(printx)
    # sc.stop()

    ret_df = sqlContext.createDataFrame(rdd1, schema)
    ret_df.registerTempTable("loginflowlog_overall")
    _sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \
           "from loginflowlog_overall " \
           "group by logtype"
    rs_df = sqlContext.sql(_sql)

    service = LoginflowlogMysqlService()
    ret_overall_list = service.getRetOverall(rs_df.collect(), day)
    _sql_delete = "delete from login_flow_global_count where date ='%s'" % day
    _sql_insert = "insert into login_flow_global_count(date," \
                  "prelogin_num,prelogin_pnum,login_num,login_pnum," \
                  "login_click_num,login_click_pnum,forward_num,forward_pnum," \
                  "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
    logger.info(len(ret_overall_list))

    # detail
    rdd2 = rdd.map(convert_list).filter(lambda tup: tup != None) \
        .map(convert_copy_logtype).values().flatMap(list)
Author: wangcunxin, Project: spark_py, Lines: 33, Source: loginflowlog2mysql_update.py


Note: the pyspark.SQLContext.sql method examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and distribution and use must follow the corresponding projects' licenses. Do not reproduce without permission.