

Python HiveContext.sql Method Code Examples

This article collects typical usage examples of the pyspark.sql.HiveContext.sql method in Python. If you are wondering how to use HiveContext.sql in practice, or are looking for concrete examples of it, the curated code samples below may help. You can also explore further usage examples of the enclosing class, pyspark.sql.HiveContext.


The following presents 15 code examples of the HiveContext.sql method, sorted by popularity by default.
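Before the individual examples, here is a minimal sketch of the pattern they all share, using the Spark 1.x HiveContext API; the database and table names are placeholders and are not taken from any example below.

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="hivecontext_sql_demo")
hiveCtx = HiveContext(sc)

# HiveQL statements are passed as strings; the result comes back as a DataFrame
# (a SchemaRDD in very old Spark releases).
hiveCtx.sql("USE default")
df = hiveCtx.sql("SELECT * FROM some_table LIMIT 10")
df.show()

sc.stop()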

Example 1: get_context_test

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
Developer: hongbin0908, Project: bintrade, Lines: 9, Source: index.py

Example 2: get_context

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext


def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    # the original passed the literal string "__file__"; __file__ is presumably what was intended
    sc = SparkContext(appName=__file__, conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
Developer: hongbin0908, Project: bintrade, Lines: 12, Source: index.py

Example 3: read_csv

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
import uuid

import pandas as pd
from pyspark.sql import HiveContext


def read_csv(sc, file_name, sep=",", storage="hive://", header=True,
             names=None, table_name=None, infer_limit=10000):
    # DataFrame in the return statement is the project's own wrapper class, not pyspark.sql.DataFrame
    table_name = table_name if table_name is not None else "df" + str(uuid.uuid4())
    hc = HiveContext(sc)
    df = pd.read_csv(file_name, sep=sep, nrows=infer_limit)
    names = df.columns if not names else names
    types = []
    for i in range(len(names)):
        tp = names[i] + " "
        if df.dtypes[i] == "O":
            tp += "STRING"
        elif df.dtypes[i] == "int64":
            tp += "INT"
        else:
            tp += "DOUBLE"
        types.append(tp)
    hc.sql('drop table if exists %s' %table_name)
    qw = """CREATE TABLE IF NOT EXISTS %s (%s) row format delimited fields terminated by '%s'
LINES TERMINATED BY '\n'""" %(table_name, ','.join(types), sep)
    if header:
        qw += " tblproperties ('skip.header.line.count'='1')"
    hc.sql(qw)
    hc.sql("LOAD DATA LOCAL INPATH '%s' OVERWRITE INTO TABLE %s" %(file_name, table_name))
    rdd = hc.sql("SELECT * FROM %s" %table_name)
    ctx = hc
    if storage.startswith("parquet://"):
        path = storage.replace("parquet://", "")
        rdd.saveAsParquetFile("%s/%s" %(path, table_name))
        sq = HiveContext(sc)
        rdd = sq.parquetFile("%s/%s" %(path, table_name))
        rdd.registerTempTable(table_name)
        rdd = sq.sql("select * from %s" %table_name)
        ctx = sq
    return DataFrame(ctx, table_name, data=rdd, columns=names, dtype=types)
Developer: lmatthieu, Project: Ydata, Lines: 36, Source: parsers.py
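Since read_csv above is a helper defined by the project rather than part of PySpark, a hedged usage sketch may help; the SparkContext setup and the CSV path below are assumptions for illustration only.

from pyspark import SparkContext

sc = SparkContext("local[1]", "read_csv_demo")
# Infers column types from the first infer_limit rows via pandas, creates a Hive table,
# loads the file into it, and returns the project's DataFrame wrapper.
df = read_csv(sc, "/tmp/example.csv", sep=",", header=True, table_name="example_csv")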

Example 4: main

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
from pyspark import SparkContext
from pyspark.sql import HiveContext, DataFrameWriter


def main():
    # {{sql}} and {{tableName}} below are presumably template placeholders filled in before the job is submitted
    sc = SparkContext()
    hc = HiveContext(sc)

    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}',
                          format='json',
                          mode='overwrite',
                          path='s3://data/{{tableName}}')
Developer: tgknight, Project: aws-sdk-hands-on, Lines: 12, Source: job.py

Example 5: query12_no

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
from pyspark import SparkContext
from pyspark.sql import HiveContext


def query12_no(query_name, conf=None):
    # execute_sql is a helper defined elsewhere in the source file
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
Developer: fmacias64, Project: big-data-system, Lines: 15, Source: question2_pyspark.py

Example 6: run

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
    def run(self):
        sc = SparkContext("local", "Course Activity")
        # sqlHC is the SQL HiveContext
        sqlHC = HiveContext(sc)

        lines = sqlHC.sql("""select courseName,lmsUserId,createDateTime,
                             eventType,eventName,eventNo from logdata where
                             eventType not in ('enrollment','instructor','admin')
                             and lmsUserId is not NULL
                             and courseName is not NULL
                             and eventNo is not NULL limit 10""")

        maplvl1 = lines.flatMap(lambda p: mapp(p[0], str(p[1]), p[2].strftime('%Y-%m-%d'), p[4]))
        reduceRDD = maplvl1.reduceByKey(lambda a, b: a + b)
        with self.output().open('w') as out_file:
            for line in reduceRDD.collect():
                out_file.write(line[0][0] + "\x01" + line[0][1] + "\x01" + line[0][2] + "\x01" + line[0][3] + "\x01" + str(line[1]) + "\n")
Developer: Zarana-Parekh, Project: analytics, Lines: 20, Source: course_activity.py

Example 7: query12_input

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
from pyspark import SparkContext
from pyspark.sql import HiveContext


def query12_input(query_name, conf=None, output_persist=False):
    # execute_sql is a helper defined elsewhere in the source file
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

#    web_sales_sql = "select * from web_sales"
#    web_sales = sqlContext.sql(web_sales_sql)
#    web_sales.persist()
#    web_sales.registerAsTable("web_sales")
#    item_sql = "select * from item"
#    item = sqlContext.sql(item_sql)
#    item.persist()
#    item.registerAsTable("item")
#    date_dim_sql = "select * from date_dim"
#    date_dim = sqlContext.sql(date_dim_sql)
#    date_dim.persist()
#    date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")

    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
Developer: fmacias64, Project: big-data-system, Lines: 34, Source: question2_pyspark.py

Example 8: ch9_sql

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements 
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)

    input_file = hiveCtx.read.json("testweet.json")
    # Register the input_file schema RDD 
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
      tweets ORDER BY retweetCount LIMIT 10""")

    topTweetText = topTweets.map(lambda row: row.text)  
    topTweetText.collect()

    topTweets.schema
    hiveCtx.cacheTable("tweets")
Developer: jichen3000, Project: codes, Lines: 22, Source: interactive.py

Example 9: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
        print "Usage: spark-submit <Python Code File>"
        sys.exit(1)

    #App name which shows up in the Spark UI
    sc = SparkContext(appName='User Recommendation')


    #Context provides connection to Hive metastore
    sqlContext = HiveContext(sc)
    

    '''
    Pulling data out of Hive. I created a replication of the 'watson_bisum_purchases' table locally to test.
    '''
    
    rdd = sqlContext.sql("SELECT person_id,deal_id,aasm_state FROM watson_bisum_purchases")


    '''
    Creating datasets. Formatting the data and also creating sample datasets in order to create and test the model.
    '''

    # Formatting all the data using the 'parse_rating' method defined earlier in the file (not shown in this excerpt)
    all_data = rdd.map(parse_rating)
    rec_list = sc.parallelize(all_data.collect())


    #Grabbing all Unique Users(used for building recommendation list)
    users = rdd.groupBy(lambda x: x.person_id).map(lambda x: x[0]).collect()

    #Grabbing all Unique Deals/Products(used for building recommendation list)
Developer: rodyou, Project: Spark_Examples, Lines: 33, Source: collaborative_filtering.py

Example 10: main

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'),
                            level=logging.INFO,
                            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # NSJOIN dayidx # only partitioned by DAY
    day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1]
    # BAREBONES dayidx # only partitioned by DAY
    day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
    # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
    mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
    [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

    if day_idx != day_mps:
        logger.error('mapper.mappoints and mapper.nsjoin different day, possible data missing in the source.')
        return

    if len(day_bb) == 0:
        logger.warning('mapper.barebone data missing for this particular day.')
        #return

    logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

    logger.info('begin spark process.')
    getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)
    geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx
    geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    rows = hiveCtx.sql(getting_mappoint_data)

    regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
    geo_total_cap = hiveCtx.sql(geo_total_cap_query)
    geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)


    # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
    region_mpginfo_pair = rows.map(lambda x: [[x.mpgid,
                                               x.lat,
                                               x.lon,
                                               x.country,
                                               x.mpgload,
                                               x.asnum,
                                               x.ip], x.allowed_private_regions])\
                                .flatMapValues(lambda x: x).map(lambda x: [x[1], x[0]])

    #region_mpginfo_pair.first()

    # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(bit mfps), reg-country, reg-numvips, reg-service, reg-prp]]
    # ps. prp=1: private, prp=0: public
    region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude,
                                                          x.longitude,
                                                          x.peak_bitcap_mbps,
                                                          x.peak_flitcap_mfps,
                                                          x.country,
                                                          x.numvips,
                                                          'W' if x.network=='freeflow' else ('S' if x.network=='essl' else 'O'),
                                                          1 if x.prp=='private' else 0]])\
                                .filter(lambda x: x[1][6]=='W' or x[1][6]=='S')

    region_public_list = region_latlon\
        .filter(lambda x: x[1][7] == 0)\
        .map(lambda x: ('all', [[x[0]]]))\
        .reduceByKey(lambda a, b: [a[0]+b[0]])\
        .map(lambda x: x[1][0]).collect()

    region_public_list = [0] + sorted(region_public_list[0])

    # dummy region
    rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])])
    region_latlon = region_latlon.union(rdd2)

    # perform the join into tuple of (K, (V1, V2):
    # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service]))
    # rdd  = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance],
    #               reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services,
    #               mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #               mpg-lat, mpg-lon)
    mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(lambda x: [x[1][0][0],
                                                                           x[0],
                                                                           geodesic_distance(x[1][0][1],
                                                                                             x[1][0][2],
                                                                                             x[1][1][0],
                                                                                             x[1][1][1]),
                                                                           round(float(x[1][1][2])/1000.0, 3),
                                                                           round(float(x[1][1][3])/1000.0, 3),
                                                                           x[1][1][4], # reg-country
                                                                           x[1][1][5], # reg-numvips
                                                                           x[1][1][6], # reg-services
                                                                           x[1][0][3],
                                                                           x[1][0][4],
                                                                           x[1][0][5],
                                                                           x[1][0][6],
                                                                           x[1][0][1],
                                                                           x[1][0][2]])
#.........part of the code omitted here.........
Developer: YuTengChang, Project: akam_mrqos, Lines: 103, Source: mpgCluster_spark.py

Example 11: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
from pyspark import SparkContext
sc = SparkContext("local", "best_hospitals")

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

# Select the top 10 hospitals by average normalized score
# Note that hospitals not qualified for evaluation are filtered out
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by Q.providerid \
order by avgscore DESC").limit(10)

# Join with hospitals_qualified to get the hospital name and state
# Note: couldn't figure out how to do it in the above select statement (together with Group By) in one-shot! :-(
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
    select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)

df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())

# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")

print
print "Top 10 hospitals"
print
rank = 1
Developer: patng323, Project: w205-ex1, Lines: 33, Source: best_hospitals.py

Example 12: PARTITION

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
# F is pyspark.sql.functions and hive_ctx is a HiveContext, both set up earlier in the script
df_final = df_add_year.withColumn('Load_date', F.current_date())

# repartition() returns a new DataFrame, so the result must be reassigned
df_final = df_final.repartition(10)

# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")

# Target Type: APACHE HIVE
# Database   : EMPLOYEES
# Table Name : EMPLOYEE_DIM
# + ------------------------------- +
# | COLUMN NAME| TYPE   | PARTITION |
# + ------------------------------- +
# | EMP_NO     | INT    |           |
# | BIRTH_DATE | DATE   |           |
# | FIRST_NAME | STRING |           |
# | LAST_NAME  | STRING |           |
# | GENDER     | STRING |           |
# | HIRE_DATE  | DATE   |           |
# | SALARY     | INT    |           |
# | FROM_DATE  | DATE   |           |
# | TO_DATE    | DATE   |           |
# | YEAR       | INT    | PRIMARY   |
# | LOAD_DATE  | DATE   | SUB       |
# + ------------------------------- +
# Storage Format: ORC

# Inserting data into the Target table
hive_ctx.sql("INSERT OVERWRITE TABLE EMPLOYEES.EMPLOYEE_DIM PARTITION (year, Load_date) \
            SELECT EMP_NO, BIRTH_DATE, FIRST_NAME, LAST_NAME, GENDER, HIRE_DATE, \
            SALARY, FROM_DATE, TO_DATE, year, Load_date FROM EMP_TEMP")
Developer: uday07, Project: Spark-ETL, Lines: 33, Source: mysql_to_hive_etl.py
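The INSERT above assumes the partitioned ORC target table already exists and that dynamic partitioning is enabled. A hedged sketch of that one-time setup, derived from the comment block in the snippet (the DDL and SET statements are assumptions, not taken from the original project):

# Hypothetical setup for the target table described in the comment block above.
hive_ctx.sql("SET hive.exec.dynamic.partition=true")
hive_ctx.sql("SET hive.exec.dynamic.partition.mode=nonstrict")
hive_ctx.sql("""
    CREATE TABLE IF NOT EXISTS EMPLOYEES.EMPLOYEE_DIM (
        EMP_NO     INT,
        BIRTH_DATE DATE,
        FIRST_NAME STRING,
        LAST_NAME  STRING,
        GENDER     STRING,
        HIRE_DATE  DATE,
        SALARY     INT,
        FROM_DATE  DATE,
        TO_DATE    DATE
    )
    PARTITIONED BY (YEAR INT, LOAD_DATE DATE)
    STORED AS ORC
""")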

Example 13: len

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
# Creates a Hive table and loads an input file into it
# For input you can use examples/src/main/resources/kv1.txt from the spark
# distribution
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Load some data into hive
    hiveCtx.sql(
        "CREATE TABLE IF NOT EXISTS " +
        inputTable +
        " (key INT, value STRING)")
    hiveCtx.sql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)
Developer: 153485062, Project: learning-spark, Lines: 26, Source: MakeHiveTable.py
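As a follow-up that is not part of the original snippet, the same sql method could read the loaded rows back inside the same __main__ block, for example:

    rows = hiveCtx.sql("SELECT key, value FROM " + inputTable + " LIMIT 10").collect()
    for row in rows:
        print row[0], row[1]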

Example 14: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"),)])

schema = StructType([StructField("struct", StructType([StructField("first", IntegerType(), False), StructField(
    "second", FloatType(), False), StructField("third", StringType(), False)]), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table").collect()

sc.stop()

for row in rows:
    print row
Developer: Leaderman, Project: pyspark, Lines: 30, Source: spark_sql_datatype_struct.py

Example 15: dict

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import sql [as alias]
		would be stored in the Hive tables.
		
Step 4.	Then do a mapreduce to find the total number of count
		and timeAccess to the video and the videoFrame, and
		write it to the MySQL Analytics summary table.
"""


"""
Step 1. Getting the video data from Hive.
"""

sqlVideo = (
    "SELECT orgname,coursename, videosysname, videolength, videoTitle FROM coursevideos where videosysname is not null"
)
videoslist = dict(sqlContext.sql(sqlVideo).map(lambda v: ((v[0], v[1], v[2]), (v[-2], v[-1]))).collect())
# unless you actually collect() the data from the RDD, you can't operate on it.
# So, to actually use the data, call a take(x) method or a collect() method on the RDD before you start.
# otherwise, to use RDD functions, DO NOT OPERATE ON IT IMMEDIATELY. it will all be done when the data is finally collected.

# 1. To iterate over the data like a list, do a collect or take function first.
# 2. To access the elements of a Row type object (result of collect), use the normal subscripts. That works just fine.
# But here, to make it easier, we decided to change the Rows to tuples in a dictionary so that they can be accessed via moduleSysName or videoSysName

# for video in videoslist:
# 	print video, videoslist[video][0], videoslist[video][1]


"""
Step 2. Getting the event data from Hive.
"""
Developer: qjyzwlz, Project: big_data_analysis, Lines: 33, Source: videoDifficultySpark.py


Note: The pyspark.sql.HiveContext.sql method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by various developers, and copyright of the source code belongs to the original authors. Please refer to each project's license before distributing or using the code; do not reproduce this article without permission.