

Python HiveContext.cacheTable Method Code Examples

This article collects typical usage examples of the pyspark.sql.HiveContext.cacheTable method in Python. If you have been wondering exactly how HiveContext.cacheTable is used, how to call it, or what real examples look like, the curated code examples below should help. You can also explore further usage examples of the class it belongs to, pyspark.sql.HiveContext.


The following shows 8 code examples of the HiveContext.cacheTable method, sorted by popularity by default.
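Before the examples, here is a minimal, self-contained sketch of the typical cacheTable workflow against the Spark 1.x HiveContext API. The file name and table name are placeholders chosen for this sketch, not taken from any of the examples below.

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="cache_table_sketch")
hiveCtx = HiveContext(sc)

# Load a DataFrame and expose it to SQL under a temporary table name
# ("people.json" and "people" are placeholder names for this sketch).
df = hiveCtx.read.json("people.json")
df.registerTempTable("people")

# cacheTable marks the table for in-memory columnar caching; the cache is
# populated lazily, on the first query that actually scans the table.
hiveCtx.cacheTable("people")
hiveCtx.sql("SELECT COUNT(*) FROM people").show()

# Release the cached data when it is no longer needed.
hiveCtx.uncacheTable("people")
sc.stop()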

Example 1: ch9_sql

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements 
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)

    input_file = hiveCtx.read.json("testweet.json")
    # Register the input_file schema RDD 
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
      tweets ORDER BY retweetCount LIMIT 10""")

    topTweetText = topTweets.map(lambda row: row.text)  
    topTweetText.collect()

    topTweets.schema
    hiveCtx.cacheTable("tweets")
Author: jichen3000, Project: codes, Lines: 22, Source file: interactive.py

Example 2: query12_input

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

#    web_sales_sql = "select * from web_sales"
#    web_sales = sqlContext.sql(web_sales_sql)
#    web_sales.persist()
#    web_sales.registerAsTable("web_sales")
#    item_sql = "select * from item"
#    item = sqlContext.sql(item_sql)
#    item.persist()
#    item.registerAsTable("item")
#    date_dim_sql = "select * from date_dim"
#    date_dim = sqlContext.sql(date_dim_sql)
#    date_dim.persist()
#    date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")

    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    # describe() returns a summary DataFrame; show() only prints it and returns None
    output['describe'] = output['output'].describe()
    output['describe'].show()

    sc.stop()
    return output
Author: fmacias64, Project: big-data-system, Lines: 34, Source file: question2_pyspark.py

Example 3: convert

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
# (Snippet begins mid-file: the enclosing script creates the HiveContext `hc` and
#  registers the "temp_source" table before this point; the leading docstring is
#  truncated here by the snippet extractor.)


def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")


hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()

printRows(datas)

datas = hc.sql("select col1 from temp_mytable").collect()

printRows(datas)

# hc.uncacheTable("temp_mytable")
Author: Leaderman, Project: pyspark, Lines: 33, Source file: spark_sql_cache.py

Example 4: COUNT

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
sqlContext.sql('set spark.sql.shuffle.partitions=%i' % numPartitions)


#comments = sqlContext.read.json('data/test/*/')
#comments = sqlContext.read.json('data/micro_fake.json')
comments = sqlContext.read.json('s3n://%s:%[email protected]/micro_fake.json' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%[email protected]/test/*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%[email protected]/comments/2007/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%[email protected]/comments/200*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%[email protected]/comments/*/*' % (acc, sec))

polcomments = comments.filter(comments.subreddit=='politics')

polcomments2 = polcomments.repartition(numPartitions)
polcomments2.registerTempTable('comments')
sqlContext.cacheTable('comments')

# Removed when filtering to single subreddit
# COLLECT_LIST(subreddit) AS subreddits,
# COUNT(DISTINCT(subreddit)) AS total_subreddits,

user_pivot = sqlContext.sql('''SELECT
                            author,
                            MIN(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS first_post_datetime,
                            MAX(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS last_post_datetime,
                            COLLECT_LIST(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS post_datetimes,
                            COLLECT_LIST(id) AS post_ids,
                            COUNT(*) AS total_posts
                       FROM comments
                       GROUP BY author''')
#user_pivot2 = user_pivot.repartition(numPartitions)
Author: boazgurdin, Project: reddit-churn, Lines: 33, Source file: pivot_partitioned.py

Example 5: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
# coding=utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile("/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/").map(lambda line: line.split(
    ",")).filter(lambda words: len(words) >= 3).map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

hc.cacheTable("source")

hc.sql("select count(*) from source").collect()

hc.sql("select col2, max(col3) from source group by col2").collect()

hc.sql("select col3, min(col2) from source group by col3").collect()

# hc.uncacheTable("source")

sc.stop()
Author: Leaderman, Project: pyspark, Lines: 32, Source file: spark_sql_cache_table_extend.py

Example 6: main

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]

#......... part of this function's code is omitted here .........
                 ) cs_or_ws_sales,
                 item,
                 date_dim,
                 customer
         where   sold_date_sk = d_date_sk
                 and item_sk = i_item_sk
                 and i_category = 'Jewelry'
                 and i_class = 'football'
                 and c_customer_sk = cs_or_ws_sales.customer_sk
                 and d_moy = 3
                 and d_year = 2000
                 group by  c_customer_sk
                , c_current_addr_sk
         )
         , my_revenue as (
         select c_customer_sk,
                sum(ss_ext_sales_price) as revenue
         from   my_customers,
                store_sales,
                customer_address,
                store,
                date_dim
         where  c_current_addr_sk = ca_address_sk
                and ca_county = s_county
                and ca_state = s_state
                and ss_sold_date_sk = d_date_sk
                and c_customer_sk = ss_customer_sk
                and d_month_seq between (1203)
                                   and  (1205)
         group by c_customer_sk
         )
         , segments as
         (select cast((revenue/50) as int) as segment
          from   my_revenue
         )
          select  segment, count(*) as num_customers, segment*50 as segment_base
         from segments
         group by segment
         order by segment, num_customers
         limit 100
         """

    # cache runs
    if run == 2 or run == 3:
        # tables in query12 used for collecting stats
        hc.cacheTable('web_sales')
        hc.cacheTable('item')
        hc.cacheTable('date_dim')

        # to circumvent lazy computation and force cache, we run a query
        # that involves the above cached tables
        if run == 2:
            # we will avoid running query12 now since we want to run
            # it below and collect stats
            # Instead, we run query54 which involves all the above 3
            # cached tables
            df = hc.sql(query54)

        # to force the caching of the outputRDD
        elif run == 3:
            # running the same query used to collect stats: query12
            # since we want to cache the output
            df = hc.sql(query12)
            df.cache()

        df.show()
        time.sleep(120)

    # record stats before starting
    nw_before = map(get_network_bytes, hosts)
    st_before = map(get_storage_bytes, hosts)
    time_before = time.time()

    # actually run the query to collect statistics
    hc.sql(query12).show()

    # record stat after completion
    time_after = time.time()
    nw_after = map(get_network_bytes, hosts)
    st_after = map(get_storage_bytes, hosts) 

    # calculate the difference in stats
    nw_read_hosti = 0
    nw_write_hosti = 0
    st_read_hosti = 0
    st_write_hosti = 0
    for i in range(len(hosts)):
        nw_read_hosti += nw_after[i][0] - nw_before[i][0]
        nw_write_hosti += nw_after[i][1] - nw_before[i][1]
        st_read_hosti += st_after[i][0] - st_before[i][0]
        st_write_hosti += st_after[i][1] - st_before[i][1]

    # output the stats
    print time_after - time_before
    print bytes_to_mb(nw_read_hosti)
    print bytes_to_mb(nw_write_hosti)
    print bytes_to_mb(st_read_hosti)
    print bytes_to_mb(st_write_hosti)

    sc.stop()
Author: rohithvsm, Project: spark_exercises, Lines: 104, Source file: spark_tpcds_persistence.py
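A note on the warm-up pattern in Example 6: cacheTable is lazy, so the tables are only materialized in memory when a query scans them, which is why the example issues a query (and sleeps) before measuring. Below is a minimal sketch of that pattern; it assumes a Hive table named web_sales is already available (as in the TPC-DS database used by Examples 2 and 6).

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="cache_warmup_sketch")
hc = HiveContext(sc)

hc.cacheTable("web_sales")                        # only marks the table; nothing is read yet
hc.sql("SELECT COUNT(*) FROM web_sales").show()   # the first scan materializes the in-memory cache
hc.sql("SELECT * FROM web_sales LIMIT 5").show()  # subsequent queries are served from the cache

sc.stop()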

Example 7: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row
import random

conf = SparkConf().setAppName("spark_sql_cache_table")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

dataRDD = sc.textFile("hdfs://dip.cdh5.dev:8020/user/yurun/datas").map(lambda line: line.split(
    "\t")).map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))

sourceRDD = hc.inferSchema(dataRDD)

sourceRDD.registerAsTable("source")

cacheRDD = hc.sql("select * from source where col1 = 'col1_50'")

cacheRDD.registerAsTable("cacheTable")

hc.cacheTable("cacheTable")

hc.sql("select col2, max(col3) from cacheTable group by col2").collect()

hc.sql("select col3, min(col2) from cacheTable group by col3").collect()

# hc.uncacheTable("cacheTable")

sc.stop()
Author: Leaderman, Project: pyspark, Lines: 32, Source file: spark_sql_cache_table.py

Example 8: SparkEngine

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
import os

from pyspark.sql import HiveContext, SQLContext
from pyspark.sql.functions import lit


class SparkEngine(object):
    def __init__(self, sc, debug=False):
        self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
        self.sc = sc
        # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
        self.hive_cxt = HiveContext(sc)
        self.sql_cxt  = SQLContext(sc)
        if debug:
            print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
            print sc._conf.getAll()
        #TBD destructor Unpersist memory

### functionality to query and create tables
    def _create_df_table(self, schema, frame, name):
        if schema: df = self.hive_cxt.createDataFrame(frame, schema=schema)
        else: df = self.hive_cxt.createDataFrame(frame)
        df.printSchema()
        df.registerTempTable(name)
        self.hive_cxt.cacheTable(name)
        return df

    def _query_temporal_data(self):
        # step 1. create main temporal table
        # n_obs => first join causes for each pen entry * num location entries existent (dependent on time period)
        samples_temporal_tb = self.hive_cxt.sql("""
            SELECT  s.refers_to_object_id, created, visit_raw,
                    room_floor, room_id, room_name,
                    spot_id, spot_name, spot_description,
                    room_count_objects, room_count_spots, spot_count_objects,
                    abs(datediff(
                        from_utc_timestamp(from_unixtime(created,   "yyyy-MM-dd"), 'US/Eastern'),
                        from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
                    )) as delta
            FROM samples s
            JOIN temporal t
            ON s.refers_to_object_id = t.refers_to_object_id
            ORDER by s.refers_to_object_id, created, delta
        """)
        samples_temporal_tb.registerTempTable('samplestemporal')
        self.hive_cxt.cacheTable('samplestemporal')
        return samples_temporal_tb

    def _minimize_query(self):
        # From the temporal table, we need minimize the location (multiple locations) to the appropriate sample timestamp
        tb_samples = self.hive_cxt.sql("""
            SELECT *
            FROM (
                SELECT *,
                MIN(delta)   OVER ( PARTITION BY refers_to_object_id, created) AS min_delta,
                row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks
                FROM samplestemporal st
                ORDER BY refers_to_object_id
            ) query
            where query.ranks = 1
        """)
        tb_samples = tb_samples.withColumn("meta_store", lit(1))
        tb_samples.registerTempTable('minimizedsamples')
        self.hive_cxt.cacheTable('minimizedsamples')
        return tb_samples

    def execute_query(self, (samples_schema,  samples_frame, samples_name),
                                        (temporal_schema, temporal_frame, temporal_name),
                                        cols):
        self.df_samples       = self._create_df_table(samples_schema,  samples_frame,  samples_name)
        self.df_temporal      = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
        self.tb_meta          = self._query_temporal_data()
        self.tb_meta_min      = self._minimize_query()
        # combine to the original pen data (meta_store indicates if we had object data to integrate)
        self.df_samplesmeta   = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
        self.df_samplesmeta   = self.df_samplesmeta.fillna({'meta_store': 0})
        self.df_samplesmeta.printSchema()
        # pickle file to pandas: alternatively we can store as a json or parquet columnar format
        dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
        samplesmeta_pd  = self.df_samplesmeta.toPandas()
        samplesmeta_pd  = samplesmeta_pd.drop(dropped_cols, axis=1)
        samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
Author: akamlani, Project: cooperhewitt, Lines: 78, Source file: ch_spark.py


Note: The pyspark.sql.HiveContext.cacheTable examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective authors, and copyright in the source code remains with the original authors. For distribution and use, please follow the License of the corresponding project; do not reproduce without permission.