This article collects typical usage examples of the Python method pyspark.sql.HiveContext.cacheTable. If you are unsure what HiveContext.cacheTable does or how to call it, the curated examples below should help. You can also read more about the containing class, pyspark.sql.HiveContext.
Eight code examples of HiveContext.cacheTable are shown below, sorted by popularity by default.
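As a primer before the examples, here is a minimal sketch of the cacheTable lifecycle; the app name, file path, and table name are illustrative, not taken from any example below.

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="cache_table_demo")
hiveCtx = HiveContext(sc)
df = hiveCtx.read.json("people.json")
df.registerTempTable("people")
hiveCtx.cacheTable("people")  # marks the table for in-memory caching (lazy)
hiveCtx.sql("SELECT COUNT(*) FROM people").collect()  # first action materializes the cache
hiveCtx.uncacheTable("people")  # releases the cached data
sc.stop()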
Example 1: ch9_sql
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or, if you can't include the Hive requirements:
    from pyspark.sql import SQLContext, Row
    hiveCtx = HiveContext(sc)  # sc is an existing SparkContext
    input_file = hiveCtx.read.json("testweet.json")
    # Register input_file as a temporary table
    input_file.registerTempTable("tweets")
    # Select tweets by retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
        tweets ORDER BY retweetCount LIMIT 10""")
    topTweetText = topTweets.map(lambda row: row.text)
    topTweetText.collect()
    topTweets.schema
    hiveCtx.cacheTable("tweets")
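Note that cacheTable is lazy: it only marks the table, and the cache is materialized by the next action over it. A short follow-up sketch, continuing inside ch9_sql (isCached is part of the same context API):

hiveCtx.sql("SELECT COUNT(*) FROM tweets").collect()  # forces the cache to materialize
print(hiveCtx.isCached("tweets"))  # True once the table has been marked as cached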
Example 2: query12_input
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    # SQL statements can be run via the sql method provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)
    # web_sales_sql = "select * from web_sales"
    # web_sales = sqlContext.sql(web_sales_sql)
    # web_sales.persist()
    # web_sales.registerAsTable("web_sales")
    # item_sql = "select * from item"
    # item = sqlContext.sql(item_sql)
    # item.persist()
    # item.registerAsTable("item")
    # date_dim_sql = "select * from date_dim"
    # date_dim = sqlContext.sql(date_dim_sql)
    # date_dim.persist()
    # date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")
    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe()  # show() returns None, so keep the DataFrame
    output['describe'].show()
    sc.stop()
    return output
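The execute_sql helper is not shown in this excerpt. A plausible minimal sketch, assuming each query's text lives in a .sql file named after it (every name here is hypothetical):

import time

def execute_sql(query_name, sqlContext, output_persist=False):
    # Hypothetical helper -- the real implementation is not part of this excerpt.
    with open(query_name + ".sql") as f:
        query_text = f.read()
    start = time.time()
    df = sqlContext.sql(query_text)
    if output_persist:
        df.persist()
    df.show()  # trigger execution
    return {'output': df, 'elapsed': time.time() - start}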
Example 3: convert
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
# hc is an existing HiveContext and "temp_source" a previously registered table (not shown in this excerpt)
mytable.registerTempTable("temp_mytable")

def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)
convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")
convertRDD.registerAsTable("temp_mytable")
hc.cacheTable("temp_mytable")

def printRows(rows):
    for row in rows:
        print(row)

datas = hc.sql("select * from temp_mytable").collect()
printRows(datas)
datas = hc.sql("select col1 from temp_mytable").collect()
printRows(datas)
# hc.uncacheTable("temp_mytable")
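For reference, the same flow expressed against the Spark 2.x+ SparkSession API, a sketch mirroring the names above (HiveContext, registerFunction, and registerAsTable were deprecated in favor of these calls):

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.udf.register("temp_convert", lambda val: val.upper())
converted = spark.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")
converted.createOrReplaceTempView("temp_mytable")
spark.catalog.cacheTable("temp_mytable")
for row in spark.sql("select * from temp_mytable").collect():
    print(row)
spark.catalog.uncacheTable("temp_mytable")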
Example 4: COUNT
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
# sqlContext, acc, sec, and numPartitions are defined earlier (not shown in this excerpt);
# the S3 bucket name below was redacted by the source page, so <bucket> is a placeholder
sqlContext.sql('set spark.sql.shuffle.partitions=%i' % numPartitions)
#comments = sqlContext.read.json('data/test/*/')
#comments = sqlContext.read.json('data/micro_fake.json')
comments = sqlContext.read.json('s3n://%s:%s@<bucket>/micro_fake.json' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@<bucket>/test/*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@<bucket>/comments/2007/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@<bucket>/comments/200*/*' % (acc, sec))
#comments = sqlContext.read.json('s3n://%s:%s@<bucket>/comments/*/*' % (acc, sec))
polcomments = comments.filter(comments.subreddit == 'politics')
polcomments2 = polcomments.repartition(numPartitions)
polcomments2.registerTempTable('comments')
sqlContext.cacheTable('comments')
# Removed when filtering to a single subreddit:
#   COLLECT_LIST(subreddit) AS subreddits,
#   COUNT(DISTINCT(subreddit)) AS total_subreddits,
user_pivot = sqlContext.sql('''SELECT
        author,
        MIN(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS first_post_datetime,
        MAX(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS last_post_datetime,
        COLLECT_LIST(CAST((FROM_UNIXTIME(INT(created_utc))) AS TIMESTAMP)) AS post_datetimes,
        COLLECT_LIST(id) AS post_ids,
        COUNT(*) AS total_posts
    FROM comments
    GROUP BY author''')
#user_pivot2 = user_pivot.repartition(numPartitions)
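Rather than embedding AWS credentials in the URL, they can be supplied through the Hadoop configuration; a sketch of that alternative (the bucket name remains a placeholder):

# Alternative: set the s3n credentials on the Hadoop configuration instead
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", acc)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", sec)
comments = sqlContext.read.json("s3n://<bucket>/micro_fake.json")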
Example 5: SparkConf
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
# coding=utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
dataRDD = sc.textFile("/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/") \
    .map(lambda line: line.split(",")) \
    .filter(lambda words: len(words) >= 3) \
    .map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))
sourceRDD = hc.inferSchema(dataRDD)
sourceRDD.registerAsTable("source")
hc.cacheTable("source")
hc.sql("select count(*) from source").collect()
hc.sql("select col2, max(col3) from source group by col2").collect()
hc.sql("select col3, min(col2) from source group by col3").collect()
# hc.uncacheTable("source")
sc.stop()
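The same caching can also be driven from SQL itself; unlike the lazy cacheTable call, the CACHE TABLE statement is eager by default (CACHE LAZY TABLE is the lazy variant). A sketch reusing the "source" table above:

hc.sql("CACHE TABLE source")  # eager: materializes the cache immediately
hc.sql("select count(*) from source").collect()  # served from the cache
hc.sql("UNCACHE TABLE source")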
Example 6: main
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
# ......... part of the code is omitted here .........
) cs_or_ws_sales,
item,
date_dim,
customer
where sold_date_sk = d_date_sk
and item_sk = i_item_sk
and i_category = 'Jewelry'
and i_class = 'football'
and c_customer_sk = cs_or_ws_sales.customer_sk
and d_moy = 3
and d_year = 2000
group by c_customer_sk
, c_current_addr_sk
)
, my_revenue as (
select c_customer_sk,
sum(ss_ext_sales_price) as revenue
from my_customers,
store_sales,
customer_address,
store,
date_dim
where c_current_addr_sk = ca_address_sk
and ca_county = s_county
and ca_state = s_state
and ss_sold_date_sk = d_date_sk
and c_customer_sk = ss_customer_sk
and d_month_seq between (1203)
and (1205)
group by c_customer_sk
)
, segments as
(select cast((revenue/50) as int) as segment
from my_revenue
)
select segment, count(*) as num_customers, segment*50 as segment_base
from segments
group by segment
order by segment, num_customers
limit 100
"""
    # cache runs
    if run == 2 or run == 3:
        # tables used by query12, for collecting stats
        hc.cacheTable('web_sales')
        hc.cacheTable('item')
        hc.cacheTable('date_dim')
        # to circumvent lazy computation and force the cache, we run a query
        # that involves the above cached tables
        if run == 2:
            # we avoid running query12 now, since we want to run it below
            # and collect stats; instead we run query54, which involves all
            # three cached tables
            df = hc.sql(query54)
        # to force the caching of the output RDD
        elif run == 3:
            # run the same query used to collect stats (query12),
            # since we want to cache its output
            df = hc.sql(query12)
            df.cache()
        df.show()
        time.sleep(120)
    # record stats before starting
    nw_before = map(get_network_bytes, hosts)
    st_before = map(get_storage_bytes, hosts)
    time_before = time.time()
    # actually run the query for collecting statistics
    hc.sql(query12).show()
    # record stats after completion
    time_after = time.time()
    nw_after = map(get_network_bytes, hosts)
    st_after = map(get_storage_bytes, hosts)
    # calculate the difference in stats
    nw_read_hosti = 0
    nw_write_hosti = 0
    st_read_hosti = 0
    st_write_hosti = 0
    for i in range(len(hosts)):
        nw_read_hosti += nw_after[i][0] - nw_before[i][0]
        nw_write_hosti += nw_after[i][1] - nw_before[i][1]
        st_read_hosti += st_after[i][0] - st_before[i][0]
        st_write_hosti += st_after[i][1] - st_before[i][1]
    # output the stats
    print(time_after - time_before)
    print(bytes_to_mb(nw_read_hosti))
    print(bytes_to_mb(nw_write_hosti))
    print(bytes_to_mb(st_read_hosti))
    print(bytes_to_mb(st_write_hosti))
    sc.stop()
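get_network_bytes and get_storage_bytes are not shown in this excerpt. A hypothetical sketch of the network one, reading the /proc/net/dev counters over ssh and returning a (bytes_received, bytes_sent) tuple:

import subprocess

def get_network_bytes(host):
    # Hypothetical helper -- the real implementation is not in the excerpt.
    out = subprocess.check_output(["ssh", host, "cat", "/proc/net/dev"])
    rx = tx = 0
    for line in out.splitlines()[2:]:  # skip the two header lines
        fields = line.split(":", 1)[1].split()
        rx += int(fields[0])  # received bytes for this interface
        tx += int(fields[8])  # transmitted bytes for this interface
    return (rx, tx)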
Example 7: SparkConf
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
from pyspark import SparkConf, SparkContext  # needed for the conf below
from pyspark.sql import HiveContext
from pyspark.sql.types import Row
import random

conf = SparkConf().setAppName("spark_sql_cache_table")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
dataRDD = sc.textFile("hdfs://dip.cdh5.dev:8020/user/yurun/datas") \
    .map(lambda line: line.split("\t")) \
    .map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))
sourceRDD = hc.inferSchema(dataRDD)
sourceRDD.registerAsTable("source")
cacheRDD = hc.sql("select * from source where col1 = 'col1_50'")
cacheRDD.registerAsTable("cacheTable")
hc.cacheTable("cacheTable")
hc.sql("select col2, max(col3) from cacheTable group by col2").collect()
hc.sql("select col3, min(col2) from cacheTable group by col3").collect()
# hc.uncacheTable("cacheTable")
sc.stop()
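An equivalent without a named table is to cache the derived DataFrame directly; a sketch mirroring the example above:

filtered = hc.sql("select * from source where col1 = 'col1_50'")
filtered.cache()  # caches the DataFrame itself rather than a named table
filtered.registerAsTable("cacheTable")
hc.sql("select col2, max(col3) from cacheTable group by col2").collect()
filtered.unpersist()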
Example 8: SparkEngine
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import cacheTable [as alias]
import os
from pyspark.sql import HiveContext, SQLContext
from pyspark.sql.functions import lit

class SparkEngine(object):
    def __init__(self, sc, debug=False):
        self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
        self.sc = sc
        # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
        self.hive_cxt = HiveContext(sc)
        self.sql_cxt = SQLContext(sc)
        if debug:
            print("{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt))
            print(sc._conf.getAll())
        # TBD: destructor to unpersist cached memory

    ### functionality to query and create tables
    def _create_df_table(self, schema, frame, name):
        if schema:
            df = self.hive_cxt.createDataFrame(frame, schema=schema)
        else:
            df = self.hive_cxt.createDataFrame(frame)
        df.printSchema()
        df.registerTempTable(name)
        self.hive_cxt.cacheTable(name)
        return df

    def _query_temporal_data(self):
        # step 1. create the main temporal table
        # n_obs => the first join yields, for each pen entry, all existing location entries (dependent on time period)
        samples_temporal_tb = self.hive_cxt.sql("""
            SELECT s.refers_to_object_id, created, visit_raw,
                   room_floor, room_id, room_name,
                   spot_id, spot_name, spot_description,
                   room_count_objects, room_count_spots, spot_count_objects,
                   abs(datediff(
                       from_utc_timestamp(from_unixtime(created, "yyyy-MM-dd"), 'US/Eastern'),
                       from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
                   )) as delta
            FROM samples s
            JOIN temporal t
              ON s.refers_to_object_id = t.refers_to_object_id
            ORDER BY s.refers_to_object_id, created, delta
        """)
        samples_temporal_tb.registerTempTable('samplestemporal')
        self.hive_cxt.cacheTable('samplestemporal')
        return samples_temporal_tb

    def _minimize_query(self):
        # From the temporal table, minimize the locations (multiple per sample)
        # down to the one closest to the sample timestamp
        tb_samples = self.hive_cxt.sql("""
            SELECT *
            FROM (
                SELECT *,
                       MIN(delta)   OVER (PARTITION BY refers_to_object_id, created) AS min_delta,
                       row_number() OVER (PARTITION BY refers_to_object_id, created) AS ranks
                FROM samplestemporal st
                ORDER BY refers_to_object_id
            ) query
            WHERE query.ranks = 1
        """)
        tb_samples = tb_samples.withColumn("meta_store", lit(1))
        tb_samples.registerTempTable('minimizedsamples')
        self.hive_cxt.cacheTable('minimizedsamples')
        return tb_samples

    def execute_query(self, (samples_schema, samples_frame, samples_name),
                      (temporal_schema, temporal_frame, temporal_name),
                      cols):  # tuple parameters are Python 2 syntax
        self.df_samples = self._create_df_table(samples_schema, samples_frame, samples_name)
        self.df_temporal = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
        self.tb_meta = self._query_temporal_data()
        self.tb_meta_min = self._minimize_query()
        # join back to the original pen data (meta_store indicates whether object data was integrated)
        self.df_samplesmeta = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
        self.df_samplesmeta = self.df_samplesmeta.fillna({'meta_store': 0})
        self.df_samplesmeta.printSchema()
        # pickle to pandas; alternatively store as json or the parquet columnar format
        dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
        samplesmeta_pd = self.df_samplesmeta.toPandas()
        samplesmeta_pd = samplesmeta_pd.drop(dropped_cols, axis=1)
        samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
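Since toPandas() collects the full result onto the driver, a larger export could instead stay distributed and be written as parquet; a sketch of that variant inside execute_query (the output path is illustrative):

# Parquet alternative to the pickle export above:
df = self.df_samplesmeta
for col in dropped_cols:
    df = df.drop(col)  # DataFrame.drop takes one column per call in Spark 1.x
df.write.parquet(self.export_path + "penmeta_spark.parquet")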