This article collects typical usage examples of the Python method pyspark.sql.HiveContext.table. If you have been wondering what HiveContext.table does, how to call it, or what it looks like in practice, the curated code examples below should help. You can also explore further usage examples of the containing class, pyspark.sql.HiveContext.
The examples below all demonstrate the HiveContext.table method, ordered by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
Example 1: table_schema_from_spark
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
def table_schema_from_spark(hcat_table_name):
    # Returns the schema of the table with this database.name in HCatalog
    # (a Spark workaround for as long as the HCatalog web API is unavailable).
    # Initialize Spark.
    import findspark
    findspark.init()
    import pyspark
    from pyspark.sql import HiveContext
    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath', '/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master', 'yarn-client')
    sc = pyspark.SparkContext(appName='ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)
    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    print(hive_schema)
    sc.stop()
    table_schema = {'columns': {}}
    col_sequence = 0
    for field in hive_schema['fields']:
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence, 'type': field['type']}
        col_sequence += 1
    return table_schema
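A minimal usage sketch (the table name here is hypothetical):
# 'default.my_table' is a made-up name; pass any existing Hive table as database.name.
schema = table_schema_from_spark('default.my_table')
for name, props in sorted(schema['columns'].items(), key=lambda kv: kv[1]['col_sequence']):
    print(name, props['type'])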
Example 2: SparkContext
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
sc = SparkContext("local", "best_hospitals")
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
# Select the top 10 hospitals by average normalized score.
# Note that we filter out hospitals not qualified for evaluation.
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by Q.providerid \
order by avgscore DESC").limit(10)
# Join with hospitals_qualified to get the hospital name and state.
# Note: I couldn't figure out how to do this inside the SELECT statement above
# (together with the GROUP BY) in one shot! :-( A one-shot alternative is sketched after this example.
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)
df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())
# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")
print()
print("Top 10 hospitals")
print()
rank = 1
for i in df_top10_hospitals_full.collect():
    # The excerpt ends at this loop header, so the body below is a reconstruction.
    print("{0}. {1} ({2}): {3}".format(rank, i.hospitalname, i.state, i.avgscore))
    rank += 1
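As an aside, the two-step join above can be folded into a single query by grouping on the name and state columns as well; a sketch, assuming the same table layout:
df_top10_one_shot = sqlContext.sql("""
    SELECT H.providerid AS id, H.hospitalname, H.state,
           AVG(Q.normalizedscore) AS avgscore
    FROM total_quality Q
    JOIN hospitals_qualified H ON Q.providerid = H.providerid
    WHERE Q.normalizedscore IS NOT NULL AND H.qualified = true
    GROUP BY H.providerid, H.hospitalname, H.state
    ORDER BY avgscore DESC""").limit(10)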
Example 3: dict
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# Imports assumed by this excerpt (not shown in the original); df_total_quality
# and sqlContext come from earlier context.
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, BooleanType
# Some hospitals have too few non-NA measures. For a fair ranking, we set a minimum
# bar on the number of non-NA measures a hospital needs to participate in the evaluation.
# For each hospital, count its non-NA measures.
nonNAMeasureCount = dict(
    df_total_quality.map(lambda r: (r.providerid, r.normalizedscore))
    .combineByKey(  # use combineByKey to count the non-NA measures
        lambda value: 0 if value is None else 1,
        lambda x, value: x if value is None else x + 1,
        lambda x, y: x + y)
    .collect())
# The 25th percentile of the non-NA measure counts becomes the minimum bar.
minMeasureCount = np.percentile(list(nonNAMeasureCount.values()), 25.)
df_hospitals = sqlContext.table("hospitals")
# For the purposes of the evaluation, keep only the hospitals that meet the bar.
hospitals_qualified = df_hospitals.map(
    lambda r: (r.providerid, r.hospitalname, r.state,
               bool(nonNAMeasureCount[r.providerid] >= minMeasureCount
                    if r.providerid in nonNAMeasureCount else False)))
schema = StructType([
    StructField("providerid", StringType(), True),
    StructField("hospitalname", StringType(), True),
    StructField("state", StringType(), True),
    StructField("qualified", BooleanType(), True)])
df_hospitals_qualified = sqlContext.createDataFrame(hospitals_qualified, schema)
saveAsHiveTable(df_hospitals_qualified, "hospitals_qualified")
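The saveAsHiveTable helper is not shown in this excerpt; a minimal stand-in, assuming the same temp-table pattern used in Example 2:
def saveAsHiveTable(df, table_name):
    # Assumed implementation: register a temp table, then materialize it in Hive.
    df.registerTempTable("tmp_" + table_name)
    sqlContext.sql("DROP TABLE IF EXISTS " + table_name)
    sqlContext.sql("CREATE TABLE {0} AS SELECT * FROM tmp_{0}".format(table_name))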
Example 4: SparkConf
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)
kdd = hc.table("kdd99")
(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
# Filter out any rows with a service the model was not trained on.
testData = testData.join(services, testData.service == services.srvc)
testData.cache()
print("training set has " + str(trainData.count()) + " instances")
print("test set has " + str(testData.count()) + " instances")
# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
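The excerpt stops mid-pipeline; a plausible continuation that assembles the features and fits the RandomForestClassifier imported above (the exact feature list is an assumption):
# Sketch only: which encoded columns feed the assembler is an assumption.
assembler = VectorAssembler(inputCols=["protocol-cat", "service-ohe", "flag-cat"],
                            outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
pipeline = Pipeline(stages=[inx1, inx2, inx3, inx4, ohe2, assembler, rf])
model = pipeline.fit(trainData)
predictions = model.transform(testData)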
Example 5: SparkContext
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# Import required packages.
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
# Set up the Spark context and Hive context.
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)
# Create a Spark DataFrame from the Hive table effective_care.
df_raw = sqlCtx.table("effective_care")
print('Number of rows in the table {0}'.format(df_raw.count()))
# Remove all rows whose score variable is not numeric.
# Helper to test whether a string can be parsed as an integer.
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
# Create an RDD by filtering out invalid scores.
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# Create a DataFrame from the RDD.
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print('Number of rows in table after cleaning {0}'.format(df_clean.count()))
# Convert the data type of the score column.
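A sketch of the conversion step the excerpt ends on (the cast target is an assumption):
from pyspark.sql.functions import col
# Cast the still-string score column to an integer type.
df_final = df_clean.withColumn("score", col("score").cast(IntegerType()))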
Example 6: tok_str
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# This excerpt presumes an existing HiveContext `hc`; the imports and the `punct`
# regex are not shown in the original, so the definitions below are assumptions.
import re
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, FloatType
punct = re.compile(r'[^\w\s]')  # assumed definition of the punctuation stripper

def tok_str(text, ngrams=1, minChars=2):
    text = re.sub(r'\s+', ' ', text)  # collapse any whitespace to a single space
    tokens = text.lower().split(' ')  # split into tokens and lower-case them
    tokens = [t for t in tokens if len(t) >= minChars and t[0] != '@']
    # remove short words and usernames
    tokens = ["URL" if t[:4] == "http" else t for t in tokens]
    # replace any URL with the constant word "URL"
    tokens = [punct.sub('', t) for t in tokens]  # remove punctuation from tokens
    if ngrams == 1:
        return tokens
    return tokens + [' '.join(tokens[i:i + ngrams]) for i in range(len(tokens) - ngrams + 1)]

tokenize = F.udf(lambda s: tok_str(str(s), ngrams=2), ArrayType(StringType()))
# Load the sentiment dictionary.
wv = hc.table('sentiment_words').collect()
wordlist = dict([(r.word, r.score) for r in wv])
# Positive sentiment score of a token list, using the word list.
def pscore(words):
    scores = [wordlist[t] for t in words if t in wordlist and wordlist[t] > 0]
    return 0.0 if len(scores) == 0 else float(sum(scores)) / len(scores)
pos_score = F.udf(lambda w: pscore(w), FloatType())
# Negative sentiment score of a token list, using the word list.
def nscore(words):
    scores = [wordlist[t] for t in words if t in wordlist and wordlist[t] < 0]
    return 0.0 if len(scores) == 0 else float(sum(scores)) / len(scores)
neg_score = F.udf(lambda w: nscore(w), FloatType())
# Create the feature matrix for the model.
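A sketch of the feature-building step the excerpt stops at (the tweets table and its text column are hypothetical):
tweets = hc.table('tweets')  # hypothetical source table
features = (tweets.withColumn('words', tokenize(tweets.text))
                  .withColumn('pos', pos_score(F.col('words')))
                  .withColumn('neg', neg_score(F.col('words'))))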
Example 7: SparkContext
# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# Import required packages.
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.mllib.stat import Statistics
# Set up the Spark context and Hive context.
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)
# Create Spark DataFrames from the Hive tables survey_response and effective_care.
df_survey_raw = sqlCtx.table("survey_response")
df_care_raw = sqlCtx.table("effective_care")
print('Number of rows in the survey table {0}'.format(df_survey_raw.count()))
print('Number of rows in the effective_care table {0}'.format(df_care_raw.count()))
# Remove all rows whose score variable is not numeric.
# Helper to test whether a string can be parsed as an integer.
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
# Create RDDs by filtering out invalid scores.
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))
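A sketch of where this excerpt is likely heading, using the Statistics import above (the providerid join key and the one-score-per-provider assumption are both guesses):
# Pair the two scores by provider, then correlate them (assumed analysis).
survey_scores = df_survey_clean_rdd.map(lambda r: (r.providerid, float(r.hcahps_base_score)))
care_scores = df_care_clean_rdd.map(lambda r: (r.providerid, float(r.score)))
joined = survey_scores.join(care_scores)
corr = Statistics.corr(joined.map(lambda kv: kv[1][0]),
                       joined.map(lambda kv: kv[1][1]))
print('Correlation between survey and care scores: {0}'.format(corr))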