

Python HiveContext.table Method Code Examples

This article collects typical usage examples of the Python method pyspark.sql.HiveContext.table. If you have been wondering what exactly HiveContext.table does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples of pyspark.sql.HiveContext, the class this method belongs to.


The following presents 8 code examples of the HiveContext.table method, ordered by popularity by default.
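Before diving into the examples, here is a minimal, hedged sketch of the method itself: HiveContext.table(tableName) returns a DataFrame backed by the named Hive table (Spark 1.x API; Spark 2.x+ replaced it with SparkSession.table). The database and table names below are placeholders.

from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="hivecontext_table_demo")
hc = HiveContext(sc)

# "my_db.my_table" is a placeholder; table() accepts "table" or "database.table"
df = hc.table("my_db.my_table")
df.printSchema()
print df.count()
sc.stop()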

Example 1: table_schema_from_spark

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
def table_schema_from_spark(hcat_table_name):
    """Return the schema of the given "database.table" from the Hive metastore
    (a Spark workaround for as long as the HCatalog web API is unavailable)."""
    # initialize spark
    import findspark
    findspark.init()
     
    import pyspark
    from pyspark.sql import HiveContext
    
    sc_conf = pyspark.SparkConf()
    #sc_conf.set('spark.executor.extraClassPath','/opt/cloudera/parcels/CDH/lib/hive/lib/*')
    #sc_conf.set('spark.master','yarn-client')
    
    sc = pyspark.SparkContext(appName='ade_get_table_schema', conf=sc_conf)
    hc = HiveContext(sc)
    
    hive_schema = hc.table(hcat_table_name).schema.jsonValue()
    
    print hive_schema
    
    sc.stop()
    
    table_schema = {'columns':{}}
    
    for col_sequence, field in enumerate(hive_schema['fields']):
        table_schema['columns'][field['name']] = {'col_sequence': col_sequence,
                                                  'type': field['type']}
    
    return table_schema
Author: heuvel, Project: den, Lines: 33, Source: den_hadoop.py
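A hedged usage sketch for the helper above; the table name is a placeholder:

# Hypothetical call; "default.my_table" stands in for a real database.table name.
schema = table_schema_from_spark("default.my_table")
for name, col in sorted(schema['columns'].items(),
                        key=lambda kv: kv[1]['col_sequence']):
    print name, col['type']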

Example 2: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
from pyspark import SparkContext
sc = SparkContext("local", "best_hospitals")

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

# Select the top 10 hospitals by average normalized score.
# Note that hospitals not qualified for evaluation are filtered out.
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by Q.providerid \
order by avgscore DESC").limit(10)

# Join with hospitals_qualified to get the hospital name and state.
# Note: the author couldn't fold this into the select statement above (together
# with the GROUP BY) in one shot; a one-query sketch follows this example.
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
    select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)

df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())

# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")

print
print "Top 10 hospitals"
print
rank = 1
for i in df_top10_hospitals_full.collect():
    # loop body truncated in the original snippet; this completion is a guess
    # consistent with the rank counter initialized above
    print rank, i.providerid, i.hospitalname, i.state, i.avgscore
    rank += 1
Author: patng323, Project: w205-ex1, Lines: 33, Source: best_hospitals.py
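As promised above, the name and state lookup can plausibly be folded into the aggregation by grouping on the joined columns as well. A hedged, untested sketch against the same tables:

# Sketch: top-10 query with hospital name and state resolved in one statement.
df_top10_one_shot = sqlContext.sql("select H.providerid, H.hospitalname, H.state, \
AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by H.providerid, H.hospitalname, H.state \
order by avgscore DESC").limit(10)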

Example 3: dict

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]

# imports used below (defined earlier in the original transform.py)
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, BooleanType

# Some hospitals have too few non-NA measures. For a fair ranking, we set a
# minimum bar on the number of non-NA measures a hospital must have in order
# to participate in the evaluation.

# For each hospital, count the number of non-NA measures it has
nonNAMeasureCount = dict(df_total_quality.map(lambda r: (r.providerid, r.normalizedscore)).
                         combineByKey( # Use combineByKey to count the # of non-NA Measure
                            lambda value: 0 if value is None else 1,
                            lambda x, value: x if value is None else x + 1,
                            lambda x, y: x + y).collect())

# Find the 25th percentile of the non-NA measure counts; this is the minimum bar.
minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)

df_hospitals = sqlContext.table("hospitals")
# For the purpose of evaluation, we keep only those hospitals which meet the bar
hospitals_qualified = df_hospitals.map(lambda r: (r.providerid, r.hospitalname, r.state,
                        bool(nonNAMeasureCount[r.providerid] >= minMeasureCount
                             if r.providerid in nonNAMeasureCount else False)))

schema = StructType([
    StructField("providerid", StringType(), True),
    StructField("hospitalname", StringType(), True),
    StructField("state", StringType(), True),
    StructField("qualified", BooleanType(), True)])

df_hospitals_qualified = sqlContext.createDataFrame(hospitals_qualified, schema)
saveAsHiveTable(df_hospitals_qualified, "hospitals_qualified")

Author: patng323, Project: w205-ex1, Lines: 30, Source: transform.py
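saveAsHiveTable is a project-local helper not shown in this excerpt. A plausible minimal reconstruction, assuming it follows the temp-table-plus-CTAS pattern seen in Example 2:

# Hypothetical sketch of the helper used above; the real definition lives
# elsewhere in transform.py. Assumes a HiveContext named sqlContext.
def saveAsHiveTable(df, table_name):
    df.registerTempTable("tmp_" + table_name)
    sqlContext.sql("drop table if exists " + table_name)
    sqlContext.sql("create table {0} as select * from tmp_{0}".format(table_name))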

Example 4: SparkConf

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

# Initialize Spark
SparkContext.setSystemProperty("spark.executor.memory", "4g")
conf = SparkConf()
conf.set("spark.executor.instances", 20)
sc = SparkContext("yarn-client", "kdd99", conf=conf)
hc = HiveContext(sc)

kdd = hc.table("kdd99")

(trainData, testData) = kdd.randomSplit([0.7, 0.3], seed=42)
trainData.cache()
services = trainData.withColumnRenamed("service", "srvc").select("srvc").distinct()
# keep only test rows whose service value was seen during training
testData = testData.join(services, testData.service == services.srvc)
testData.cache()

print "training set has " + str(trainData.count()) + " instances"
print "test set has " + str(testData.count()) + " instances"

# Build model
inx1 = StringIndexer(inputCol="protocol", outputCol="protocol-cat")
inx2 = StringIndexer(inputCol="service", outputCol="service-cat")
inx3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
inx4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
ohe2 = OneHotEncoder(inputCol="service-cat", outputCol="service-ohe")
Author: ofermend, Project: data-science-with-hadoop-book, Lines: 33, Source: anomaly.py
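The excerpt stops partway through the model definition. A hedged sketch of how the pipeline might be completed; the numeric feature list is a placeholder, since the real script assembles the full kdd99 feature set:

# Sketch of the remaining steps; numeric_cols is a placeholder subset.
numeric_cols = ["duration", "src_bytes", "dst_bytes"]
va = VectorAssembler(inputCols=["protocol-cat", "service-ohe", "flag-cat"] + numeric_cols,
                     outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
pipeline = Pipeline(stages=[inx1, inx2, inx3, inx4, ohe2, va, rf])
model = pipeline.fit(trainData)
predictions = model.transform(testData)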

Example 5: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row

# setting up the Spark context and Hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive table effective_care
df_raw = sqlCtx.table("effective_care")
print 'Number of rows in the table {0}'.format(df_raw.count())

# remove all rows whose score value is not numeric

# helper to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# create an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# create a DataFrame from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# converting the data type of the score column (the excerpt ends here; see the sketch below)
Author: meabhishekkumar, Project: w205-lab-exercises, Lines: 33, Source: best_hospitals.py
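A hedged sketch of the type-conversion step the last comment announces:

# Sketch: cast the score column to integer now that non-numeric rows are gone
# (IntegerType comes from the pyspark.sql.types import above).
df_scores = df_clean.withColumn("score_int", df_clean.score.cast(IntegerType()))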

Example 6: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
#import numpy as np

# setting up the Spark context and Hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating a spark data frame using the hive table effective_care
df_raw = sqlCtx.table("effective_care")
print 'Number of rows in the table {0}'.format(df_raw.count())

# remove all rows whose score value is not numeric

# helper to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# create an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# create a DataFrame from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())
Author: meabhishekkumar, Project: w205-lab-exercises, Lines: 32, Source: hospital_variability.py
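Given the file name (hospital_variability.py), the script presumably goes on to measure per-measure score spread. A hedged sketch using Hive's aggregate functions; the measureid column name is an assumption about the effective_care schema:

# Sketch: per-measure score variability via HiveQL aggregates.
df_clean.registerTempTable("care_clean")
df_variability = sqlCtx.sql("select measureid, avg(cast(score as int)) as avg_score, \
stddev_pop(cast(score as int)) as score_stddev \
from care_clean group by measureid")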

Example 7: tok_str

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
import re
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, FloatType

# punct is defined elsewhere in the original script; a plausible stand-in:
punct = re.compile(r'[^\w\s]', re.UNICODE)  # assumption: strips punctuation

def tok_str(text, ngrams=1, minChars=2):
    text = re.sub(r'\s+', ' ', text)                   # collapse whitespace to single spaces
    tokens = map(unicode, text.lower().split(' '))     # split into lower-case tokens
    tokens = filter(lambda x: len(x) >= minChars and x[0] != '@', tokens)
                                                       # drop short words and @usernames
    tokens = ["URL" if t[:4] == "http" else t for t in tokens]
                                                       # replace any URL with the constant token "URL"
    tokens = [punct.sub('', t) for t in tokens]        # strip punctuation from tokens
    if ngrams == 1:
        return tokens
    else:
        return tokens + [' '.join(tokens[i:i+ngrams]) for i in xrange(len(tokens)-ngrams+1)]
tokenize = F.udf(lambda s: tok_str(unicode(s), ngrams=2), ArrayType(StringType()))

# Load the sentiment dictionary (hc is the HiveContext created earlier in the script)
wv = hc.table('sentiment_words').collect()
wordlist = dict([(r.word, r.score) for r in wv])

# average positive sentiment score of a token list, using the word list
def pscore(words):
    scores = filter(lambda x: x>0, [wordlist[t] for t in words if t in wordlist])
    return 0.0 if len(scores)==0 else (float(sum(scores))/len(scores))
pos_score = F.udf(lambda w: pscore(w), FloatType())

# average negative sentiment score of a token list, using the word list
def nscore(words):
    scores = filter(lambda x: x<0, [wordlist[t] for t in words if t in wordlist])
    return 0.0 if len(scores)==0 else (float(sum(scores))/len(scores))
neg_score = F.udf(lambda w: nscore(w), FloatType()) 

# Create the feature matrix for the model (the excerpt ends here; see the sketch below)
Author: ofermend, Project: data-science-with-hadoop-book, Lines: 33, Source: script.py
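A hedged sketch of how these UDFs would typically be applied to build feature columns; the tweets table and its text column are assumptions:

# Sketch: apply the tokenizer and sentiment UDFs; "tweets"/"text" are placeholders.
df = hc.table("tweets")
df_features = (df.withColumn("words", tokenize(df.text))
                 .withColumn("pos_score", pos_score(F.col("words")))
                 .withColumn("neg_score", neg_score(F.col("words"))))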

Example 8: SparkContext

# Required import: from pyspark.sql import HiveContext [as alias]
# Or: from pyspark.sql.HiveContext import table [as alias]
# importing required packages
from pyspark.sql import HiveContext
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import Row
from pyspark.mllib.stat import Statistics


# setting up the Spark context and Hive context
sc = SparkContext("local", "Simple App")
sqlCtx = HiveContext(sc)

# creating Spark DataFrames from the hive tables survey_response and effective_care
df_survey_raw = sqlCtx.table("survey_response")
df_care_raw = sqlCtx.table("effective_care")
print 'Number of rows in the survey table {0}'.format(df_survey_raw.count())
print 'Number of rows in the effective_care table {0}'.format(df_care_raw.count())

# remove all rows whose score value is not numeric

# helper to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating RDDs by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))
Author: meabhishekkumar, Project: w205-lab-exercises, Lines: 33, Source: hospitals_and_patients.py
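The excerpt stops before the correlation step that the Statistics import anticipates. A hedged, untested sketch; keying both tables by providerid and averaging care scores per hospital are assumptions:

# Sketch: correlate per-hospital survey base scores with average care scores.
survey_by_provider = df_survey_clean_rdd.map(
    lambda r: (r.providerid, float(r.hcahps_base_score)))
care_by_provider = (df_care_clean_rdd
                    .map(lambda r: (r.providerid, float(r.score)))
                    .groupByKey()
                    .mapValues(list)
                    .mapValues(lambda xs: sum(xs) / len(xs)))
pairs = survey_by_provider.join(care_by_provider).values()
corr = Statistics.corr(pairs.map(lambda p: p[0]), pairs.map(lambda p: p[1]),
                       method="pearson")
print 'Correlation between survey and care scores: {0}'.format(corr)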


Note: the pyspark.sql.HiveContext.table examples in this article were collected by 纯净天空 from open-source projects hosted on platforms such as GitHub and MSDocs. The code snippets remain the copyright of their original authors; reuse and redistribution are subject to each project's license. Do not republish without permission.