本文整理汇总了Python中pyspark.sql.HiveContext.inferSchema方法的典型用法代码示例。如果您正苦于以下问题:Python HiveContext.inferSchema方法的具体用法?Python HiveContext.inferSchema怎么用?Python HiveContext.inferSchema使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.HiveContext的用法示例。
在下文中一共展示了HiveContext.inferSchema方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
conf = SparkConf().setAppName("spark_sql_delimiter_infer_schema")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
"row2_col1 row2_col2 row3_col3", "row3_col1 row3_col2 row3_col3"])
columns = source.map(lambda line: line.split(" ")).filter(
lambda columns: columns and len(columns) == 3)
rows = columns.map(
lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))
table = hc.inferSchema(rows)
table.registerAsTable("temp_mytable")
datas = hc.sql("select * from temp_mytable").collect()
sc.stop()
if datas:
for data in datas:
print data
示例2: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys
if __name__ == "__main__":
inputFile = sys.argv[1]
conf = SparkConf().setAppName("TwitterAnalytics")
sc = SparkContext()
hiveCtx = HiveContext(sc)
print "Loading tweets from " + inputFile
input = hiveCtx.jsonFile(inputFile)
input.registerTempTable("tweets")
topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
print topTweets.collect()
topTweetText = topTweets.map(lambda row : row.text)
print topTweetText.collect()
# Make a happy person row
happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable("strong_people")
# Make a UDF to tell us how long some text is
hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
print lengthSchemaRDD.collect()
sc.stop()
示例3: SparkContext
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
# Example: join orders with summed order-item totals and write the average
# sales per day to HDFS.
# Bug fix: SparkConf/SparkContext/HiveContext/Row were referenced without
# being imported, and `conf` was used without being defined — the script
# could not run as shown.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("avg_sales_per_day")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
hc.sql("set spark.sql.shuffle.partitions = 10")
# Orders: (orderID, orderDate) — the date is truncated to its first 11 chars.
orderMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/orders")\
             .map(lambda record : record.split(","))\
             .map(lambda record : Row(orderID=int(record[0]),orderDate=record[1][:11]))
# Items: sum the per-line subtotal (column 4) per orderID (column 1).
itemMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/order_items")\
            .map(lambda record : record.split(","))\
            .map(lambda row: (int(row[1]),float(row[4]))).reduceByKey(lambda x,y : x+y)\
            .map(lambda record : Row(orderID=int(record[0]),Total=record[1]))
oSchema = hc.inferSchema(orderMap)
iSchema = hc.inferSchema(itemMap)
oSchema.registerTempTable("orders")
iSchema.registerTempTable("items")
# Cross join restricted by the WHERE clause to an equi-join on orderID.
avgSalesPerDay = hc.sql(" SELECT o.orderDate,avg(i.Total) as avgSales \
                         from orders o join items i \
                         where o.orderID = i.orderID \
                         group by o.orderDate \
                         order by avgSales DESC")
avgSalesPerDay.map(lambda row : ",".join([row.orderDate,str(row.avgSales)]))\
              .coalesce(1) \
              .saveAsTextFile("AvgSalesPerDay2")
示例4: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
# coding=utf-8
# Example: cache an inferred-schema table and run several aggregations
# against the cached data.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
# Parse raw comma-delimited log lines, keeping rows with at least 3 fields.
dataRDD = sc.textFile("/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/").map(lambda line: line.split(
    ",")).filter(lambda words: len(words) >= 3).map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))
sourceRDD = hc.inferSchema(dataRDD)
# Consistency fix: registerTempTable is the non-deprecated spelling of
# registerAsTable, and is what the other examples in this file use.
sourceRDD.registerTempTable("source")
# Cache once; the three queries below all read the cached table.
hc.cacheTable("source")
hc.sql("select count(*) from source").collect()
hc.sql("select col2, max(col3) from source group by col2").collect()
hc.sql("select col3, min(col2) from source group by col3").collect()
# hc.uncacheTable("source")
sc.stop()
示例5: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType
conf = SparkConf().setAppName("spark_sql_udf")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
lines = sc.parallelize(["a", "b", "c"])
people = lines.map(lambda value: Row(name=value))
peopleSchema = hc.inferSchema(people)
peopleSchema.registerTempTable("people")
def myfunc(value):
return value.upper()
hc.registerFunction("myfunc", myfunc, StringType())
rows = hc.sql("select myfunc(name) from people").rdd.filter(
lambda row: isinstance(row, tuple)).collect()
sc.stop()
for row in rows:
print row, type(row[0])