本文整理汇总了Python中pyspark.sql.HiveContext.inferSchema方法的典型用法代码示例。如果您正苦于以下问题:Python HiveContext.inferSchema方法的具体用法?Python HiveContext.inferSchema怎么用?Python HiveContext.inferSchema使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.HiveContext的用法示例。
在下文中一共展示了HiveContext.inferSchema方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
conf = SparkConf().setAppName("spark_sql_delimiter_infer_schema")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
"row2_col1 row2_col2 row3_col3", "row3_col1 row3_col2 row3_col3"])
columns = source.map(lambda line: line.split(" ")).filter(
lambda columns: columns and len(columns) == 3)
rows = columns.map(
lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))
table = hc.inferSchema(rows)
table.registerAsTable("temp_mytable")
datas = hc.sql("select * from temp_mytable").collect()
sc.stop()
if datas:
for data in datas:
print data
示例2: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys
if __name__ == "__main__":
inputFile = sys.argv[1]
conf = SparkConf().setAppName("TwitterAnalytics")
sc = SparkContext()
hiveCtx = HiveContext(sc)
print "Loading tweets from " + inputFile
input = hiveCtx.jsonFile(inputFile)
input.registerTempTable("tweets")
topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
print topTweets.collect()
topTweetText = topTweets.map(lambda row : row.text)
print topTweetText.collect()
# Make a happy person row
happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable("strong_people")
# Make a UDF to tell us how long some text is
hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
print lengthSchemaRDD.collect()
sc.stop()
示例3: SparkContext
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
# Example: join orders with summed order-item totals and write the average
# sales per day to HDFS.
# Bug fix: SparkConf/SparkContext/HiveContext/Row were referenced without
# being imported, and `conf` was used without being defined — the script
# could not run as shown.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("avg_sales_per_day")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
hc.sql("set spark.sql.shuffle.partitions = 10")
# Orders: (orderID, orderDate) — the date is truncated to its first 11 chars.
orderMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/orders")\
             .map(lambda record : record.split(","))\
             .map(lambda record : Row(orderID=int(record[0]),orderDate=record[1][:11]))
# Items: sum the per-line subtotal (column 4) per orderID (column 1).
itemMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/order_items")\
            .map(lambda record : record.split(","))\
            .map(lambda row: (int(row[1]),float(row[4]))).reduceByKey(lambda x,y : x+y)\
            .map(lambda record : Row(orderID=int(record[0]),Total=record[1]))
oSchema = hc.inferSchema(orderMap)
iSchema = hc.inferSchema(itemMap)
oSchema.registerTempTable("orders")
iSchema.registerTempTable("items")
# Cross join restricted by the WHERE clause to an equi-join on orderID.
avgSalesPerDay = hc.sql(" SELECT o.orderDate,avg(i.Total) as avgSales \
                         from orders o join items i \
                         where o.orderID = i.orderID \
                         group by o.orderDate \
                         order by avgSales DESC")
avgSalesPerDay.map(lambda row : ",".join([row.orderDate,str(row.avgSales)]))\
              .coalesce(1) \
              .saveAsTextFile("AvgSalesPerDay2")
示例4: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
# coding=utf-8
# Example: cache an inferred-schema table and run several aggregations
# against the cached data.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row

conf = SparkConf().setAppName("spark_sql_cache_table_extend")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
# Parse raw comma-delimited log lines, keeping rows with at least 3 fields.
dataRDD = sc.textFile("/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/").map(lambda line: line.split(
    ",")).filter(lambda words: len(words) >= 3).map(lambda words: Row(col1=words[0], col2=words[1], col3=words[2]))
sourceRDD = hc.inferSchema(dataRDD)
# Consistency fix: registerTempTable is the non-deprecated spelling of
# registerAsTable, and is what the other examples in this file use.
sourceRDD.registerTempTable("source")
# Cache once; the three queries below all read the cached table.
hc.cacheTable("source")
hc.sql("select count(*) from source").collect()
hc.sql("select col2, max(col3) from source group by col2").collect()
hc.sql("select col3, min(col2) from source group by col3").collect()
# hc.uncacheTable("source")
sc.stop()
示例5: SparkConf
# 需要导入模块: from pyspark.sql import HiveContext [as 别名]
# 或者: from pyspark.sql.HiveContext import inferSchema [as 别名]
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType
conf = SparkConf().setAppName("spark_sql_udf")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
lines = sc.parallelize(["a", "b", "c"])
people = lines.map(lambda value: Row(name=value))
peopleSchema = hc.inferSchema(people)
peopleSchema.registerTempTable("people")
def myfunc(value):
return value.upper()
hc.registerFunction("myfunc", myfunc, StringType())
rows = hc.sql("select myfunc(name) from people").rdd.filter(
lambda row: isinstance(row, tuple)).collect()
sc.stop()
for row in rows:
print row, type(row[0])