本文整理汇总了Python中pyspark.sql.SQLContext.applySchema方法的典型用法代码示例。如果您正苦于以下问题：Python SQLContext.applySchema方法的具体用法？Python SQLContext.applySchema怎么用？Python SQLContext.applySchema使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.sql.SQLContext的用法示例。
在下文中一共展示了SQLContext.applySchema方法的6个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: run
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import applySchema [as 别名]
def run(self):
    """Count rows per gender in ``myUser.csv`` and write the result to the task output.

    The CSV is split on commas, its first 20 columns are loaded as nullable
    string fields named after ``schemaString``, and the ``gender`` column is
    counted with a map/reduceByKey pass. The output receives the schema
    string followed by ``str()`` of every ``(row, count)`` pair, with no
    separators in between.

    The SparkContext is stopped when the method exits, including on error,
    so a later run in the same process can create a fresh context.
    """
    schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
    field_names = schemaString.split()
    # One tuple slot per schema field (20); short rows still raise IndexError.
    field_count = len(field_names)

    sc = SparkContext("local", "gender")
    try:
        sqlContext = SQLContext(sc)
        _out = self.output().open('w')

        # Debug output describing the upstream 'insert_source' task.
        print(type(self.required_tasks['insert_source'].output()))
        print(self.required_tasks['insert_source'])

        lines = sc.textFile("myUser.csv")
        parts = lines.map(lambda l: l.split(","))
        users = parts.map(lambda p: tuple(p[i] for i in range(field_count)))

        print(schemaString)
        _out.write(schemaString)

        fields = [StructField(field_name, StringType(), True) for field_name in field_names]
        schema = StructType(fields)
        # applySchema is the API this example demonstrates; createDataFrame(users, schema)
        # is the non-deprecated equivalent.
        schemaUser = sqlContext.applySchema(users, schema)
        schemaUser.registerTempTable("users")

        results = sqlContext.sql("SELECT gender FROM users")
        # Key on the whole one-column row so identical genders reduce together.
        genders = results.map(lambda p: (p, 1))
        counts = genders.reduceByKey(lambda a, b: a + b)
        for name in counts.collect():
            _out.write(str(name))
        # Closed on the success path only, as in the original.
        # NOTE(review): if the target commits on close, closing after a failure
        # would publish partial output - confirm before moving this to `finally`.
        _out.close()
    finally:
        sc.stop()
示例2: Row
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import applySchema [as 别名]
# First data set: an RDD built from Row objects, which carry their own field names.
people_rows = [
    Row(name="John", age=19),
    Row(name="Smith", age=23),
    Row(name="Sarah", age=18),
]
some_rdd = sc.parallelize(people_rows)
# Let Spark derive the schema from the rows, then display it.
some_df = sqlContext.inferSchema(some_rdd)
some_df.printSchema()
# Sample output recorded with this example for the Row-based data:
# root
#  |-- age: integer (nullable = true)
#  |-- name: string (nullable = true)

# Second data set: plain tuples, so the column names must come from a schema.
people_tuples = [("John", 19), ("Smith", 23), ("Sarah", 18)]
another_rdd = sc.parallelize(people_tuples)
# Two non-nullable columns: person_name (string) and person_age (integer).
name_field = StructField("person_name", StringType(), False)
age_field = StructField("person_age", IntegerType(), False)
schema = StructType([name_field, age_field])
# Attach the explicit schema to the tuple RDD and display it.
another_df = sqlContext.applySchema(another_rdd, schema)
another_df.printSchema()
# Sample output recorded with this example for the explicit schema:
# root
#  |-- person_name: string (nullable = false)
#  |-- person_age: integer (nullable = false)

# Third data set: JSON read from a path, which may be a single text file
# or a directory of text files.
json_relative_path = "examples/src/main/resources/people.json"
path = os.path.join(os.environ['SPARK_HOME'], json_relative_path)
people = sqlContext.jsonFile(path)
# The schema inferred for `people` can be shown with printSchema() (not called here).
示例3: SQLContext
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import applySchema [as 别名]
# Wrap the existing SparkContext `sc` in a SQLContext for the queries below.
sqlContext = SQLContext(sc)
def csvParse(tup):
    """Parse the CSV text held in ``tup[0]`` and return its fields as a list.

    ``tup`` is indexed only at position 0; any further elements (for example
    the index added by ``zipWithIndex``) are ignored. Using ``csv.reader``
    rather than ``str.split`` keeps quoted fields containing commas intact.
    """
    line = tup[0]
    # csv.reader consumes an iterable of lines, so wrap the single line in a list.
    rows = list(csv.reader([line]))
    return rows[0]
# Load the file, pair every line with its index, drop the header (index 0),
# and parse each remaining line with csvParse.
# NOTE: `file` shadows the Python 2 builtin; the name is kept because it is module-level.
file = sc.textFile("hdfs://wolf.iems.northwestern.edu/user/huser88/crime/Crimes_-_2001_to_present.csv").zipWithIndex().filter(lambda x: x[ 1 ] > 0 ).map(csvParse)
# Keep columns 0 and 2 of every parsed record as `id` and `date`; cached for reuse.
file1 = file.map(lambda x: Row(id=x[0], date=x[2])).cache()
# Prepare for SQL queries: two nullable string columns.
# NOTE(review): "date id" presumably mirrors the alphabetical field order that
# keyword-built Row objects use - confirm against the Spark version in use.
headers = "date id"
fields = [StructField(field_name, StringType(), True) for field_name in headers.split()]
schema = StructType(fields)
schema_file = sqlContext.applySchema(file1, schema)
schema_file.registerTempTable("crime1")
# Average crime count per month: rows per month prefix divided by the number
# of distinct year substrings seen for that month.
# NOTE(review): the substr offsets assume dates shaped like MM/DD/YYYY - confirm.
crimeByMonth = sqlContext.sql("SELECT substr(date, 0,2), COUNT(id)/COUNT(DISTINCT substr(date,7,4)) AS avgCrimeCnt FROM crime1 GROUP BY substr(date,0,2)")
# Print the result rows to the screen (print() with one argument behaves the
# same on Python 2 and 3).
for m in crimeByMonth.collect():
    print(m)
示例4: Row
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import applySchema [as 别名]
# First data set: an RDD built from Row objects, which carry their own field names.
people_rows = [
    Row(name="John", age=19),
    Row(name="Smith", age=23),
    Row(name="Sarah", age=18),
]
some_rdd = sc.parallelize(people_rows)
# Let Spark derive the schema from the rows, producing a SchemaRDD, then display it.
some_schemardd = sqlContext.inferSchema(some_rdd)
some_schemardd.printSchema()
# Sample output recorded with this example for the Row-based data:
# root
#  |-- age: integer (nullable = true)
#  |-- name: string (nullable = true)

# Second data set: plain tuples, so the column names must come from a schema.
people_tuples = [("John", 19), ("Smith", 23), ("Sarah", 18)]
another_rdd = sc.parallelize(people_tuples)
# Two non-nullable columns: person_name (string) and person_age (integer).
name_field = StructField("person_name", StringType(), False)
age_field = StructField("person_age", IntegerType(), False)
schema = StructType([name_field, age_field])
# Attach the explicit schema to the tuple RDD, producing a SchemaRDD, and display it.
another_schemardd = sqlContext.applySchema(another_rdd, schema)
another_schemardd.printSchema()
# Sample output recorded with this example for the explicit schema:
# root
#  |-- person_name: string (nullable = false)
#  |-- person_age: integer (nullable = false)

# Third data set: JSON read from a path, which may be a single text file
# or a directory of text files.
json_relative_path = "examples/src/main/resources/people.json"
path = os.path.join(os.environ['SPARK_HOME'], json_relative_path)
people = sqlContext.jsonFile(path)
# The schema inferred for `people` can be shown with printSchema() (not called here).
示例5: SparkConf
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import applySchema [as 别名]
# coding: utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType
# Run a small Spark SQL query over two DecimalType columns and print the rows.
conf = SparkConf().setAppName("spark_sql_datatype_decimal")
sc = SparkContext(conf=conf)
try:
    hc = SQLContext(sc)
    # A single record holding two Decimal values.
    source = sc.parallelize(
        [(Decimal("1.0"), Decimal("2.0"))])
    # Both columns are non-nullable decimals with the default DecimalType() settings.
    schema = StructType([StructField("col1", DecimalType(), False),
                         StructField("col2", DecimalType(), False)])
    table = hc.applySchema(source, schema)
    # registerTempTable replaces the deprecated registerAsTable.
    table.registerTempTable("temp_table")
    # Mixes decimal + decimal with decimal + literal arithmetic.
    rows = hc.sql(
        "select col1 + col2, col2 + 1.0 from temp_table").collect()
finally:
    # Release the context even if the query fails; the rows are already collected.
    sc.stop()
for row in rows:
    # print() with one argument behaves the same on Python 2 and 3.
    print(row)
示例6: Row
# 需要导入模块: from pyspark.sql import SQLContext [as 别名]
# 或者: from pyspark.sql.SQLContext import applySchema [as 别名]
# NOTE(review): the "Ex 1" and "Ex 2" variants below are both active as written.
# The later assignments win, so `messages_subset` ends up with only a
# `formtypename` field while the explicit schema further down names
# ip/user/date/time. It looks like one variant was meant to be commented out
# at a time - confirm before running.

# Ex 1: comma-separated log lines (`lines` is defined before this excerpt).
messages = lines.map(lambda l: l.split(","))
messages_subset = messages.map(lambda p: Row(ip=p[0], user=p[1], date=p[2], time=p[3]))
# Ex 2: tab-separated file; keep only column 1 as `formtypename`.
lines = sc.textFile("file:///" + "C:/coding/Hadoop/pig/MapReduceInputData/VH_Formtype.txt")
messages = lines.map(lambda l: l.split("\t"))
messages_subset = messages.map(lambda p: Row(formtypename=p[1]))
# See example: http://spark.apache.org/docs/latest/sql-programming-guide.html
schema_messages = sqlContext.inferSchema(messages_subset)
# NOTE: inferSchema is deprecated, please use createDataFrame instead
schemaString = "ip user date time"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
# Overwrites the inferred-schema result assigned above.
schema_messages = sqlContext.applySchema(messages_subset, schema)
# NOTE: applySchema is deprecated, please use createDataFrame instead
schema_messages.registerTempTable("messages_subset")
# Ex 1
data = sqlContext.sql("SELECT * FROM messages_subset") # Can then use RDD operations on the returned RDD
# Ex 2: number of rows per form type, ordered by form type name.
data = sqlContext.sql("""SELECT formtypename, count(formtypename) AS processed FROM
messages_subset GROUP BY formtypename ORDER BY formtypename""")
data2 = data.map(lambda r: r).collect()
for d in data2: # An RDD(?) on Row objects. TODO: How to convert from Row?
    # Formatted explicitly so the output is "name count" on both Python 2 and 3.
    print("%s %s" % (d[0], d[1]))
formtypes = {} # Add formtypes to a dictionary
for d in data2:
    formtypes[d[0]] = d[1]