

Python SQLContext.applySchema Method Code Examples

This article collects typical usage examples of the pyspark.sql.SQLContext.applySchema method in Python. If you are wondering what SQLContext.applySchema does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of pyspark.sql.SQLContext, the class this method belongs to.


The sections below show 6 code examples of the SQLContext.applySchema method, sorted by popularity by default.
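As several of the examples below note, applySchema has been deprecated in favor of SQLContext.createDataFrame. A minimal sketch of the two equivalent calls (assuming an existing SparkContext named sc; the column name and sample data are made up for illustration):

from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType

sqlContext = SQLContext(sc)  # assumes an existing SparkContext `sc`
rdd = sc.parallelize([("Alice",), ("Bob",)])
schema = StructType([StructField("name", StringType(), True)])

df_old = sqlContext.applySchema(rdd, schema)      # deprecated spelling
df_new = sqlContext.createDataFrame(rdd, schema)  # preferred replacement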

Example 1: run

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import applySchema [as alias]
 def run(self):
     sc = SparkContext("local", "gender")
     sqlContext = SQLContext(sc)
     #StringType =(str, unicode)
     _out = self.output().open('w')
     #lines = sc.textFile("myUser.csv")
     #fobj = self.input().open("r")
     #lines = sc.textFile(fobj.name)
     print(type(self.required_tasks['insert_source'].output()))
     print(self.required_tasks['insert_source'])
     #print(self.input()['insert_source'].input())
     # Read the CSV export and split each line into its 20 columns
     lines = sc.textFile("myUser.csv")
     parts = lines.map(lambda l: l.split(","))
     users = parts.map(lambda p: (p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7],
         p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16], p[17], p[18], p[19]))
     # One nullable StringType field per column name
     schemaString = "userId lmsUserId lmsName orgName name gender registrationDate emailId mothertounge highestEduDegree goals city state active firstAccesDate lastAccessDate allowCert yearOfBirth pincode aadharId"
     print(schemaString)
     _out.write(schemaString)
     fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
     schema = StructType(fields)
     #schemaUser = sqlContext.createDataFrame(users, schema)
     # Apply the schema to the RDD of tuples and register it as a temp table
     schemaUser = sqlContext.applySchema(users, schema)
     schemaUser.registerTempTable("users")
     results = sqlContext.sql("SELECT gender FROM users")
     # Count how many times each gender value occurs and write the counts out
     genders = results.map(lambda p: (p, 1))
     counts = genders.reduceByKey(lambda a, b: a + b)  #.map(lambda t: ("Gender " + t[0] + " No " + t[1])).collect()
     for name in counts.collect():
         _out.write(str(name))
     _out.close()
Developer: Zarana-Parekh, Project: analytics, Lines: 31, Source file: genderTask.py
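The per-gender count in Example 1 could also be pushed entirely into Spark SQL. A hedged sketch, assuming the same users temporary table and _out output target registered above:

# Let Spark SQL do the aggregation instead of map/reduceByKey
gender_counts = sqlContext.sql("SELECT gender, COUNT(*) AS cnt FROM users GROUP BY gender")
for row in gender_counts.collect():
    _out.write(str((row[0], row[1])))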

Example 2: Row

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import applySchema [as alias]
    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a DataFrame and print the schema
    some_df = sqlContext.inferSchema(some_rdd)
    some_df.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a DataFrame by applying the schema to the RDD and print the schema
    another_df = sqlContext.applySchema(another_rdd, schema)
    another_df.printSchema()
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a DataFrame from the file(s) pointed to by path
    people = sqlContext.jsonFile(path)
    # root
    #  |-- age: integer (nullable = true)
    #  |-- name: string (nullable = true)

    # The inferred schema can be visualized using the printSchema() method.
Developer: MLDL, Project: spark, Lines: 32, Source file: sql.py

Example 3: SQLContext

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import applySchema [as alias]
import csv

# Spark 1.3-style imports, matching Example 5 below
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StructType, StructField, StringType

sqlContext = SQLContext(sc)  # assumes an existing SparkContext `sc`

# Parse one (line, index) tuple from zipWithIndex into a list of CSV fields
def csvParse(tup):
    line = tup[0]
    reader = csv.reader([line])
    return list(reader)[0]



# Load File, remove header, and parse
file = sc.textFile("hdfs://wolf.iems.northwestern.edu/user/huser88/crime/Crimes_-_2001_to_present.csv").zipWithIndex().filter(lambda x: x[ 1 ] > 0 ).map(csvParse)

# Create RDD with year and month
file1 = file.map(lambda x: Row(id=x[0], date=x[2])).cache()

# Prepare for sql queries
headers = "date id"
fields = [StructField(field_name, StringType(), True) for field_name in headers.split()]
schema = StructType(fields)

schema_file = sqlContext.applySchema(file1, schema)
schema_file.registerTempTable("crime1")

# Get monthly average crime rate
crimeByMonth = sqlContext.sql("SELECT substr(date, 0,2), COUNT(id)/COUNT(DISTINCT substr(date,7,4)) AS avgCrimeCnt FROM crime1 GROUP BY substr(date,0,2)")

# Print output to screen
for m in crimeByMonth.collect():
    print m

Developer: sanjuw, Project: Spark_ChicagoCrimeDataAnalysis, Lines: 31, Source file: crime1.py
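The month column produced by the query in Example 3 comes back unnamed, and the collected rows arrive in no particular order. A small sketch, assuming the same crime1 temporary table and MM/DD/YYYY-prefixed date strings, that aliases the column and sorts the output locally:

crimeByMonth = sqlContext.sql(
    "SELECT substr(date, 0, 2) AS month, "
    "COUNT(id) / COUNT(DISTINCT substr(date, 7, 4)) AS avgCrimeCnt "
    "FROM crime1 GROUP BY substr(date, 0, 2)")
# Sort the collected (month, avgCrimeCnt) rows locally by month before printing
for m in sorted(crimeByMonth.collect()):
    print m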

Example 4: Row

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import applySchema [as alias]
    # RDD is created from a list of rows
    some_rdd = sc.parallelize([Row(name="John", age=19),
                              Row(name="Smith", age=23),
                              Row(name="Sarah", age=18)])
    # Infer schema from the first row, create a SchemaRDD and print the schema
    some_schemardd = sqlContext.inferSchema(some_rdd)
    some_schemardd.printSchema()

    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)])
    # Schema with two fields - person_name and person_age
    schema = StructType([StructField("person_name", StringType(), False),
                        StructField("person_age", IntegerType(), False)])
    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlContext.applySchema(another_rdd, schema)
    another_schemardd.printSchema()
    # root
    #  |-- person_name: string (nullable = false)
    #  |-- person_age: integer (nullable = false)

    # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
    # Create a SchemaRDD from the file(s) pointed to by path
    people = sqlContext.jsonFile(path)
    # root
    #  |-- age: integer (nullable = true)
    #  |-- name: string (nullable = true)

    # The inferred schema can be visualized using the printSchema() method.
Developer: 0asa, Project: spark, Lines: 32, Source file: sql.py

Example 5: SparkConf

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import applySchema [as alias]
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType

conf = SparkConf().setAppName("spark_sql_datatype_decimal")

sc = SparkContext(conf=conf)

hc = SQLContext(sc)

source = sc.parallelize(
    [(Decimal("1.0"), Decimal("2.0"))])

schema = StructType([StructField("col1", DecimalType(), False),
                     StructField("col2", DecimalType(), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

# collect() pulls the result to the driver, so the SparkContext can be stopped before printing
rows = hc.sql(
    "select col1 + col2, col2 + 1.0 from temp_table").collect()

sc.stop()

for row in rows:
    print row
Developer: Leaderman, Project: pyspark, Lines: 32, Source file: spark_sql_datatype_decimal.py
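DecimalType also accepts an explicit precision and scale. A small sketch, assuming the same hc SQLContext and source RDD as in Example 5 (the DECIMAL(10, 2) sizing is arbitrary, for illustration only):

from pyspark.sql.types import StructType, StructField, DecimalType

# DECIMAL(10, 2): up to 10 significant digits, 2 of them after the decimal point
explicit_schema = StructType([StructField("col1", DecimalType(10, 2), False),
                              StructField("col2", DecimalType(10, 2), False)])

explicit_table = hc.applySchema(source, explicit_schema)
explicit_table.registerTempTable("temp_table_decimal")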

Example 6: Row

# Required import: from pyspark.sql import SQLContext [as alias]
# Or: from pyspark.sql.SQLContext import applySchema [as alias]
messages = lines.map(lambda l: l.split(","))
messages_subset = messages.map(lambda p: Row(ip=p[0], user=p[1], date=p[2], time=p[3]))    

# Ex 2
lines = sc.textFile("file:///" + "C:/coding/Hadoop/pig/MapReduceInputData/VH_Formtype.txt")
messages = lines.map(lambda l: l.split("\t"))
messages_subset = messages.map(lambda p: Row(formtypename=p[1]))    

# See example: http://spark.apache.org/docs/latest/sql-programming-guide.html 
schema_messages = sqlContext.inferSchema(messages_subset)
# NOTE: inferSchema is deprecated, please use createDataFrame instead

schemaString = "ip user date time"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
schema_messages = sqlContext.applySchema(messages_subset, schema)
# NOTE: applySchema is deprecated, please use createDataFrame instead

schema_messages.registerTempTable("messages_subset")

# Ex 1
data = sqlContext.sql("SELECT * FROM messages_subset") # Can then use RDD operations on the returned RDD
# Ex 2
data = sqlContext.sql("""SELECT formtypename, count(formtypename) AS processed FROM
    messages_subset GROUP BY formtypename ORDER BY formtypename""")
data2 = data.map(lambda r: r).collect()
for d in data2: # An RDD(?) on Row objects. TODO: How to convert from Row?
    print d[0], d[1]
formtypes = {} # Add formtypes to a dictionary
for d in data2:
    formtypes[d[0]] = d[1]
Developer: wargile, Project: Python, Lines: 33, Source file: Spark.py
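To answer the TODO in Example 6: a pyspark.sql Row can be turned into a plain dict with its asDict() method. A sketch reusing the data DataFrame from the Ex 2 query above:

# Row.asDict() converts each result row into a regular dict keyed by column name
formtypes = {}
for row in data.collect():
    d = row.asDict()
    formtypes[d["formtypename"]] = d["processed"]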


Note: The pyspark.sql.SQLContext.applySchema examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The code snippets are drawn from open-source projects contributed by their respective authors; copyright of the source code remains with the original authors, and any redistribution or use should follow the corresponding project's license. Do not republish without permission.