This article collects typical usage examples of the StructType.fromJson method from pyspark.sql.types in Python. If you have been wondering what Python's StructType.fromJson does, how to use it, or what real-world calls look like, the curated code examples below may help. You can also explore more usage examples of its containing class, pyspark.sql.types.StructType.
Three code examples of StructType.fromJson are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
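For orientation before the examples: StructType.fromJson expects a dict in the format produced by StructType.jsonValue() (or DataFrame.schema.jsonValue()). A minimal, self-contained sketch follows; the field names and types are illustrative, not taken from the examples below.

from pyspark.sql.types import StructType

# A schema dict in the format produced by StructType.jsonValue();
# the field names here are purely illustrative.
schema_dict = {
    "type": "struct",
    "fields": [
        {"name": "user_id", "type": "integer", "nullable": True, "metadata": {}},
        {"name": "rating",  "type": "double",  "nullable": True, "metadata": {}},
    ],
}

schema = StructType.fromJson(schema_dict)
print(schema.simpleString())  # struct<user_id:int,rating:double>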
Example 1: test_rmse
# Required import: from pyspark.sql.types import StructType [as alias]
# Or: from pyspark.sql.types.StructType import fromJson [as alias]
def test_rmse():
    # TODO: revise so that it takes user input instead of hardcoded values
    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create an HDFS directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz", schema=ratings_schema)
    # explicitly repartition the RDD after loading so that more tasks can run on it in parallel
    # (by default, defaultMinPartitions == defaultParallelism == estimated number of cores across all machines in the cluster)
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse the ratings DataFrame into an RDD of (userId, itemId, rating) tuples
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split the data into train (60%) and test (40%) sets
    # TODO: add validation in the future? train (60%), validation (20%), test (20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run the training algorithm to build the model (without validation)
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make predictions
    with Timer() as t:
        testPredRDD = model.predictAll( testRDD.map( lambda x: (x[0], x[1]) ) ).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
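Neither the pm module nor its calculate_rmse_using_rdd helper is shown in this example. As a rough sketch of what such a helper might do, assuming both RDDs hold (userId, itemId, rating)-style tuples (model.predictAll returns Rating namedtuples, which unpack the same way), one could write:

import math

def calculate_rmse_using_rdd(test_rdd, pred_rdd):
    # Key both RDDs by (userId, itemId), join actual with predicted
    # ratings, and take the root of the mean squared error.
    # The body is an assumption; only the name comes from the snippet above.
    ratings = test_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    preds = pred_rdd.map(lambda p: ((p[0], p[1]), p[2]))
    squared_errors = ratings.join(preds).map(
        lambda kv: (kv[1][0] - kv[1][1]) ** 2
    )
    return math.sqrt(squared_errors.mean())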
Example 2: get_twitter_schema
# Required import: from pyspark.sql.types import StructType [as alias]
# Or: from pyspark.sql.types.StructType import fromJson [as alias]
def get_twitter_schema(json_file_name):
    # load a JSON schema definition and rebuild the StructType from it
    with open(json_file_name) as json_file:
        schema_dict = json.load(json_file)
    return StructType.fromJson(schema_dict)
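A schema file in the format this function expects can be produced by dumping an existing DataFrame's schema with jsonValue(). A hedged usage sketch, where df, spark, and the file names are illustrative stand-ins:

import json

# save the schema of an existing DataFrame (df is assumed to exist)
with open("twitter_schema.json", "w") as f:
    json.dump(df.schema.jsonValue(), f)

# later, rebuild the StructType and use it when reading
schema = get_twitter_schema("twitter_schema.json")
tweets = spark.read.json("tweets.json", schema=schema)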
Example 3: generate_schema_dict
# Required import: from pyspark.sql.types import StructType [as alias]
# Or: from pyspark.sql.types.StructType import fromJson [as alias]
CONTINENTS_FILE_PATH = '/opt/SparkDatasets/geography/continents.csv'  # inferred: used below but omitted from the original snippet
COUNTRIES_FILE_PATH = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH = '/opt/SparkDatasets/geography/cities.csv'
CONTINENT_STRUCTURE = \
[ ( 'continent_id' , 'integer' )
, ( 'continent_name', 'string' ) ]
COUNTRY_STRUCTURE = \
[ ( 'country_id' , 'integer' )
, ( 'continent_id', 'integer' )
, ( 'country_name', 'string' ) ]
CITY_STRUCTURE = \
[ ( 'city_id' , 'integer' )
, ( 'country_id', 'integer' )
, ( 'city_name' , 'string' ) ]
CONTINENT_SCHEMA = StructType.fromJson( generate_schema_dict(CONTINENT_STRUCTURE) )
COUNTRY_SCHEMA = StructType.fromJson( generate_schema_dict(COUNTRY_STRUCTURE) )
CITY_SCHEMA = StructType.fromJson( generate_schema_dict(CITY_STRUCTURE) )
spark = SparkSession.builder.getOrCreate()
continents_df = generate_dataframe( spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH )
countries_df = generate_dataframe( spark, COUNTRY_SCHEMA , COUNTRIES_FILE_PATH )
cities_df = generate_dataframe( spark, CITY_SCHEMA , CITIES_FILE_PATH )
continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')
print continents_df.count()
print countries_df.count()
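The helpers generate_schema_dict and generate_dataframe are not included in the snippet. Minimal sketches consistent with how they are called above (the bodies are assumptions, not the original implementations) could look like this:

def generate_schema_dict(structure):
    # build a StructType.fromJson-compatible dict from (name, type) pairs
    return {
        "type": "struct",
        "fields": [
            {"name": name, "type": type_name, "nullable": True, "metadata": {}}
            for name, type_name in structure
        ],
    }

def generate_dataframe(spark, schema, file_path):
    # read a headerless CSV file using the given schema
    return spark.read.csv(file_path, schema=schema)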