This article collects typical usage examples of the StructType.fromJson method from pyspark.sql.types in Python. If you have been wondering what Python's StructType.fromJson does, how to use it, or what real-world calls look like, the curated code examples below may help. You can also explore more usage examples of its containing class, pyspark.sql.types.StructType.
Three code examples of StructType.fromJson are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
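For orientation before the examples: StructType.fromJson expects a dict in the format produced by StructType.jsonValue() (or DataFrame.schema.jsonValue()). A minimal, self-contained sketch follows; the field names and types are illustrative, not taken from the examples below.

from pyspark.sql.types import StructType

# A schema dict in the format produced by StructType.jsonValue();
# the field names here are purely illustrative.
schema_dict = {
    "type": "struct",
    "fields": [
        {"name": "user_id", "type": "integer", "nullable": True, "metadata": {}},
        {"name": "rating",  "type": "double",  "nullable": True, "metadata": {}},
    ],
}

schema = StructType.fromJson(schema_dict)
print(schema.simpleString())  # struct<user_id:int,rating:double>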
Example 1: test_rmse
# Required import: from pyspark.sql.types import StructType [as alias]
# Or: from pyspark.sql.types.StructType import fromJson [as alias]
def test_rmse():
    # TODO: revise so that it takes user input instead of hardcoded values
    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create an HDFS directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz", schema=ratings_schema)
    # explicitly repartition the RDD after loading so that more tasks can run on it in parallel
    # (by default, defaultMinPartitions == defaultParallelism == estimated number of cores across all machines in the cluster)
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse the ratings DataFrame into an RDD of (userId, itemId, rating) tuples
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split the data into train (60%) and test (40%) sets
    # TODO: add validation in the future? train (60%), validation (20%), test (20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run the training algorithm to build the model (without validation)
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make predictions
    with Timer() as t:
        testPredRDD = model.predictAll( testRDD.map( lambda x: (x[0], x[1]) ) ).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
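Neither the pm module nor its calculate_rmse_using_rdd helper is shown in this example. As a rough sketch of what such a helper might do, assuming both RDDs hold (userId, itemId, rating)-style tuples (model.predictAll returns Rating namedtuples, which unpack the same way), one could write:

import math

def calculate_rmse_using_rdd(test_rdd, pred_rdd):
    # Key both RDDs by (userId, itemId), join actual with predicted
    # ratings, and take the root of the mean squared error.
    # The body is an assumption; only the name comes from the snippet above.
    ratings = test_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    preds = pred_rdd.map(lambda p: ((p[0], p[1]), p[2]))
    squared_errors = ratings.join(preds).map(
        lambda kv: (kv[1][0] - kv[1][1]) ** 2
    )
    return math.sqrt(squared_errors.mean())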
Example 2: get_twitter_schema
# Required import: from pyspark.sql.types import StructType [as alias]
# Or: from pyspark.sql.types.StructType import fromJson [as alias]
def get_twitter_schema(json_file_name):
    # load a JSON schema definition and rebuild the StructType from it
    with open(json_file_name) as json_file:
        schema_dict = json.load(json_file)
    return StructType.fromJson(schema_dict)
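A schema file in the format this function expects can be produced by dumping an existing DataFrame's schema with jsonValue(). A hedged usage sketch, where df, spark, and the file names are illustrative stand-ins:

import json

# save the schema of an existing DataFrame (df is assumed to exist)
with open("twitter_schema.json", "w") as f:
    json.dump(df.schema.jsonValue(), f)

# later, rebuild the StructType and use it when reading
schema = get_twitter_schema("twitter_schema.json")
tweets = spark.read.json("tweets.json", schema=schema)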
Example 3: generate_schema_dict
# Required import: from pyspark.sql.types import StructType [as alias]
# Or: from pyspark.sql.types.StructType import fromJson [as alias]
CONTINENTS_FILE_PATH = '/opt/SparkDatasets/geography/continents.csv'  # inferred: used below but omitted from the original snippet
COUNTRIES_FILE_PATH = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH = '/opt/SparkDatasets/geography/cities.csv'
CONTINENT_STRUCTURE = \
[ ( 'continent_id' , 'integer' )
, ( 'continent_name', 'string' ) ]
COUNTRY_STRUCTURE = \
[ ( 'country_id' , 'integer' )
, ( 'continent_id', 'integer' )
, ( 'country_name', 'string' ) ]
CITY_STRUCTURE = \
[ ( 'city_id' , 'integer' )
, ( 'country_id', 'integer' )
, ( 'city_name' , 'string' ) ]
CONTINENT_SCHEMA = StructType.fromJson( generate_schema_dict(CONTINENT_STRUCTURE) )
COUNTRY_SCHEMA = StructType.fromJson( generate_schema_dict(COUNTRY_STRUCTURE) )
CITY_SCHEMA = StructType.fromJson( generate_schema_dict(CITY_STRUCTURE) )
spark = SparkSession.builder.getOrCreate()
continents_df = generate_dataframe( spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH )
countries_df = generate_dataframe( spark, COUNTRY_SCHEMA , COUNTRIES_FILE_PATH )
cities_df = generate_dataframe( spark, CITY_SCHEMA , CITIES_FILE_PATH )
continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')
print continents_df.count()
print countries_df.count()
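The helpers generate_schema_dict and generate_dataframe are not included in the snippet. Minimal sketches consistent with how they are called above (the bodies are assumptions, not the original implementations) could look like this:

def generate_schema_dict(structure):
    # build a StructType.fromJson-compatible dict from (name, type) pairs
    return {
        "type": "struct",
        "fields": [
            {"name": name, "type": type_name, "nullable": True, "metadata": {}}
            for name, type_name in structure
        ],
    }

def generate_dataframe(spark, schema, file_path):
    # read a headerless CSV file using the given schema
    return spark.read.csv(file_path, schema=schema)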