This article collects typical usage examples of the Python class pyspark.sql.types.StructType. If you are wondering what the StructType class is for, how to use it, or are looking for working examples, the curated class code examples below may help.
The following shows 10 code examples of the StructType class, sorted by popularity by default.
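All of the snippets below rely on the same small set of imports from pyspark.sql.types. As a warm-up, here is a minimal, self-contained sketch (not taken from any of the projects below) that builds a StructType from a list of StructField objects and inspects it:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# A two-column schema: a nullable string column and a nullable integer column
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])

print(schema.simpleString())   # struct<name:string,age:int>
print(schema.fieldNames())     # ['name', 'age']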
Example 1: _get_schema
def _get_schema(header, schema):
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            return None
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]
        schema = StructType()
        try:
            schema.add(StructField(header[0], IntegerType())).add(
                StructField(header[1], IntegerType())
            ).add(StructField(header[2], FloatType())).add(
                StructField(header[3], LongType())
            )
        except IndexError:
            pass
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)
        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]
    return schema
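The function above builds the standard four-column MovieLens schema (user, item, rating, timestamp) when only a header is supplied. A hedged usage sketch follows; the column names are illustrative and the module's warning constants are not shown in the excerpt:

# Hypothetical call with made-up column names
schema = _get_schema(header=("UserId", "MovieId", "Rating", "Timestamp"), schema=None)
print(schema.simpleString())
# struct<UserId:int,MovieId:int,Rating:float,Timestamp:bigint>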
Example 2: get_spark_schema
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
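This helper targets the Criteo layout of one label column, 13 integer feature columns, and 26 categorical columns, so the header must supply 40 names. The module's DEFAULT_HEADER is not shown in the excerpt; the sketch below fabricates a compatible header and passes the resulting schema to a CSV reader so Spark skips schema inference:

# Illustrative header, not the module's DEFAULT_HEADER
header = (
    ["label"]
    + ["int{0:02d}".format(i) for i in range(13)]
    + ["cat{0:02d}".format(i) for i in range(26)]
)
schema = get_spark_schema(header)
print(len(schema.fields))  # 40

# df = spark.read.csv("criteo_sample.tsv", sep="\t", schema=schema)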
Example 3: test_rmse
def test_rmse():
    # TODO: revise so that it takes user inputs instead of hardcoded values

    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create an HDFS directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(
        "hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz",
        schema=ratings_schema,
    )
    # explicitly repartition the RDD after loading so that more tasks can run on it in parallel
    # by default, defaultMinPartitions == defaultParallelism
    # == estimated # of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse the ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%) and test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test (20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run the training algorithm to build the model (without validation)
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
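The helper pm.calculate_rmse_using_rdd is not shown in the excerpt. A plausible minimal sketch, assuming testRDD holds (user, item, rating) tuples and testPredRDD holds pyspark.mllib.recommendation.Rating records as returned by ALS.predictAll:

from math import sqrt

def calculate_rmse_using_rdd(test_rdd, pred_rdd):
    # Key both RDDs by (user, item), join them, and average the squared error
    actual = test_rdd.map(lambda r: ((r[0], r[1]), r[2]))
    predicted = pred_rdd.map(lambda r: ((r.user, r.product), r.rating))
    squared_errors = actual.join(predicted).map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
    return sqrt(squared_errors.mean())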
Example 4: __init__
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(scoreAndLabels.first())
    schema = StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
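The two-column schema covers the usual (score, label) input, and the optional third "weight" column is appended with StructType.add when weighted triples are passed in (supported in newer Spark releases). A brief usage sketch, assuming sc is an existing SparkContext:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

score_and_labels = sc.parallelize([(0.1, 0.0), (0.4, 0.0), (0.8, 1.0), (0.9, 1.0)])
metrics = BinaryClassificationMetrics(score_and_labels)
print(metrics.areaUnderROC)  # 1.0 for this perfectly separable toy data
print(metrics.areaUnderPR)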
Example 5: __init__
def __init__(self, predAndLabelsWithOptWeight):
    sc = predAndLabelsWithOptWeight.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predAndLabelsWithOptWeight.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predAndLabelsWithOptWeight, schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
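Both metric constructors lean on the fact that StructType.add accepts either a ready-made StructField or a (name, dataType, nullable) triple, and that it returns the schema itself so calls can be chained. A small sketch of the two equivalent forms:

from pyspark.sql.types import StructType, StructField, DoubleType

s1 = StructType().add(StructField("weight", DoubleType(), False))
s2 = StructType().add("weight", DoubleType(), False)
print(s1 == s2)                # True
print(s2["weight"].nullable)   # False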
Example 6: createDataFrame
def createDataFrame(self, data, schema=None, samplingRatio=None):
    """
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.

    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of :class:`Row`,
    or :class:`namedtuple`, or :class:`dict`.

    When ``schema`` is :class:`DataType` or a datatype string, it must match the real data,
    or an exception will be thrown at runtime. If the given schema is not a StructType, it
    will be wrapped into a StructType as its only field, and the field name will be "value";
    each record will also be wrapped into a tuple, which can be converted to a row later.

    If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
    rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

    :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
        etc.), or :class:`list`, or :class:`pandas.DataFrame`.
    :param schema: a :class:`DataType` or a datatype string or a list of column names, default
        is None. The data type string format equals `DataType.simpleString`, except that the
        top-level struct type can omit the `struct<>` and atomic types use `typeName()` as
        their format, e.g. use `byte` instead of `tinyint` for ByteType. We can also use `int`
        as a short name for IntegerType.
    :param samplingRatio: the sample ratio of rows used for inferring
    :return: :class:`DataFrame`

    .. versionchanged:: 2.0
        The schema parameter can be a DataType or a datatype string after 2.0. If it's not a
        StructType, it will be wrapped into a StructType and each record will also be wrapped
        into a tuple.

    >>> l = [('Alice', 1)]
    >>> spark.createDataFrame(l).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> spark.createDataFrame(l, ['name', 'age']).collect()
    [Row(name=u'Alice', age=1)]

    >>> d = [{'name': 'Alice', 'age': 1}]
    >>> spark.createDataFrame(d).collect()
    [Row(age=1, name=u'Alice')]

    >>> rdd = sc.parallelize(l)
    >>> spark.createDataFrame(rdd).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> df = spark.createDataFrame(rdd, ['name', 'age'])
    >>> df.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql import Row
    >>> Person = Row('name', 'age')
    >>> person = rdd.map(lambda r: Person(*r))
    >>> df2 = spark.createDataFrame(person)
    >>> df2.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ...     StructField("name", StringType(), True),
    ...     StructField("age", IntegerType(), True)])
    >>> df3 = spark.createDataFrame(rdd, schema)
    >>> df3.collect()
    [Row(name=u'Alice', age=1)]

    >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
    [Row(name=u'Alice', age=1)]
    >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
    [Row(0=1, 1=2)]

    >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
    [Row(a=u'Alice', b=1)]
    >>> rdd = rdd.map(lambda row: row[1])
    >>> spark.createDataFrame(rdd, "int").collect()
    [Row(value=1)]
    >>> spark.createDataFrame(rdd, "boolean").collect()  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    Py4JJavaError: ...
    """
    if isinstance(data, DataFrame):
        raise TypeError("data is already a DataFrame")

    if isinstance(schema, basestring):
        schema = _parse_datatype_string(schema)

    try:
        import pandas
        has_pandas = True
    except Exception:
        has_pandas = False
    if has_pandas and isinstance(data, pandas.DataFrame):
        if schema is None:
            schema = [str(x) for x in data.columns]
        data = [r.tolist() for r in data.to_records(index=False)]

    if isinstance(schema, StructType):
        def prepare(obj):
            _verify_type(obj, schema)
#......... part of the code is omitted here .........
Example 7: createDataFrame
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True):
    """
    Creates a :class:`DataFrame` from an :class:`RDD`, a list or a :class:`pandas.DataFrame`.

    When ``schema`` is a list of column names, the type of each column
    will be inferred from ``data``.

    When ``schema`` is ``None``, it will try to infer the schema (column names and types)
    from ``data``, which should be an RDD of :class:`Row`,
    or :class:`namedtuple`, or :class:`dict`.

    When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must match
    the real data, or an exception will be thrown at runtime. If the given schema is not
    :class:`pyspark.sql.types.StructType`, it will be wrapped into a
    :class:`pyspark.sql.types.StructType` as its only field, and the field name will be
    "value"; each record will also be wrapped into a tuple, which can be converted to a
    row later.

    If schema inference is needed, ``samplingRatio`` is used to determine the ratio of
    rows used for schema inference. The first row will be used if ``samplingRatio`` is ``None``.

    :param data: an RDD of any kind of SQL data representation (e.g. row, tuple, int, boolean,
        etc.), or :class:`list`, or :class:`pandas.DataFrame`.
    :param schema: a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
        column names, default is ``None``. The data type string format equals
        :class:`pyspark.sql.types.DataType.simpleString`, except that the top-level struct
        type can omit the ``struct<>`` and atomic types use ``typeName()`` as their format,
        e.g. use ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`.
        We can also use ``int`` as a short name for ``IntegerType``.
    :param samplingRatio: the sample ratio of rows used for inferring
    :param verifySchema: verify data types of every row against the schema.
    :return: :class:`DataFrame`

    .. versionchanged:: 2.1
        Added verifySchema.

    .. note:: Usage with spark.sql.execution.arrow.enabled=True is experimental.

    >>> l = [('Alice', 1)]
    >>> spark.createDataFrame(l).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> spark.createDataFrame(l, ['name', 'age']).collect()
    [Row(name=u'Alice', age=1)]

    >>> d = [{'name': 'Alice', 'age': 1}]
    >>> spark.createDataFrame(d).collect()
    [Row(age=1, name=u'Alice')]

    >>> rdd = sc.parallelize(l)
    >>> spark.createDataFrame(rdd).collect()
    [Row(_1=u'Alice', _2=1)]
    >>> df = spark.createDataFrame(rdd, ['name', 'age'])
    >>> df.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql import Row
    >>> Person = Row('name', 'age')
    >>> person = rdd.map(lambda r: Person(*r))
    >>> df2 = spark.createDataFrame(person)
    >>> df2.collect()
    [Row(name=u'Alice', age=1)]

    >>> from pyspark.sql.types import *
    >>> schema = StructType([
    ...     StructField("name", StringType(), True),
    ...     StructField("age", IntegerType(), True)])
    >>> df3 = spark.createDataFrame(rdd, schema)
    >>> df3.collect()
    [Row(name=u'Alice', age=1)]

    >>> spark.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
    [Row(name=u'Alice', age=1)]
    >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
    [Row(0=1, 1=2)]

    >>> spark.createDataFrame(rdd, "a: string, b: int").collect()
    [Row(a=u'Alice', b=1)]
    >>> rdd = rdd.map(lambda row: row[1])
    >>> spark.createDataFrame(rdd, "int").collect()
    [Row(value=1)]
    >>> spark.createDataFrame(rdd, "boolean").collect()  # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
        ...
    Py4JJavaError: ...
    """
    SparkSession._activeSession = self
    self._jvm.SparkSession.setActiveSession(self._jsparkSession)
    if isinstance(data, DataFrame):
        raise TypeError("data is already a DataFrame")

    if isinstance(schema, basestring):
        schema = _parse_datatype_string(schema)
    elif isinstance(schema, (list, tuple)):
        # Must re-encode any unicode strings to be consistent with StructField names
        schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in schema]

    try:
        import pandas
        has_pandas = True
    except Exception:
        has_pandas = False
#......... part of the code is omitted here .........
Example 8: _create_from_pandas_with_arrow
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from distutils.version import LooseVersion
    from pyspark.serializers import ArrowStreamPandasSerializer
    from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    import pyarrow as pa

    # Create the Spark schema from list of names passed in with Arrow types
    if isinstance(schema, (list, tuple)):
        if LooseVersion(pa.__version__) < LooseVersion("0.12.0"):
            temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False)
            arrow_schema = temp_batch.schema
        else:
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
        struct = StructType()
        for name, field in zip(schema, arrow_schema):
            struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
        schema = struct

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create list of Arrow (columns, type) for serializer dump_stream
    arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                  for pdf_slice in pdf_slices]

    jsqlContext = self._wrapped._jsqlContext

    safecheck = self._wrapped._conf.arrowSafeTypeConversion()
    col_by_name = True  # col by name only applies to StructType columns, can't happen here
    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
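This Arrow path is normally reached indirectly through SparkSession.createDataFrame once Arrow conversion is enabled. A hedged sketch of how it is typically triggered in Spark 2.3/2.4 (Spark 3.x renames the setting to spark.sql.execution.arrow.pyspark.enabled), assuming spark is an active SparkSession:

import pandas as pd

spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"name": ["Alice", "Bob"], "age": [1, 2]})
df = spark.createDataFrame(pdf)  # may take the Arrow fast path shown above
df.printSchema()

pdf2 = df.toPandas()  # the reverse conversion also uses Arrow when enabled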
Example 9: get_twitter_schema
def get_twitter_schema(json_file_name):
    schema_dict = json.load(open(json_file_name))
    schema_struct = StructType.fromJson(schema_dict)
    return schema_struct
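StructType.fromJson expects the dict produced by StructType.jsonValue (equivalently, json.loads of schema.json()), so schemas can be persisted to disk and reloaded later; Example 3 loads its MovieLens schemas the same way. A short round-trip sketch using the helper above:

import json
from pyspark.sql.types import StructType, StructField, StringType, LongType

schema = StructType([
    StructField("user_id", LongType(), True),
    StructField("text", StringType(), True),
])

# Persist the schema as JSON ...
with open("tweet_schema.json", "w") as f:
    f.write(schema.json())

# ... and reload it later
reloaded = get_twitter_schema("tweet_schema.json")
print(reloaded == schema)  # True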
Example 10: generate_schema_dict
COUNTRIES_FILE_PATH = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH    = '/opt/SparkDatasets/geography/cities.csv'

CONTINENT_STRUCTURE = \
    [ ( 'continent_id'  , 'integer' )
    , ( 'continent_name', 'string'  ) ]

COUNTRY_STRUCTURE = \
    [ ( 'country_id'  , 'integer' )
    , ( 'continent_id', 'integer' )
    , ( 'country_name', 'string'  ) ]

CITY_STRUCTURE = \
    [ ( 'city_id'   , 'integer' )
    , ( 'country_id', 'integer' )
    , ( 'city_name' , 'string'  ) ]

CONTINENT_SCHEMA = StructType.fromJson( generate_schema_dict(CONTINENT_STRUCTURE) )
COUNTRY_SCHEMA   = StructType.fromJson( generate_schema_dict(COUNTRY_STRUCTURE) )
CITY_SCHEMA      = StructType.fromJson( generate_schema_dict(CITY_STRUCTURE) )

spark = SparkSession.builder.getOrCreate()

continents_df = generate_dataframe( spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH )
countries_df  = generate_dataframe( spark, COUNTRY_SCHEMA  , COUNTRIES_FILE_PATH )
cities_df     = generate_dataframe( spark, CITY_SCHEMA     , CITIES_FILE_PATH )

continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')

print continents_df.count()
print countries_df.count()
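generate_schema_dict and generate_dataframe (and the CONTINENTS_FILE_PATH constant) are not shown in the excerpt. Since StructType.fromJson expects a dict shaped like the output of StructType.jsonValue, a plausible sketch of the schema helper might look like this; the nullable and metadata defaults are assumptions:

def generate_schema_dict(structure):
    # structure is a list of (column_name, simple_type_name) pairs,
    # e.g. [('country_id', 'integer'), ('country_name', 'string')]
    return {
        'type': 'struct',
        'fields': [
            {'name': name, 'type': type_name, 'nullable': True, 'metadata': {}}
            for name, type_name in structure
        ],
    }

generate_dataframe presumably wraps something like spark.read.csv(path, schema=schema), but that is a guess based on the call sites above.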