

Python types.DataType Code Examples

This article collects typical usage examples of the pyspark.sql.types.DataType class in Python. If you are wondering how types.DataType is used in practice, or what real-world code that uses it looks like, the curated examples below should help. You can also explore further usage examples from the pyspark.sql.types module.


The sections below present 11 code examples of types.DataType, sorted by popularity by default.

Example 1: _simplify_data_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify datatype into a tuple of equality information we care about

    Most notably this ignores nullability concerns, because Hive cannot
    represent NOT NULL in its schemas.
    """
    try:
        # Normalize UDT into its SQL form. This allows comparing schemas
        # from Hive and Spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type

    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,) 
Author: wikimedia | Project: search-MjoLniR | Lines: 21 | Source: transform.py
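
A minimal usage sketch (not from the original project; the schemas are made-up) showing how the simplified form ignores nullability, so a Hive-derived and a Spark-derived schema compare equal:

from pyspark.sql import types as T

# Hive cannot express NOT NULL, so its fields come back nullable; Spark's may not be.
hive_schema = T.StructType([T.StructField("id", T.LongType(), nullable=True)])
spark_schema = T.StructType([T.StructField("id", T.LongType(), nullable=False)])

# Both simplify to ('StructType', [('id', ('LongType',))]), so they compare equal.
assert _simplify_data_type(hive_schema) == _simplify_data_type(spark_schema)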

Example 2: infer_pd_series_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype("object"):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        elif hasattr(s[0], "__UDT__"):
            return s[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return from_arrow_type(pa.from_numpy_dtype(dt)) 
Author: databricks | Project: koalas | Lines: 20 | Source: typehints.py
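
A hedged usage sketch (assumes pandas, numpy, and pyarrow are installed and the imports used by the function above are in scope):

import pandas as pd

infer_pd_series_spark_type(pd.Series([1, 2, 3]))                       # expected: LongType
infer_pd_series_spark_type(pd.Series(["a", "b", "c"]))                 # expected: StringType
infer_pd_series_spark_type(pd.Series(pd.to_datetime(["2020-01-01"])))  # expected: TimestampType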

Example 3: default

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def default(value, **kwargs) -> pt.DataType:
    raise com.IbisTypeError('Value {!r} is not a valid datatype'.format(value)) 
Author: ibis-project | Project: ibis | Lines: 4 | Source: datatypes.py

Example 4: from_spark_dtype

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def from_spark_dtype(value: pt.DataType) -> pt.DataType:
    return value 
Author: ibis-project | Project: ibis | Lines: 4 | Source: datatypes.py
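
In the ibis source these two functions are decorated as rules of a multiple-dispatch converter; the decorators were stripped from the snippets above. Below is a rough sketch of the same pattern using the generic multipledispatch library (an assumption for illustration, not the original ibis wiring; the dispatcher name is hypothetical):

from multipledispatch import Dispatcher
import pyspark.sql.types as pt

spark_dtype = Dispatcher("spark_dtype")               # hypothetical dispatcher name
spark_dtype.register(object)(default)                 # fallback: reject values that are not datatypes
spark_dtype.register(pt.DataType)(from_spark_dtype)   # a Spark DataType passes through unchanged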

Example 5: smvArrayFlatten

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def smvArrayFlatten(self, elemType):
        """smvArrayFlatten helper applies flatten operation on an Array of Array
            column.

            Example:
                >>> df.select(col('arrayOfArrayOfStr').smvArrayFlatten(StringType()))

            Args:
                elemType (DataType or DataFrame): the array element's data type,
                    given either directly as a DataType object, or as a DataFrame
                    from which the element data type is inferred
        """
        if isinstance(elemType, DataType):
            elemTypeJson = elemType.json()
        elif isinstance(elemType, DataFrame):
            elemTypeJson = elemType.select(self.col)\
                .schema.fields[0].dataType.elementType.elementType.json()
        else:
            raise SmvRuntimeError("smvArrayFlatten does not support type: {}".format(type(elemType)))

        jc = self._jColumnHelper.smvArrayFlatten(elemTypeJson)
        return Column(jc)



# Initialize DataFrame and Column with helper methods. Called by SmvApp. 
Author: TresAmigosSD | Project: SMV | Lines: 28 | Source: helpers.py

Example 6: _count_expr

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
        # Handle floating point types specially because Spark's count treats NaN as a valid value,
        # whereas pandas' count does not include NaN.
        if isinstance(spark_type, (FloatType, DoubleType)):
            return F.count(F.nanvl(col, F.lit(None)))
        else:
            return F.count(col) 
Author: databricks | Project: koalas | Lines: 9 | Source: generic.py
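
A hedged sketch of the difference (assumes an active SparkSession named spark and that _count_expr is in scope):

import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType

df = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], ["x"])
df.select(
    F.count("x").alias("spark_count"),                                 # 2: the NaN row is counted
    _count_expr(F.col("x"), DoubleType()).alias("pandas_like_count"),  # 1: NaN is treated as missing
).show()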

Example 7: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, return the equivalent Spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe) 
Author: databricks | Project: koalas | Lines: 39 | Source: typehints.py
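
A few illustrative calls, with the results expected from the branches above:

as_spark_type(int)          # IntegerType
as_spark_type("float64")    # DoubleType
as_spark_type(bool)         # BooleanType
as_spark_type(np.ndarray)   # ArrayType(StringType)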

Example 8: spark_type_to_pandas_dtype

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def spark_type_to_pandas_dtype(spark_type):
    """ Return the given Spark DataType to pandas dtype. """
    if isinstance(spark_type, (types.DateType, types.UserDefinedType)):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype()) 
Author: databricks | Project: koalas | Lines: 10 | Source: typehints.py
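
Expected behaviour for a few common types (a sketch; the last line needs pyarrow for the Arrow-backed branch):

spark_type_to_pandas_dtype(types.DateType())       # dtype('O')
spark_type_to_pandas_dtype(types.TimestampType())  # dtype('<M8[ns]')
spark_type_to_pandas_dtype(types.LongType())       # dtype('int64')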

Example 9: _return_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def _return_type(self) -> DataType:
        hint = self._return_type_hint
        # The logic is simple for now, because it corresponds to the default
        # case: continuous predictions
        # TODO: do something smarter, for example when there is a sklearn.Classifier (it should
        # return an integer or a categorical)
        # We can do the same for pytorch/tensorflow/keras models by looking at the output types.
        # However, this is probably better done in mlflow than here.
        if hint == "infer" or not hint:
            hint = np.float64
        return as_spark_type(hint) 
Author: databricks | Project: koalas | Lines: 13 | Source: mlflow.py

Example 10: from_json

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def from_json(col, schema, options={}):
    """
    Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`
    as the key type, or into a :class:`StructType` or :class:`ArrayType` with
    the specified schema. Returns ``null`` if the string is unparseable.

    :param col: string column in json format
    :param schema: a StructType or ArrayType of StructType to use when parsing the json column.
    :param options: options to control parsing. Accepts the same options as the JSON datasource.

    .. note:: Since Spark 2.3, the DDL-formatted string or a JSON format string is also
              supported for ``schema``.

    >>> from pyspark.sql.types import *
    >>> data = [(1, '''{"a": 1}''')]
    >>> schema = StructType([StructField("a", IntegerType())])
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> df.select(from_json(df.value, "a INT").alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()
    [Row(json={u'a': 1})]
    >>> data = [(1, '''[{"a": 1}]''')]
    >>> schema = ArrayType(StructType([StructField("a", IntegerType())]))
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=[Row(a=1)])]
    >>> schema = schema_of_json(lit('''{"a": 0}'''))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=Row(a=1))]
    >>> data = [(1, '''[1, 2, 3]''')]
    >>> schema = ArrayType(IntegerType())
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(from_json(df.value, schema).alias("json")).collect()
    [Row(json=[1, 2, 3])]
    """

    sc = SparkContext._active_spark_context
    if isinstance(schema, DataType):
        schema = schema.json()
    elif isinstance(schema, Column):
        schema = _to_java_column(schema)
    jc = sc._jvm.functions.from_json(_to_java_column(col), schema, options)
    return Column(jc) 
Author: runawayhorse001 | Project: LearningApacheSpark | Lines: 47 | Source: functions.py

Example 11: udf

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DataType [as alias]
def udf(f=None, returnType=StringType()):
    """Creates a user defined function (UDF).

    .. note:: The user-defined functions are considered deterministic by default. Due to
        optimization, duplicate invocations may be eliminated or the function may even be invoked
        more times than it is present in the query. If your function is not deterministic, call
        `asNondeterministic` on the user defined function. E.g.:

    >>> from pyspark.sql.types import IntegerType
    >>> import random
    >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()

    .. note:: User-defined functions do not support conditional expressions or short-circuiting
        in boolean expressions; they end up being evaluated for all rows internally. If a function
        can fail on special rows, the workaround is to incorporate the condition into the function
        itself.

    .. note:: The user-defined functions do not take keyword arguments on the calling side.

    :param f: python function if used as a standalone function
    :param returnType: the return type of the user-defined function. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    >>> from pyspark.sql.types import IntegerType
    >>> slen = udf(lambda s: len(s), IntegerType())
    >>> @udf
    ... def to_upper(s):
    ...     if s is not None:
    ...         return s.upper()
    ...
    >>> @udf(returnType=IntegerType())
    ... def add_one(x):
    ...     if x is not None:
    ...         return x + 1
    ...
    >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
    >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()
    +----------+--------------+------------+
    |slen(name)|to_upper(name)|add_one(age)|
    +----------+--------------+------------+
    |         8|      JOHN DOE|          22|
    +----------+--------------+------------+
    """
    # decorator @udf, @udf(), @udf(dataType())
    if f is None or isinstance(f, (str, DataType)):
        # If DataType has been passed as a positional argument
        # for decorator use it as a returnType
        return_type = f or returnType
        return functools.partial(_create_udf, returnType=return_type,
                                 evalType=PythonEvalType.SQL_BATCHED_UDF)
    else:
        return _create_udf(f=f, returnType=returnType,
                           evalType=PythonEvalType.SQL_BATCHED_UDF) 
Author: runawayhorse001 | Project: LearningApacheSpark | Lines: 54 | Source: functions.py


Note: the pyspark.sql.types.DataType examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors, and any redistribution or use should follow the license of the corresponding project. Please do not reproduce without permission.