Python types.TimestampType Method Code Examples

This article collects typical usage examples of the pyspark.sql.types.TimestampType method in Python. If you are wondering what types.TimestampType does, how to call it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples from the containing module, pyspark.sql.types.


Eight code examples of the types.TimestampType method are shown below, sorted by popularity by default.

Example 1: __init__

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def __init__(self, series: "ks.Series"):
        if not isinstance(series.spark.data_type, (DateType, TimestampType)):
            raise ValueError(
                "Cannot call DatetimeMethods on type {}".format(series.spark.data_type)
            )
        self._data = series

    # Properties 
Developer: databricks | Project: koalas | Lines of code: 10 | Source: datetimes.py
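
For context, a minimal usage sketch (assuming a local Koalas/Spark setup) of how this constructor is reached through the Series.dt accessor:

import pandas as pd
import databricks.koalas as ks

# A Series backed by a Spark TimestampType column passes the isinstance check
# above, so the .dt accessor (which builds DatetimeMethods) works as expected.
kser = ks.from_pandas(pd.Series(pd.to_datetime(["2019-01-01", "2019-02-03"])))
print(kser.dt.year)

# A non-datetime Series fails the check and raises the ValueError shown above:
# ks.Series([1, 2, 3]).dt  # ValueError: Cannot call DatetimeMethods on type LongType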

Example 2: __sub__

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def __sub__(self, other):
        # Note that timestamp subtraction casts arguments to integer. This is to mimic pandas's
        # behaviors. pandas returns 'timedelta64[ns]' from 'datetime64[ns]'s subtraction.
        if isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, TimestampType):
            if not isinstance(other.spark.data_type, TimestampType):
                raise TypeError("datetime subtraction can only be applied to datetime series.")
            return self.astype("bigint") - other.astype("bigint")
        elif isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, DateType):
            if not isinstance(other.spark.data_type, DateType):
                raise TypeError("date subtraction can only be applied to date series.")
            return column_op(F.datediff)(self, other)
        else:
            return column_op(Column.__sub__)(self, other) 
Developer: databricks | Project: koalas | Lines of code: 15 | Source: base.py
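
A usage sketch of this operator (assuming a local Koalas/Spark setup; operations between two different Koalas objects require the compute.ops_on_diff_frames option):

import pandas as pd
import databricks.koalas as ks

ks.set_option("compute.ops_on_diff_frames", True)  # allow ops between two Koalas Series

a = ks.from_pandas(pd.Series(pd.to_datetime(["2019-01-02", "2019-03-01"])))
b = ks.from_pandas(pd.Series(pd.to_datetime(["2019-01-01", "2019-02-01"])))

# Timestamp - timestamp: per the __sub__ code above, both operands are cast to
# bigint (epoch seconds), so the result is an integer number of seconds.
print((a - b).to_pandas())

# Subtracting a non-datetime operand from a datetime Series raises the
# TypeError enforced in __sub__ above.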

Example 3: is_all_dates

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def is_all_dates(self):
        """
        Return True if all data types of the index are datetime.
        Note that since Koalas does not support multiple data types in an index,
        this returns True whenever the index's single data type is datetime.

        Examples
        --------
        >>> from datetime import datetime

        >>> idx = ks.Index([datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0)])
        >>> idx
        DatetimeIndex(['2019-01-01', '2019-02-03'], dtype='datetime64[ns]', freq=None)

        >>> idx.is_all_dates
        True

        >>> idx = ks.Index([datetime(2019, 1, 1, 0, 0, 0), None])
        >>> idx
        DatetimeIndex(['2019-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)

        >>> idx.is_all_dates
        True

        >>> idx = ks.Index([0, 1, 2])
        >>> idx
        Int64Index([0, 1, 2], dtype='int64')

        >>> idx.is_all_dates
        False
        """
        return isinstance(self.spark.data_type, TimestampType) 
Developer: databricks | Project: koalas | Lines of code: 34 | Source: indexes.py

Example 4: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe) 
Developer: databricks | Project: koalas | Lines of code: 39 | Source: typehints.py
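
A quick sanity check of this mapping (the import path below is an assumption based on the typehints.py source file; in Koalas the helper is exposed under databricks.koalas.typedef):

import datetime
import numpy as np
from pyspark.sql import types
# Assumed import path, based on the source file shown above.
from databricks.koalas.typedef import as_spark_type

# Python/NumPy datetime types map to Spark's TimestampType, dates to DateType.
assert as_spark_type(datetime.datetime) == types.TimestampType()
assert as_spark_type(np.datetime64) == types.TimestampType()
assert as_spark_type(datetime.date) == types.DateType()
assert as_spark_type(int) == types.IntegerType()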

Example 5: spark_type_to_pandas_dtype

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def spark_type_to_pandas_dtype(spark_type):
    """ Return the given Spark DataType to pandas dtype. """
    if isinstance(spark_type, (types.DateType, types.UserDefinedType)):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype()) 
Developer: databricks | Project: koalas | Lines of code: 10 | Source: typehints.py
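
And the reverse direction, again with the import path as an assumption:

import numpy as np
from pyspark.sql import types
# Assumed import path, based on the source file shown above.
from databricks.koalas.typedef import spark_type_to_pandas_dtype

assert spark_type_to_pandas_dtype(types.TimestampType()) == np.dtype("datetime64[ns]")
assert spark_type_to_pandas_dtype(types.DateType()) == np.dtype("object")
assert spark_type_to_pandas_dtype(types.LongType()) == np.dtype("int64")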

Example 6: _convert_from_pandas

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def _convert_from_pandas(self, pdf, schema, timezone):
        """
         Convert a pandas.DataFrame to list of records that can be used to make a DataFrame
         :return list of records
        """
        if timezone is not None:
            from pyspark.sql.types import _check_series_convert_timestamps_tz_local
            copied = False
            if isinstance(schema, StructType):
                for field in schema:
                    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                    if isinstance(field.dataType, TimestampType):
                        s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)
                        if s is not pdf[field.name]:
                            if not copied:
                                # Copy once if the series is modified to prevent the original
                                # Pandas DataFrame from being updated
                                pdf = pdf.copy()
                                copied = True
                            pdf[field.name] = s
            else:
                for column, series in pdf.iteritems():
                    s = _check_series_convert_timestamps_tz_local(series, timezone)
                    if s is not series:
                        if not copied:
                            # Copy once if the series is modified to prevent the original
                            # Pandas DataFrame from being updated
                            pdf = pdf.copy()
                            copied = True
                        pdf[column] = s

        # Convert pandas.DataFrame to list of numpy records
        np_records = pdf.to_records(index=False)

        # Check if any columns need to be fixed for Spark to infer properly
        if len(np_records) > 0:
            record_dtype = self._get_numpy_record_dtype(np_records[0])
            if record_dtype is not None:
                return [r.astype(record_dtype).tolist() for r in np_records]

        # Convert list of numpy records to python lists
        return [r.tolist() for r in np_records] 
Developer: pingcap | Project: tidb-docker-compose | Lines of code: 44 | Source: session.py
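
This private helper is the non-Arrow fallback used by SparkSession.createDataFrame. A minimal sketch of triggering it, assuming Spark 2.x where the Arrow config key is spark.sql.execution.arrow.enabled:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Disabling Arrow makes createDataFrame fall back to the record-based path
# that calls _convert_from_pandas internally (Spark 2.x config key).
spark.conf.set("spark.sql.execution.arrow.enabled", "false")

pdf = pd.DataFrame({"ts": pd.to_datetime(["2019-01-01 00:00:00", "2019-02-03 12:30:00"])})
sdf = spark.createDataFrame(pdf)
sdf.printSchema()  # ts: timestamp (nullable = true)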

Example 7: _create_from_pandas_with_arrow

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        jsqlContext = self._wrapped._jsqlContext

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df 
Developer: pingcap | Project: tidb-docker-compose | Lines of code: 61 | Source: session.py
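
With Arrow enabled, createDataFrame routes through this method instead, and datetime64[ns] or tz-aware pandas columns are coerced to Spark TimestampType. A sketch, assuming a Spark 2.x build that ships this variant:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
# Enabling Arrow (Spark 2.x config key) routes createDataFrame through
# _create_from_pandas_with_arrow, coercing datetime columns to TimestampType.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

pdf = pd.DataFrame({"ts": pd.date_range("2019-01-01", periods=4, freq="D")})
sdf = spark.createDataFrame(pdf)
sdf.show()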

Example 8: _create_from_pandas_with_arrow

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import TimestampType [as alias]
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.serializers import ArrowSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        # Create the Spark DataFrame directly from the Arrow data and schema
        jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
        jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
            jrdd, schema.json(), self._wrapped._jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df 
Developer: pingcap | Project: tidb-docker-compose | Lines of code: 53 | Source: session.py


Note: The pyspark.sql.types.TimestampType method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their authors, and copyright of the source code remains with the original authors; please refer to the corresponding project's License before distributing or using it. Do not reproduce without permission.