

Python types.is_datetime64_dtype method: code examples

This article collects typical usage examples of the pandas.api.types.is_datetime64_dtype method in Python. If you are wondering what exactly types.is_datetime64_dtype does or how to use it, the curated examples below may help. You can also explore further usage of its containing module, pandas.api.types.


Below are 9 code examples of the types.is_datetime64_dtype method, sorted by popularity by default.
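
Before the examples, a minimal sketch of the method itself (plain pandas): it returns True only for tz-naive datetime64 data, while tz-aware series are matched by is_datetime64tz_dtype instead, which is why the examples below frequently test both.

import pandas as pd
from pandas.api.types import is_datetime64_dtype

naive = pd.Series(pd.to_datetime(["2021-01-01", "2021-01-02"]))
print(is_datetime64_dtype(naive))                        # True
print(is_datetime64_dtype(pd.Series([1, 2, 3])))         # False
# tz-aware data has dtype datetime64[ns, tz] and is NOT matched:
print(is_datetime64_dtype(naive.dt.tz_localize("UTC")))  # False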

Example 1: _get_columns_info

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _get_columns_info(self, stats):
        column_info = {}
        column_info[self.TYPE_CONSTANT] = stats['uniques'][stats['uniques'] == 1].index
        column_info[self.TYPE_BOOL] = stats['uniques'][stats['uniques'] == 2].index
        rest_columns = self.get_columns(self.df,
                                        self.EXCLUDE,
                                        column_info['constant'].union(column_info['bool']))
        column_info[self.TYPE_NUMERIC] = pd.Index([c for c in rest_columns
                                                   if types.is_numeric_dtype(self.df[c])])
        rest_columns = self.get_columns(
            self.df[rest_columns], self.EXCLUDE, column_info['numeric'])
        column_info[self.TYPE_DATE] = pd.Index([c for c in rest_columns
                                                if types.is_datetime64_dtype(self.df[c])])
        rest_columns = self.get_columns(
            self.df[rest_columns], self.EXCLUDE, column_info['date'])
        unique_columns = stats['uniques'][rest_columns] == stats['counts'][rest_columns]
        column_info[self.TYPE_UNIQUE] = stats['uniques'][rest_columns][unique_columns].index
        column_info[self.TYPE_CATEGORICAL] = stats['uniques'][rest_columns][~unique_columns].index
        return column_info 
Author: mouradmourafiq | Project: pandas-summary | Lines: 21 | Source: __init__.py
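
The date-detection step of this classifier, reproduced standalone on a toy frame (a sketch; df here is a hypothetical stand-in for the class's self.df):

import pandas as pd
from pandas.api import types

df = pd.DataFrame({
    "when": pd.to_datetime(["2020-01-01", "2020-06-01"]),
    "value": [1.5, 2.5],
})
# same test the classifier applies to the remaining columns
date_cols = pd.Index([c for c in df.columns if types.is_datetime64_dtype(df[c])])
print(list(date_cols))  # ['when']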

Example 2: infer_pd_series_spark_type

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype("object"):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        elif hasattr(s[0], "__UDT__"):
            return s[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return from_arrow_type(pa.from_numpy_dtype(dt)) 
Author: databricks | Project: koalas | Lines: 20 | Source: typehints.py
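
The datetime branch above can be exercised without Spark; a minimal sketch of which dtypes take the TimestampType() path:

import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

naive = pd.Series(pd.to_datetime(["2020-01-01"]))
aware = naive.dt.tz_localize("US/Eastern")
for s in (naive, aware):
    # both would map to types.TimestampType() in infer_pd_series_spark_type
    print(s.dtype, is_datetime64_dtype(s.dtype) or is_datetime64tz_dtype(s.dtype))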

Example 3: _check_series_convert_timestamps_localize

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):
    """
    Convert timestamp to timezone-naive in the specified timezone or local timezone

    :param s: a pandas.Series
    :param from_timezone: the timezone to convert from. if None then use local timezone
    :param to_timezone: the timezone to convert to. if None then use local timezone
    :return: pandas.Series converted to tz-naive timestamps if it held timestamps, otherwise unchanged
    """
    from pyspark.sql.utils import require_minimum_pandas_version
    require_minimum_pandas_version()

    import pandas as pd
    from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
    from_tz = from_timezone or _get_local_timezone()
    to_tz = to_timezone or _get_local_timezone()
    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    if is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert(to_tz).dt.tz_localize(None)
    elif is_datetime64_dtype(s.dtype) and from_tz != to_tz:
        # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT.
        return s.apply(
            lambda ts: ts.tz_localize(from_tz, ambiguous=False).tz_convert(to_tz).tz_localize(None)
            if ts is not pd.NaT else pd.NaT)
    else:
        return s 
Author: runawayhorse001 | Project: LearningApacheSpark | Lines: 28 | Source: types.py
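
The core conversion can be reproduced in plain pandas; a sketch with fixed zones standing in for from_timezone/to_timezone:

import pandas as pd

s = pd.Series(pd.to_datetime(["2020-03-01 12:00"]))  # tz-naive
out = (s.dt.tz_localize("US/Eastern")   # interpret as Eastern wall time
        .dt.tz_convert("Asia/Tokyo")    # shift to Tokyo
        .dt.tz_localize(None))          # drop the tz, keep Tokyo wall time
print(out[0])  # 2020-03-02 02:00:00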

Example 4: class_from_dtype

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def class_from_dtype(dtype) -> type:
        """
        Determine ColumnType class, based on pandas/numpy `dtype`.
        """
        if is_numeric_dtype(dtype):
            return ColumnType.NUMBER
        elif is_datetime64_dtype(dtype):
            return ColumnType.DATETIME
        elif dtype == object or dtype == "category":
            return ColumnType.TEXT
        else:
            raise ValueError(f"Unknown dtype: {dtype}") 
Author: CJWorkbench | Project: cjworkbench | Lines: 14 | Source: types.py
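
A standalone rendering of the same dispatch (sketch; the ColumnType enum itself is not reproduced here, the string labels stand in for its members):

import numpy as np
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype

for dtype in (np.dtype("float64"), np.dtype("datetime64[ns]"), np.dtype("O")):
    if is_numeric_dtype(dtype):
        kind = "NUMBER"
    elif is_datetime64_dtype(dtype):
        kind = "DATETIME"
    else:
        kind = "TEXT"
    print(dtype, "->", kind)  # float64 -> NUMBER, datetime64[ns] -> DATETIME, object -> TEXT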

Example 5: _create_from_pandas_with_arrow

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.serializers import ArrowStreamSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        jsqlContext = self._wrapped._jsqlContext

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df 
Author: pingcap | Project: tidb-docker-compose | Lines: 61 | Source: session.py
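
The timestamp-coercion test over pdf.dtypes, isolated from Spark (sketch):

import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

pdf = pd.DataFrame({"ts": pd.to_datetime(["2020-01-01"]), "x": [1]})
# True marks the columns that would be coerced to Arrow timestamps
print([is_datetime64_dtype(t) or is_datetime64tz_dtype(t) for t in pdf.dtypes])
# [True, False]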

Example 6: _create_from_pandas_with_arrow (an earlier variant using ArrowSerializer)

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.serializers import ArrowSerializer, _create_batch
        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
        from pyspark.sql.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [to_arrow_type(TimestampType())
                           if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                           for t in pdf.dtypes]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

        # Create Arrow record batches
        batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                                 timezone)
                   for pdf_slice in pdf_slices]

        # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
        if isinstance(schema, (list, tuple)):
            struct = from_arrow_schema(batches[0].schema)
            for i, name in enumerate(schema):
                struct.fields[i].name = name
                struct.names[i] = name
            schema = struct

        # Create the Spark DataFrame directly from the Arrow data and schema
        jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
        jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
            jrdd, schema.json(), self._wrapped._jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df 
Author: pingcap | Project: tidb-docker-compose | Lines: 53 | Source: session.py
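
The slicing step in both variants uses a ceiling-division idiom worth noting; a minimal demonstration:

# -(-a // b) is ceil(a / b) using only integer arithmetic
len_pdf, parallelism = 7, 3
step = -(-len_pdf // parallelism)            # 3
slices = [list(range(start, min(start + step, len_pdf)))
          for start in range(0, len_pdf, step)]
print(step, slices)                          # 3 [[0, 1, 2], [3, 4, 5], [6]]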

Example 7: _check_series_convert_timestamps_internal

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _check_series_convert_timestamps_internal(s, timezone):
    """
    Convert a tz-naive timestamp in the specified timezone or local timezone to UTC normalized for
    Spark internal storage

    :param s: a pandas.Series
    :param timezone: the timezone to convert. if None then use local timezone
    :return: pandas.Series normalized to UTC and made tz-naive if it held timestamps, otherwise unchanged
    """
    from pyspark.sql.utils import require_minimum_pandas_version
    require_minimum_pandas_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    if is_datetime64_dtype(s.dtype):
        # When tz_localize a tz-naive timestamp, the result is ambiguous if the tz-naive
        # timestamp falls in the hour when the clock is adjusted backward due to
        # daylight saving time (dst).
        # E.g., for America/New_York, the clock is adjusted backward on 2015-11-01 2:00 to
        # 2015-11-01 1:00 from dst-time to standard time, and therefore, when tz_localize
        # a tz-naive timestamp 2015-11-01 1:30 with America/New_York timezone, it can be either
        # dst time (2015-11-01 1:30-0400) or standard time (2015-11-01 1:30-0500).
        #
        # Here we explicitly choose to use standard time. This matches the default behavior of
        # pytz.
        #
        # Here is some code to help understand this behavior:
        # >>> import datetime
        # >>> import pandas as pd
        # >>> import pytz
        # >>>
        # >>> t = datetime.datetime(2015, 11, 1, 1, 30)
        # >>> ts = pd.Series([t])
        # >>> tz = pytz.timezone('America/New_York')
        # >>>
        # >>> ts.dt.tz_localize(tz, ambiguous=True)
        # 0   2015-11-01 01:30:00-04:00
        # dtype: datetime64[ns, America/New_York]
        # >>>
        # >>> ts.dt.tz_localize(tz, ambiguous=False)
        # 0   2015-11-01 01:30:00-05:00
        # dtype: datetime64[ns, America/New_York]
        # >>>
        # >>> str(tz.localize(t))
        # '2015-11-01 01:30:00-05:00'
        tz = timezone or _get_local_timezone()
        return s.dt.tz_localize(tz, ambiguous=False).dt.tz_convert('UTC')
    elif is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert('UTC')
    else:
        return s 
Author: runawayhorse001 | Project: LearningApacheSpark | Lines: 53 | Source: types.py
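
The standard-time reading that the long comment describes, run standalone (sketch):

import pandas as pd

s = pd.Series(pd.to_datetime(["2015-11-01 01:30"]))  # ambiguous wall time
utc = s.dt.tz_localize("America/New_York", ambiguous=False).dt.tz_convert("UTC")
print(utc[0])  # 2015-11-01 06:30:00+00:00  (01:30 read as -0500 standard time)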

Example 8: from_pandas

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def from_pandas(pdf: pd.DataFrame) -> "InternalFrame":
        """ Create an immutable DataFrame from pandas DataFrame.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        columns = pdf.columns
        data_columns = [name_like_string(col) for col in columns]
        if isinstance(columns, pd.MultiIndex):
            column_labels = columns.tolist()
        else:
            column_labels = None
        column_label_names = columns.names

        index_names = [
            name if name is None or isinstance(name, tuple) else (name,) for name in pdf.index.names
        ]
        index_columns = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(index_names))]

        pdf = pdf.copy()
        pdf.index.names = index_columns
        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns
        schema = StructType(
            [
                StructField(
                    name, infer_pd_series_spark_type(col), nullable=bool(col.isnull().any()),
                )
                for name, col in reset_index.iteritems()
            ]
        )
        for name, col in reset_index.iteritems():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})
        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict(zip(index_columns, index_names)),
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        ) 
Author: databricks | Project: koalas | Lines: 46 | Source: internal.py
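
The NaN-to-None pass with its datetime guard, shown on a toy frame (sketch; the guard keeps NaT in datetime columns, while other columns get NaN replaced with None):

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

df = pd.DataFrame({"ts": pd.to_datetime(["2020-01-01", None]), "x": [1.0, np.nan]})
for name, col in df.items():
    if is_datetime64_dtype(col.dtype) or is_datetime64tz_dtype(col.dtype):
        continue  # datetime columns keep NaT untouched
    df[name] = col.replace({np.nan: None})
print(df["x"].tolist())  # [1.0, None]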

Example 9: autocast_series_dtype

# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def autocast_series_dtype(series: pd.Series) -> pd.Series:
    """
    Cast any sane Series to str/category[str]/number/datetime.

    This is appropriate when parsing CSV data or Excel data. It _seems_
    appropriate when a search-and-replace produces numeric columns like
    '$1.32' => '1.32' ... but perhaps that's only appropriate in very-specific
    cases.

    The input must be "sane": if the dtype is object or category, we assume
    _every value_ is str (or null).

    If the series is all-null, do nothing.

    Avoid spurious calls to this function: it's expensive.

    TODO handle dates and maybe booleans.
    """
    if series.dtype == object:
        nulls = series.isnull()
        if (nulls | (series == "")).all():
            return series
        try:
            # If it all looks like numbers (like in a CSV), cast to number.
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # Otherwise, we want all-string. Is that what we already have?
            #
            # TODO assert that we already have all-string, and nix this
            # spurious conversion.
            array = series[~nulls].array
            if any(type(x) != str for x in array):
                series = series.astype(str)
                series[nulls] = None
            return series
    elif hasattr(series, "cat"):
        # Categorical series. Try to infer type of series.
        #
        # Assume categories are all str: after all, we're assuming the input is
        # "sane" and "sane" means only str categories are valid.
        if (series.isnull() | (series == "")).all():
            return series
        try:
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # We don't cast categories to str here -- because we have no
            # callers that would create categories that aren't all-str. If we
            # ever do, this is where we should do the casting.
            return series
    else:
        assert is_numeric_dtype(series) or is_datetime64_dtype(series)
        return series 
Author: CJWorkbench | Project: cjworkbench | Lines: 54 | Source: scrapetable.py
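
Hypothetical calls, assuming the function above and its imports are in scope (sketch):

import pandas as pd

print(autocast_series_dtype(pd.Series(["1.5", "2", None])).dtype)  # float64, via pd.to_numeric
print(autocast_series_dtype(pd.Series(["a", "b"])).dtype)          # object, left as all-str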


Note: The pandas.api.types.is_datetime64_dtype examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright in the source code remains with the original authors, and any use or distribution should follow the corresponding project's license. Do not reproduce this article without permission.