This page collects typical usage examples of the Python method pandas.api.types.is_datetime64_dtype. If you are unsure what types.is_datetime64_dtype does or how to call it, the curated code examples below should help. You can also read more about its containing module, pandas.api.types.
Nine code examples of types.is_datetime64_dtype are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps surface better Python examples.
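Before looking at the examples, here is a minimal, self-contained sketch of what the method itself checks with a recent pandas: it returns True for timezone-naive datetime64 data, while timezone-aware data is matched by the companion is_datetime64tz_dtype, which is why several examples below test both.
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

naive = pd.Series(pd.to_datetime(["2020-01-01", "2020-01-02"]))  # dtype: datetime64[ns]
aware = naive.dt.tz_localize("UTC")                              # dtype: datetime64[ns, UTC]

print(is_datetime64_dtype(naive))                 # True  (tz-naive datetime64)
print(is_datetime64_dtype(aware))                 # False (tz-aware; use is_datetime64tz_dtype)
print(is_datetime64tz_dtype(aware))               # True
print(is_datetime64_dtype(pd.Series([1, 2, 3])))  # False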
Example 1: _get_columns_info
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _get_columns_info(self, stats):
    # Classify columns by dtype and cardinality: constant, bool, numeric,
    # date (datetime64), unique and categorical.
    column_info = {}
    column_info[self.TYPE_CONSTANT] = stats['uniques'][stats['uniques'] == 1].index
    column_info[self.TYPE_BOOL] = stats['uniques'][stats['uniques'] == 2].index
    rest_columns = self.get_columns(self.df,
                                    self.EXCLUDE,
                                    column_info['constant'].union(column_info['bool']))
    column_info[self.TYPE_NUMERIC] = pd.Index([c for c in rest_columns
                                               if types.is_numeric_dtype(self.df[c])])
    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info['numeric'])
    column_info[self.TYPE_DATE] = pd.Index([c for c in rest_columns
                                            if types.is_datetime64_dtype(self.df[c])])
    rest_columns = self.get_columns(
        self.df[rest_columns], self.EXCLUDE, column_info['date'])
    unique_columns = stats['uniques'][rest_columns] == stats['counts'][rest_columns]
    column_info[self.TYPE_UNIQUE] = stats['uniques'][rest_columns][unique_columns].index
    column_info[self.TYPE_CATEGORICAL] = stats['uniques'][rest_columns][~unique_columns].index
    return column_info
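The snippet above depends on its surrounding class (self.get_columns, the TYPE_* constants and self.EXCLUDE are not shown). The following is a stripped-down sketch of the same dtype-based split, using only pandas and made-up column names, to show where is_datetime64_dtype fits in.
import pandas as pd
from pandas.api import types

df = pd.DataFrame({
    "price": [1.5, 2.0, 3.25],
    "when": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
    "label": ["a", "b", "a"],
})
numeric_cols = pd.Index([c for c in df.columns if types.is_numeric_dtype(df[c])])
date_cols = pd.Index([c for c in df.columns if types.is_datetime64_dtype(df[c])])
print(list(numeric_cols))  # ['price']
print(list(date_cols))     # ['when']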
Example 2: infer_pd_series_spark_type
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype("object"):
        if len(s) == 0 or s.isnull().all():
            raise ValueError("can not infer schema from empty or null dataset")
        elif hasattr(s[0], "__UDT__"):
            return s[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(s).type)
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return types.TimestampType()
    else:
        return from_arrow_type(pa.from_numpy_dtype(dt))
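Only the dtype checks in this example are plain pandas; from_arrow_type, pa (pyarrow) and types.TimestampType() come from pyspark/koalas. Below is a pandas-only sketch of the same dispatch, with a hypothetical helper name and string results standing in for the Spark types.
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

def describe_series_dtype(s: pd.Series) -> str:  # hypothetical helper, not part of the library
    dt = s.dtype
    if dt == np.dtype("object"):
        return "object: infer element by element (e.g. via pyarrow)"
    elif is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
        return "timestamp"
    else:
        return f"plain numpy dtype: {dt}"

print(describe_series_dtype(pd.Series(pd.to_datetime(["2020-01-01"]))))  # timestamp
print(describe_series_dtype(pd.Series([1, 2, 3])))                       # plain numpy dtype: int64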
Example 3: _check_series_convert_timestamps_localize
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):
    """
    Convert timestamp to timezone-naive in the specified timezone or local timezone

    :param s: a pandas.Series
    :param from_timezone: the timezone to convert from. if None then use local timezone
    :param to_timezone: the timezone to convert to. if None then use local timezone
    :return pandas.Series where if it is a timestamp, has been converted to tz-naive
    """
    from pyspark.sql.utils import require_minimum_pandas_version
    require_minimum_pandas_version()

    import pandas as pd
    from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
    from_tz = from_timezone or _get_local_timezone()
    to_tz = to_timezone or _get_local_timezone()
    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    if is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert(to_tz).dt.tz_localize(None)
    elif is_datetime64_dtype(s.dtype) and from_tz != to_tz:
        # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT.
        return s.apply(
            lambda ts: ts.tz_localize(from_tz, ambiguous=False).tz_convert(to_tz).tz_localize(None)
            if ts is not pd.NaT else pd.NaT)
    else:
        return s
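A pandas-only demonstration of the two branches above may help: for a tz-aware series the conversion is vectorized via .dt, while a tz-naive series is converted element-wise to sidestep the NaT issue noted in the comment. The timezone names are purely illustrative.
import pandas as pd

aware = pd.Series(pd.to_datetime(["2020-06-01 12:00"])).dt.tz_localize("UTC")
# tz-aware branch: convert, then drop the timezone to get naive wall-clock time.
print(aware.dt.tz_convert("America/New_York").dt.tz_localize(None))

naive = pd.Series(pd.to_datetime(["2020-06-01 12:00"]))
# tz-naive branch: interpret as one zone, convert to the other, drop the timezone again.
converted = naive.apply(
    lambda ts: ts.tz_localize("UTC", ambiguous=False).tz_convert("America/New_York").tz_localize(None)
    if ts is not pd.NaT else pd.NaT)
print(converted)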
Example 4: class_from_dtype
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def class_from_dtype(dtype) -> type:
    """
    Determine ColumnType class, based on pandas/numpy `dtype`.
    """
    if is_numeric_dtype(dtype):
        return ColumnType.NUMBER
    elif is_datetime64_dtype(dtype):
        return ColumnType.DATETIME
    elif dtype == object or dtype == "category":
        return ColumnType.TEXT
    else:
        raise ValueError(f"Unknown dtype: {dtype}")
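ColumnType here is an enum from the surrounding project, not from pandas. Below is a sketch of the same dispatch with plain strings and a hypothetical helper name, runnable with nothing but pandas.
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_datetime64_dtype

def kind_from_dtype(dtype) -> str:  # hypothetical stand-in for class_from_dtype
    if is_numeric_dtype(dtype):
        return "number"
    elif is_datetime64_dtype(dtype):
        return "datetime"
    elif dtype == object or dtype == "category":
        return "text"
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

df = pd.DataFrame({"n": [1.0], "t": pd.to_datetime(["2020-01-01"]), "s": ["x"]})
print({c: kind_from_dtype(df[c].dtype) for c in df.columns})
# {'n': 'number', 't': 'datetime', 's': 'text'}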
Example 5: _create_from_pandas_with_arrow
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from pyspark.serializers import ArrowStreamSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version
    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    jsqlContext = self._wrapped._jsqlContext

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func,
                                      create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
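Most of this function is pyspark plumbing; the part that involves is_datetime64_dtype is the per-column scan over pdf.dtypes that decides which columns need timestamp coercion. Here is a pandas-only sketch of that scan, with a marker string standing in for to_arrow_type(TimestampType()).
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

pdf = pd.DataFrame({
    "x": [1, 2],
    "ts": pd.to_datetime(["2020-01-01", "2020-01-02"]),
    "ts_utc": pd.to_datetime(["2020-01-01", "2020-01-02"]).tz_localize("UTC"),
})
coerce = ["timestamp" if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
          for t in pdf.dtypes]
print(coerce)  # [None, 'timestamp', 'timestamp']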
Example 6: _create_from_pandas_with_arrow
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from pyspark.serializers import ArrowSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version
    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    # Create the Spark DataFrame directly from the Arrow data and schema
    jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
    jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
        jrdd, schema.json(), self._wrapped._jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
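One detail worth calling out in both Arrow variants is the slicing arithmetic: -(-len(pdf) // parallelism) is integer ceiling division, so the DataFrame is cut into at most defaultParallelism slices and no rows are dropped. A small sketch with a made-up parallelism value:
import pandas as pd

pdf = pd.DataFrame({"x": range(10)})
default_parallelism = 3                        # stand-in for self.sparkContext.defaultParallelism
step = -(-len(pdf) // default_parallelism)     # ceil(10 / 3) == 4
slices = [pdf[start:start + step] for start in range(0, len(pdf), step)]
print(step, [len(s) for s in slices])          # 4 [4, 4, 2]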
Example 7: _check_series_convert_timestamps_internal
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def _check_series_convert_timestamps_internal(s, timezone):
    """
    Convert a tz-naive timestamp in the specified timezone or local timezone to UTC normalized for
    Spark internal storage

    :param s: a pandas.Series
    :param timezone: the timezone to convert. if None then use local timezone
    :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone
    """
    from pyspark.sql.utils import require_minimum_pandas_version
    require_minimum_pandas_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    if is_datetime64_dtype(s.dtype):
        # When tz_localize a tz-naive timestamp, the result is ambiguous if the tz-naive
        # timestamp falls in the hour when the clock is adjusted backward due to
        # daylight saving time (dst).
        # E.g., for America/New_York, the clock is adjusted backward on 2015-11-01 2:00 to
        # 2015-11-01 1:00 from dst-time to standard time, and therefore, when tz_localize
        # a tz-naive timestamp 2015-11-01 1:30 with America/New_York timezone, it can be either
        # dst time (2015-11-01 1:30-0400) or standard time (2015-11-01 1:30-0500).
        #
        # Here we explicitly choose to use standard time. This matches the default behavior of
        # pytz.
        #
        # Here is some code to help understand this behavior:
        # >>> import datetime
        # >>> import pandas as pd
        # >>> import pytz
        # >>>
        # >>> t = datetime.datetime(2015, 11, 1, 1, 30)
        # >>> ts = pd.Series([t])
        # >>> tz = pytz.timezone('America/New_York')
        # >>>
        # >>> ts.dt.tz_localize(tz, ambiguous=True)
        # 0   2015-11-01 01:30:00-04:00
        # dtype: datetime64[ns, America/New_York]
        # >>>
        # >>> ts.dt.tz_localize(tz, ambiguous=False)
        # 0   2015-11-01 01:30:00-05:00
        # dtype: datetime64[ns, America/New_York]
        # >>>
        # >>> str(tz.localize(t))
        # '2015-11-01 01:30:00-05:00'
        tz = timezone or _get_local_timezone()
        return s.dt.tz_localize(tz, ambiguous=False).dt.tz_convert('UTC')
    elif is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert('UTC')
    else:
        return s
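The commented pandas session embedded above can be run directly; here it is as executable code, showing that the same wall-clock time maps to two different UTC offsets depending on the ambiguous flag during the DST fall-back hour (this is exactly why the function passes ambiguous=False).
import datetime
import pandas as pd

t = datetime.datetime(2015, 11, 1, 1, 30)
ts = pd.Series([t])
print(ts.dt.tz_localize("America/New_York", ambiguous=True))   # ...01:30:00-04:00 (DST)
print(ts.dt.tz_localize("America/New_York", ambiguous=False))  # ...01:30:00-05:00 (standard)
print(ts.dt.tz_localize("America/New_York", ambiguous=False).dt.tz_convert("UTC"))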
Example 8: from_pandas
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def from_pandas(pdf: pd.DataFrame) -> "InternalFrame":
    """ Create an immutable DataFrame from pandas DataFrame.

    :param pdf: :class:`pd.DataFrame`
    :return: the created immutable DataFrame
    """
    columns = pdf.columns
    data_columns = [name_like_string(col) for col in columns]
    if isinstance(columns, pd.MultiIndex):
        column_labels = columns.tolist()
    else:
        column_labels = None
    column_label_names = columns.names

    index_names = [
        name if name is None or isinstance(name, tuple) else (name,) for name in pdf.index.names
    ]
    index_columns = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(index_names))]

    pdf = pdf.copy()
    pdf.index.names = index_columns
    reset_index = pdf.reset_index()
    reset_index.columns = index_columns + data_columns
    schema = StructType(
        [
            StructField(
                name, infer_pd_series_spark_type(col), nullable=bool(col.isnull().any()),
            )
            for name, col in reset_index.iteritems()
        ]
    )
    for name, col in reset_index.iteritems():
        dt = col.dtype
        if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
            continue
        reset_index[name] = col.replace({np.nan: None})
    sdf = default_session().createDataFrame(reset_index, schema=schema)
    return InternalFrame(
        spark_frame=sdf,
        index_map=OrderedDict(zip(index_columns, index_names)),
        column_labels=column_labels,
        data_spark_columns=[scol_for(sdf, col) for col in data_columns],
        column_label_names=column_label_names,
    )
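The final loop is the part that uses is_datetime64_dtype: NaN values are replaced by None in every column except datetime ones, which Spark can ingest directly (NaT is left alone). A pandas-only sketch of that loop with a tiny made-up frame follows; .items() is the modern spelling of the .iteritems() used above.
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

reset_index = pd.DataFrame({
    "v": [1.0, np.nan],
    "ts": pd.to_datetime(["2020-01-01", None]),
})
for name, col in reset_index.items():
    if is_datetime64_dtype(col.dtype) or is_datetime64tz_dtype(col.dtype):
        continue  # leave datetime columns (and their NaT values) untouched
    reset_index[name] = col.replace({np.nan: None})
print(reset_index.dtypes)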
Example 9: autocast_series_dtype
# Required import: from pandas.api import types [as alias]
# Or: from pandas.api.types import is_datetime64_dtype [as alias]
def autocast_series_dtype(series: pd.Series) -> pd.Series:
    """
    Cast any sane Series to str/category[str]/number/datetime.

    This is appropriate when parsing CSV data or Excel data. It _seems_
    appropriate when a search-and-replace produces numeric columns like
    '$1.32' => '1.32' ... but perhaps that's only appropriate in very-specific
    cases.

    The input must be "sane": if the dtype is object or category, we assume
    _every value_ is str (or null).

    If the series is all-null, do nothing.

    Avoid spurious calls to this function: it's expensive.

    TODO handle dates and maybe booleans.
    """
    if series.dtype == object:
        nulls = series.isnull()
        if (nulls | (series == "")).all():
            return series
        try:
            # If it all looks like numbers (like in a CSV), cast to number.
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # Otherwise, we want all-string. Is that what we already have?
            #
            # TODO assert that we already have all-string, and nix this
            # spurious conversion.
            array = series[~nulls].array
            if any(type(x) != str for x in array):
                series = series.astype(str)
                series[nulls] = None
            return series
    elif hasattr(series, "cat"):
        # Categorical series. Try to infer type of series.
        #
        # Assume categories are all str: after all, we're assuming the input is
        # "sane" and "sane" means only str categories are valid.
        if (series.isnull() | (series == "")).all():
            return series
        try:
            return pd.to_numeric(series)
        except (ValueError, TypeError):
            # We don't cast categories to str here -- because we have no
            # callers that would create categories that aren't all-str. If we
            # ever do, this is where we should do the casting.
            return series
    else:
        assert is_numeric_dtype(series) or is_datetime64_dtype(series)
        return series
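Assuming the function above and its pandas.api.types imports are in scope, a short usage sketch of the object-dtype branch: numeric-looking strings are cast to numbers, anything else stays as strings with None for the missing values.
import pandas as pd

print(autocast_series_dtype(pd.Series(["1.32", "2", None])).dtype)  # float64
print(autocast_series_dtype(pd.Series(["1.32", "x", None])).dtype)  # object (str + None)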