This article collects typical usage examples of pyspark.sql.types.TimestampType in Python. If you are wondering what types.TimestampType does and how it is used in practice, the curated code examples below should help. You can also explore the containing module, pyspark.sql.types, for further detail.
The following 8 code examples of types.TimestampType are shown, ordered roughly by popularity.
Example 1: __init__

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def __init__(self, series: "ks.Series"):
    if not isinstance(series.spark.data_type, (DateType, TimestampType)):
        raise ValueError(
            "Cannot call DatetimeMethods on type {}".format(series.spark.data_type)
        )
    self._data = series

# Properties
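This constructor guards the datetime accessor: it only accepts a Series backed by a Spark DateType or TimestampType column. A minimal usage sketch, assuming this is Koalas' DatetimeMethods class, which backs the Series.dt accessor:

import pandas as pd
import databricks.koalas as ks

kser = ks.Series(pd.to_datetime(["2019-01-01", "2019-02-03"]))
print(kser.spark.data_type)  # TimestampType, so the .dt accessor is allowed
print(kser.dt.year)          # 2019, 2019

# A non-datetime series trips the guard above:
# ks.Series([1, 2, 3]).dt  ->  ValueError: Cannot call DatetimeMethods on type LongType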
Example 2: __sub__

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def __sub__(self, other):
    # Note that timestamp subtraction casts the arguments to integer. This mimics pandas'
    # behavior: pandas returns 'timedelta64[ns]' from a 'datetime64[ns]' subtraction.
    if isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, TimestampType):
        if not isinstance(other.spark.data_type, TimestampType):
            raise TypeError("datetime subtraction can only be applied to datetime series.")
        return self.astype("bigint") - other.astype("bigint")
    elif isinstance(other, IndexOpsMixin) and isinstance(self.spark.data_type, DateType):
        if not isinstance(other.spark.data_type, DateType):
            raise TypeError("date subtraction can only be applied to date series.")
        return column_op(F.datediff)(self, other)
    else:
        return column_op(Column.__sub__)(self, other)
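A hedged usage sketch: because both operands are cast to bigint, subtracting two Koalas datetime Series yields the difference in whole seconds (Spark's CAST(timestamp AS BIGINT) returns seconds since the epoch), while date subtraction goes through F.datediff and returns days.

import pandas as pd
import databricks.koalas as ks

kdf = ks.DataFrame({
    "end":   pd.to_datetime(["2019-01-02 00:00:00"]),
    "start": pd.to_datetime(["2019-01-01 00:00:00"]),
})
print((kdf["end"] - kdf["start"]).to_pandas())  # 86400 -- seconds, not a timedelta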
Example 3: is_all_dates

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
@property
def is_all_dates(self):
    """
    Return whether all data types of the index are datetime.

    Note that, since Koalas does not support an index holding multiple data types,
    this returns True as long as the index's single data type is datetime.

    Examples
    --------
    >>> from datetime import datetime

    >>> idx = ks.Index([datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 2, 3, 0, 0, 0)])
    >>> idx
    DatetimeIndex(['2019-01-01', '2019-02-03'], dtype='datetime64[ns]', freq=None)
    >>> idx.is_all_dates
    True

    >>> idx = ks.Index([datetime(2019, 1, 1, 0, 0, 0), None])
    >>> idx
    DatetimeIndex(['2019-01-01', 'NaT'], dtype='datetime64[ns]', freq=None)
    >>> idx.is_all_dates
    True

    >>> idx = ks.Index([0, 1, 2])
    >>> idx
    Int64Index([0, 1, 2], dtype='int64')
    >>> idx.is_all_dates
    False
    """
    return isinstance(self.spark.data_type, TimestampType)
Example 4: as_spark_type

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, return the equivalent Spark type.

    Accepts:
    - the built-in types in Python
    - the built-in types in NumPy
    - a list of pairs of (field_name, type)
    - a dictionary of field_name -> type
    - Python 3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
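A minimal sketch of the mapping in use, assuming the imports the function itself relies on (numpy, datetime, and pyspark.sql.types):

import datetime
import numpy as np
from pyspark.sql import types

print(as_spark_type(datetime.datetime))  # TimestampType
print(as_spark_type(np.datetime64))      # TimestampType
print(as_spark_type(datetime.date))      # DateType
print(as_spark_type("bigint"))           # LongType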
Example 5: spark_type_to_pandas_dtype

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def spark_type_to_pandas_dtype(spark_type):
    """Return the pandas dtype corresponding to the given Spark DataType."""
    if isinstance(spark_type, (types.DateType, types.UserDefinedType)):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
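A short sketch of how this mapping behaves: TimestampType gets a concrete datetime64[ns] dtype, while DateType falls back to object, and everything else is resolved through Arrow.

from pyspark.sql import types

print(spark_type_to_pandas_dtype(types.TimestampType()))  # datetime64[ns]
print(spark_type_to_pandas_dtype(types.DateType()))       # object
print(spark_type_to_pandas_dtype(types.LongType()))       # int64 (via to_arrow_type)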
Example 6: _convert_from_pandas

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def _convert_from_pandas(self, pdf, schema, timezone):
    """
    Convert a pandas.DataFrame to a list of records that can be used to make a DataFrame.

    :return: list of records
    """
    if timezone is not None:
        from pyspark.sql.types import _check_series_convert_timestamps_tz_local

        copied = False
        if isinstance(schema, StructType):
            for field in schema:
                # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
                if isinstance(field.dataType, TimestampType):
                    s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone)
                    if s is not pdf[field.name]:
                        if not copied:
                            # Copy once if the series is modified to prevent the original
                            # pandas DataFrame from being updated
                            pdf = pdf.copy()
                            copied = True
                        pdf[field.name] = s
        else:
            for column, series in pdf.iteritems():
                s = _check_series_convert_timestamps_tz_local(series, timezone)
                if s is not series:
                    if not copied:
                        # Copy once if the series is modified to prevent the original
                        # pandas DataFrame from being updated
                        pdf = pdf.copy()
                        copied = True
                    pdf[column] = s

    # Convert pandas.DataFrame to a list of numpy records
    np_records = pdf.to_records(index=False)

    # Check if any columns need to be fixed for Spark to infer the schema properly
    if len(np_records) > 0:
        record_dtype = self._get_numpy_record_dtype(np_records[0])
        if record_dtype is not None:
            return [r.astype(record_dtype).tolist() for r in np_records]

    # Convert the list of numpy records to plain Python lists
    return [r.tolist() for r in np_records]
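This is an internal SparkSession helper (from PySpark's session.py) and is not called directly. A hedged sketch of the public path that exercises it, assuming Spark 2.x where the Arrow config key is spark.sql.execution.arrow.enabled; with Arrow disabled, createDataFrame falls back to this record-based conversion:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "false")  # force the non-Arrow path

pdf = pd.DataFrame({"ts": pd.to_datetime(["2019-01-01 12:00:00"])})
df = spark.createDataFrame(pdf)  # timestamp values are localized using the session timezone
df.printSchema()                 # ts: timestamp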
Example 7: _create_from_pandas_with_arrow

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in the pandas-to-Arrow conversion.
    """
    from pyspark.serializers import ArrowStreamSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    jsqlContext = self._wrapped._jsqlContext

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create the Spark DataFrame from an Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(batches, ArrowStreamSerializer(), reader_func,
                                      create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
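A hedged usage sketch of the public entry point that reaches this Arrow path (assuming Spark 2.4, where the config key is spark.sql.execution.arrow.enabled; later releases renamed it to spark.sql.execution.arrow.pyspark.enabled):

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.enabled", "true")  # route createDataFrame through Arrow

pdf = pd.DataFrame({"ts": pd.to_datetime(["2019-01-01", "2019-02-03"])})
df = spark.createDataFrame(pdf)  # datetime64[ns] columns are coerced to TimestampType()
df.printSchema()                 # root |-- ts: timestamp (nullable = true)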
Example 8: _create_from_pandas_with_arrow

# Required import: from pyspark.sql import types
# Alternatively: from pyspark.sql.types import TimestampType
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in the pandas-to-Arrow conversion.
    """
    from pyspark.serializers import ArrowSerializer, _create_batch
    from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create Arrow record batches
    batches = [_create_batch([(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)],
                             timezone)
               for pdf_slice in pdf_slices]

    # Create the Spark schema from the first Arrow batch (always at least 1 batch after slicing)
    if isinstance(schema, (list, tuple)):
        struct = from_arrow_schema(batches[0].schema)
        for i, name in enumerate(schema):
            struct.fields[i].name = name
            struct.names[i] = name
        schema = struct

    # Create the Spark DataFrame directly from the Arrow data and schema
    jrdd = self._sc._serialize_to_jvm(batches, len(batches), ArrowSerializer())
    jdf = self._jvm.PythonSQLUtils.arrowPayloadToDataFrame(
        jrdd, schema.json(), self._wrapped._jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df