This article collects typical usage examples of pyspark.sql.types.LongType in Python. If you are wondering what types.LongType is for and how it is used in practice, the curated code examples below may help. You can also explore further usage examples from its containing module, pyspark.sql.types.
The section below shows 15 code examples of types.LongType, sorted by popularity by default.
Example 1: _numpy_to_spark_mapping
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def _numpy_to_spark_mapping():
"""Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary inorder to avoid instantiation
of multiple objects in each call."""
# Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
# notation to avoid copy/paste/typo mistakes
cache_attr_name = 'cached_numpy_to_pyspark_types_map'
if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
import pyspark.sql.types as T
setattr(_numpy_to_spark_mapping, cache_attr_name,
{
np.int8: T.ByteType(),
np.uint8: T.ShortType(),
np.int16: T.ShortType(),
np.uint16: T.IntegerType(),
np.int32: T.IntegerType(),
np.int64: T.LongType(),
np.float32: T.FloatType(),
np.float64: T.DoubleType(),
np.string_: T.StringType(),
np.str_: T.StringType(),
np.unicode_: T.StringType(),
np.bool_: T.BooleanType(),
})
return getattr(_numpy_to_spark_mapping, cache_attr_name)
# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk
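For reference, a minimal usage sketch of the helper above (an assumption: the function is in scope, e.g. copied locally or imported from petastorm's unischema module, and pyspark is installed). It simply looks up the Spark SQL type that corresponds to a numpy dtype:
import numpy as np
import pyspark.sql.types as T

# Look up Spark SQL types for numpy dtypes via the cached mapping.
mapping = _numpy_to_spark_mapping()
assert mapping[np.int64] == T.LongType()
assert mapping[np.float64] == T.DoubleType()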
Example 2: test_invalid_schema_field
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def test_invalid_schema_field(synthetic_dataset, reader_factory):
# Let's assume we are selecting columns using a schema which is different from the one
# stored in the dataset. Would expect to get a reasonable error message
BogusSchema = Unischema('BogusSchema', [
UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])
expected_values = {'bogus_key': 11, 'id': 1}
with pytest.raises(ValueError, match='bogus_key'):
reader_factory(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
shuffle_row_groups=False,
predicate=EqualPredicate(expected_values))
Example 3: test_create_schema_view_fails_validate
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def test_create_schema_view_fails_validate():
""" Exercises code paths unischema.create_schema_view ValueError, and unischema.__str__."""
TestSchema = Unischema('TestSchema', [
UnischemaField('int_field', np.int8, (), ScalarCodec(IntegerType()), False),
UnischemaField('string_field', np.string_, (), ScalarCodec(StringType()), False),
])
with pytest.raises(ValueError, match='does not belong to the schema'):
TestSchema.create_schema_view([UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)])
Example 4: encode
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def encode(self, unischema_field, value):
# Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
# (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
import pyspark.sql.types as sql_types
# We treat ndarrays with shape=() as scalars
unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
# Validate the input to be a scalar (or an unsized numpy array)
if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
            raise TypeError('Expected a scalar as a value for field \'{}\'. '
                            'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))
if unischema_field.shape:
            raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                             'to indicate a scalar. However, the actual shape is %s'
                             % (unischema_field.name, unischema_field.shape))
if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
sql_types.LongType)):
return int(value)
if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
return float(value)
if isinstance(self._spark_type, sql_types.BooleanType):
return bool(value)
if isinstance(self._spark_type, sql_types.StringType):
if not isinstance(value, str):
raise ValueError(
'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
return str(value)
return value
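A small usage sketch of the encode path above (an assumption: petastorm is installed; ScalarCodec and UnischemaField are petastorm's public classes, and the field definition mirrors the 'id' field used in the tests earlier):
import numpy as np
from pyspark.sql.types import LongType
from petastorm.codecs import ScalarCodec
from petastorm.unischema import UnischemaField

# A LongType-backed ScalarCodec converts a numpy integer scalar into a plain Python int.
id_field = UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False)
print(id_field.codec.encode(id_field, np.int64(42)))  # 42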
Example 5: year
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def year(self) -> "ks.Series":
"""
The year of the datetime.
"""
return column_op(lambda c: F.year(c).cast(LongType()))(self._data).alias(self._data.name)
Example 6: month
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def month(self) -> "ks.Series":
"""
The month of the timestamp as January = 1 December = 12.
"""
return column_op(lambda c: F.month(c).cast(LongType()))(self._data).alias(self._data.name)
Example 7: day
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def day(self) -> "ks.Series":
"""
The days of the datetime.
"""
return column_op(lambda c: F.dayofmonth(c).cast(LongType()))(self._data).alias(
self._data.name
)
Example 8: hour
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def hour(self) -> "ks.Series":
"""
The hours of the datetime.
"""
return column_op(lambda c: F.hour(c).cast(LongType()))(self._data).alias(self._data.name)
Example 9: minute
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def minute(self) -> "ks.Series":
"""
The minutes of the datetime.
"""
return column_op(lambda c: F.minute(c).cast(LongType()))(self._data).alias(self._data.name)
Example 10: week
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def week(self) -> "ks.Series":
"""
The week ordinal of the year.
"""
return column_op(lambda c: F.weekofyear(c).cast(LongType()))(self._data).alias(
self._data.name
)
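Examples 5 through 10 follow the same pattern: each datetime property is computed with a pyspark.sql function and cast to LongType so the resulting Series has an integer dtype. A usage sketch (an assumption: databricks.koalas is installed and a Spark session can be created):
import pandas as pd
import databricks.koalas as ks

# Build a datetime Series and read its components through the .dt accessor.
s = ks.Series(pd.date_range('2020-01-01', periods=3, freq='D'))
print(s.dt.year.to_pandas())   # 2020 for every element, dtype int64
print(s.dt.day.to_pandas())    # 1, 2, 3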
Example 11: numpy_column_op
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def numpy_column_op(f):
@wraps(f)
def wrapper(self, *args):
# PySpark does not support NumPy type out of the box. For now, we convert NumPy types
# into some primitive types understandable in PySpark.
new_args = []
for arg in args:
# TODO: This is a quick hack to support NumPy type. We should revisit this.
if isinstance(self.spark.data_type, LongType) and isinstance(arg, np.timedelta64):
new_args.append(float(arg / np.timedelta64(1, "s")))
else:
new_args.append(arg)
return column_op(f)(self, *new_args)
return wrapper
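The timedelta64 branch above converts a numpy timedelta into seconds as a plain float before handing it to Spark; in isolation the conversion looks like this (plain numpy, no Spark needed):
import numpy as np

delta = np.timedelta64(90, 's')
# Dividing by a one-second timedelta yields the duration in seconds.
print(float(delta / np.timedelta64(1, 's')))  # 90.0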
Example 12: _select_rows_by_iterable
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def _select_rows_by_iterable(
self, rows_sel: Iterable
) -> Tuple[Optional[spark.Column], Optional[int], Optional[int]]:
sdf = self._internal.spark_frame
if any(isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel):
offset = sdf.count()
else:
offset = 0
new_rows_sel = []
for key in list(rows_sel):
if not isinstance(key, (int, np.int, np.int64, np.int32)):
raise TypeError(
"cannot do positional indexing with these indexers [{}] of {}".format(
key, type(key)
)
)
if key < 0:
key = key + offset
new_rows_sel.append(key)
if len(new_rows_sel) != len(set(new_rows_sel)):
raise NotImplementedError(
"Duplicated row selection is not currently supported; "
"however, normalised index was [%s]" % new_rows_sel
)
sequence_scol = sdf[self._sequence_col]
cond = []
for key in new_rows_sel:
cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))
if len(cond) == 0:
cond = [F.lit(False)]
return reduce(lambda x, y: x | y, cond), None, None
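The row selection above boils down to OR-ing together one equality condition per requested position. A stripped-down sketch with plain PySpark (assumptions: an active SparkSession, and a hypothetical DataFrame df with a LongType sequence column named 'seq'):
from functools import reduce
from pyspark.sql import functions as F
from pyspark.sql.types import LongType

keys = [0, 2, 5]
# One equality condition per requested row position, OR-ed together.
cond = [F.col('seq') == F.lit(int(k)).cast(LongType()) for k in keys]
row_filter = reduce(lambda x, y: x | y, cond) if cond else F.lit(False)
# df.where(row_filter) keeps only the rows whose sequence number is in `keys`.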
Example 13: as_spark_type
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def as_spark_type(tpe) -> types.DataType:
"""
Given a python type, returns the equivalent spark type.
Accepts:
- the built-in types in python
- the built-in types in numpy
- list of pairs of (field_name, type)
- dictionaries of field_name -> type
- python3's typing system
"""
if tpe in (str, "str", "string"):
return types.StringType()
elif tpe in (bytes,):
return types.BinaryType()
elif tpe in (np.int8, "int8", "byte"):
return types.ByteType()
elif tpe in (np.int16, "int16", "short"):
return types.ShortType()
elif tpe in (int, "int", np.int, np.int32):
return types.IntegerType()
elif tpe in (np.int64, "int64", "long", "bigint"):
return types.LongType()
elif tpe in (float, "float", np.float):
return types.FloatType()
elif tpe in (np.float64, "float64", "double"):
return types.DoubleType()
elif tpe in (datetime.datetime, np.datetime64):
return types.TimestampType()
elif tpe in (datetime.date,):
return types.DateType()
elif tpe in (bool, "boolean", "bool", np.bool):
return types.BooleanType()
elif tpe in (np.ndarray,):
# TODO: support other child types
return types.ArrayType(types.StringType())
else:
raise TypeError("Type %s was not understood." % tpe)
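A quick sanity check of the mapping above (a sketch, assuming as_spark_type is in scope and pyspark is installed; instantiating the type objects does not require a running Spark session):
import numpy as np
from pyspark.sql import types

assert as_spark_type(np.int64) == types.LongType()
assert as_spark_type('bigint') == types.LongType()
assert as_spark_type(float) == types.FloatType()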
Example 14: len
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def len(self) -> "ks.Series":
"""
Computes the length of each element in the Series.
The element may be a sequence (such as a string, tuple or list).
Returns
-------
Series of int
A Series of integer values indicating the length of each element in
the Series.
Examples
--------
Returns the length (number of characters) in a string. Returns the
number of entries for lists or tuples.
>>> s1 = ks.Series(['dog', 'monkey'])
>>> s1.str.len()
0 3
1 6
Name: 0, dtype: int64
>>> s2 = ks.Series([["a", "b", "c"], []])
>>> s2.str.len()
0 3
1 0
Name: 0, dtype: int64
"""
if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
self._data.name
)
else:
return column_op(lambda c: F.length(c).cast(LongType()))(self._data).alias(
self._data.name
)
Example 15: read
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import LongType [as alias]
def read(self, file_path, spark_session, indexcol=0, schema=None):
"""
Creates a dataframe from the csv file
:param indexcol: if 1, create a tuple id column as auto increment
:param schema: optional schema of file if known
:param spark_session: The spark_session we created in Holoclean object
:param file_path: The path to the file
:return: dataframe
"""
if schema is None:
df = spark_session.read.csv(file_path, header=True)
else:
df = spark_session.read.csv(file_path, header=True, schema=schema)
if indexcol == 0:
return df
index_name = GlobalVariables.index_name
new_cols = df.schema.names + [index_name]
list_schema = []
for index_attribute in range(len(df.schema.names)):
list_schema.append(StructField("_" + str(index_attribute),
df.schema[
index_attribute].dataType,
True))
list_schema.append(
StructField("_" + str(len(new_cols)), LongType(), True))
schema = StructType(list_schema)
        ix_df = df.rdd.zipWithIndex().map(
            lambda row_ix: row_ix[0] + (row_ix[1] + 1,)).toDF(schema)
        tmp_cols = ix_df.schema.names
        new_df = reduce(lambda data, idx: data.withColumnRenamed(tmp_cols[idx],
                                                                 new_cols[idx]),
                        range(len(tmp_cols)), ix_df)
new_df = self.checking_string_size(new_df)
return new_df
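The indexing trick in the middle of read() can be sketched on its own (an assumption: an active SparkSession named spark): zipWithIndex appends a 0-based row number, which is shifted to be 1-based and exposed as a LongType column.
from pyspark.sql.types import LongType, StringType, StructField, StructType

df = spark.createDataFrame([('a',), ('b',)], ['col0'])
schema = StructType([StructField('_0', StringType(), True),
                     StructField('_1', LongType(), True)])
# Append a 1-based index to every row, then rebuild a DataFrame with the extended schema.
ix_df = df.rdd.zipWithIndex().map(lambda row_ix: row_ix[0] + (row_ix[1] + 1,)).toDF(schema)
ix_df.show()  # each row now carries a 1-based LongType index column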