This article collects typical usage examples of types.ArrayType from the Python module pyspark.sql.types. If you have been wondering what types.ArrayType does, how it is used, and what working code looks like, the curated examples below should help. You can also explore the containing module, pyspark.sql.types, for further context.
The following shows 15 code examples of types.ArrayType, sorted by popularity by default.
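Before the examples, here is a minimal, self-contained sketch of the two most common uses of ArrayType: declaring an array column in a schema and declaring it as the return type of a UDF. The names below (spark, df, double_scores) are illustrative only and not taken from any of the examples that follow.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.getOrCreate()

# An array column is declared by wrapping the element type in ArrayType.
schema = StructType([
    StructField("name", StringType()),
    StructField("scores", ArrayType(IntegerType())),
])
df = spark.createDataFrame([("a", [1, 2, 3])], schema)

# A UDF that returns a Python list must declare ArrayType as its return type.
double_scores = udf(lambda xs: [2 * x for x in xs], ArrayType(IntegerType()))
df.withColumn("doubled", double_scores("scores")).show()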
Example 1: _decodeOutputAsPredictions
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def _decodeOutputAsPredictions(self, df):
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
Example 2: _simplify_data_type
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def _simplify_data_type(data_type: T.DataType) -> Tuple:
    """Simplify a datatype into a tuple of the equality information we care about.

    Most notably this ignores nullability, because Hive cannot represent
    NOT NULL in its schemas.
    """
    try:
        # Normalize a UDT into its SQL form. Allows comparison of schemas
        # from Hive and Spark.
        sql_type = data_type.sqlType()  # type: ignore
    except AttributeError:
        sql_type = data_type
    if isinstance(sql_type, T.StructType):
        return ('StructType', [(field.name, _simplify_data_type(field.dataType)) for field in sql_type])
    elif isinstance(sql_type, T.ArrayType):
        return ('ArrayType', _simplify_data_type(sql_type.elementType))
    else:
        return (type(sql_type).__name__,)
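A quick sketch of what this helper produces (hypothetical call, with `T` standing for pyspark.sql.types as in the example above): a nested schema collapses into comparable tuples with nullability stripped.
schema = T.StructType([T.StructField("tags", T.ArrayType(T.StringType()))])
_simplify_data_type(schema)
# -> ('StructType', [('tags', ('ArrayType', ('StringType',)))])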
Example 3: _convert_precision
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def _convert_precision(df, dtype):
    if dtype is None:
        return df
    if dtype != "float32" and dtype != "float64":
        raise ValueError("dtype {} is not supported. Use 'float32' or 'float64'".format(dtype))
    source_type, target_type = (DoubleType, FloatType) \
        if dtype == "float32" else (FloatType, DoubleType)
    logger.warning("Converting floating-point columns to %s", dtype)
    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(target_type()))
        elif isinstance(field.dataType, ArrayType) and \
                isinstance(field.dataType.elementType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(ArrayType(target_type())))
    return df
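A hedged usage sketch, assuming `_convert_precision` (and the module-level `logger` it relies on) is importable and a SparkSession named `spark` already exists: both plain DoubleType columns and ArrayType(DoubleType()) columns are downcast.
from pyspark.sql.types import ArrayType, DoubleType, StructField, StructType

schema = StructType([
    StructField("score", DoubleType()),
    StructField("history", ArrayType(DoubleType())),
])
df = spark.createDataFrame([(0.5, [0.1, 0.2])], schema)
df32 = _convert_precision(df, "float32")
# df32.schema now reports FloatType for "score" and an array of FloatType for "history"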
Example 4: register_udfs
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))
    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
Example 5: transform
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def transform(df, url_list=None, brokers=None, **kwargs):
    if brokers and url_list:
        raise ValueError('cannot specify brokers and url_list')
    if brokers:
        rdd = transform_from_kafka(df, brokers, **kwargs)
    else:
        rdd = transform_from_elasticsearch(df, url_list, **kwargs)
    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'],
        df.schema['query'],
        df.schema['norm_query'],
        T.StructField('hit_page_ids', T.ArrayType(T.IntegerType()), nullable=False),
    ]))
Example 6: cluster_within_norm_query_groups
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))
Example 7: ibis_array_dtype_to_spark_dtype
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def ibis_array_dtype_to_spark_dtype(ibis_dtype_obj):
    element_type = spark_dtype(ibis_dtype_obj.value_type)
    contains_null = ibis_dtype_obj.value_type.nullable
    return pt.ArrayType(element_type, contains_null)
Example 8: flatten_dataset
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def flatten_dataset(dataset: DataFrame):
    tmp = dataset
    for field in tmp.schema.fields:
        if isinstance(field.dataType, ArrayType):
            print(field.name, field.dataType)
            # index by column name; attribute access (tmp.field.name) would not
            # resolve to the column currently being iterated over
            tmp = tmp.withColumn(field.name, explode(tmp[field.name]))
    return tmp
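An illustrative sketch of the effect (hypothetical DataFrame, assuming a SparkSession named `spark`): explode replaces each array column with one row per element.
df = spark.createDataFrame([(1, ["a", "b"])], ["id", "tags"])
flatten_dataset(df).collect()
# -> [Row(id=1, tags='a'), Row(id=1, tags='b')]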
Example 9: as_spark_type
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
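A few illustrative calls (sketch, assuming numpy is imported as np as in the example) showing which branches above they hit:
as_spark_type(int)         # IntegerType()
as_spark_type("float64")   # DoubleType()
as_spark_type(np.ndarray)  # ArrayType(StringType()) -- the ArrayType branch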
Example 10: __init__
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def __init__(self, series: "ks.Series"):
    if not isinstance(series.spark.data_type, (StringType, BinaryType, ArrayType)):
        raise ValueError("Cannot call StringMethods on type {}".format(series.spark.data_type))
    self._data = series
    self.name = self._data.name

# Methods
Example 11: len
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def len(self) -> "ks.Series":
    """
    Computes the length of each element in the Series.

    The element may be a sequence (such as a string, tuple or list).

    Returns
    -------
    Series of int
        A Series of integer values indicating the length of each element in
        the Series.

    Examples
    --------
    Returns the length (number of characters) in a string. Returns the
    number of entries for lists or tuples.

    >>> s1 = ks.Series(['dog', 'monkey'])
    >>> s1.str.len()
    0    3
    1    6
    Name: 0, dtype: int64

    >>> s2 = ks.Series([["a", "b", "c"], []])
    >>> s2.str.len()
    0    3
    1    0
    Name: 0, dtype: int64
    """
    if isinstance(self._data.spark.data_type, (ArrayType, MapType)):
        return column_op(lambda c: F.size(c).cast(LongType()))(self._data).alias(
            self._data.name
        )
    else:
        return column_op(lambda c: F.length(c).cast(LongType()))(self._data).alias(
            self._data.name
        )
Example 12: get_addons_per_client
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def get_addons_per_client(users_df, minimum_addons_count):
    """Extracts a DataFrame that contains one row
    for each client along with the list of active add-on GUIDs.
    """
    def is_valid_addon(addon):
        return not (
            addon.is_system
            or addon.app_disabled
            or addon.type != "extension"
            or addon.user_disabled
            or addon.foreign_install
            or addon.install_day is None
        )

    # may need additional whitelisting to remove shield addons
    def get_valid_addon_ids(addons):
        sorted_addons = sorted(
            [(a.addon_id, a.install_day) for a in addons if is_valid_addon(a)],
            key=lambda addon_tuple: addon_tuple[1],
        )
        return [addon_id for (addon_id, install_day) in sorted_addons]

    get_valid_addon_ids_udf = udf(get_valid_addon_ids, ArrayType(StringType()))

    # Create an add-ons dataset un-nesting the add-on map from each
    # user to a list of add-on GUIDs. Also filter undesired add-ons.
    return users_df.select(
        "client_id", get_valid_addon_ids_udf("active_addons").alias("addon_ids")
    ).filter(size("addon_ids") > minimum_addons_count)
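A hedged sketch of the expected input shape (hypothetical rows; the field names come from the filters above), assuming a SparkSession named `spark`:
from pyspark.sql import Row

addon = Row(addon_id="some-addon@example.org", is_system=False, app_disabled=False,
            type="extension", user_disabled=False, foreign_install=False, install_day=17000)
users = spark.createDataFrame([Row(client_id="c1", active_addons=[addon])])
# Keeps clients with more than 0 valid add-ons; addon_ids is an array<string> column.
get_addons_per_client(users, minimum_addons_count=0).show()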
Example 13: load_users_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
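A brief sketch of putting this schema to use (hypothetical data, assuming the function is importable and a SparkSession named `spark` exists):
schema = load_users_schema()
df = spark.createDataFrame([("user_1", [("sku_1", 0.9), ("sku_2", 0.3)])], schema)
# df.schema["interactions"].dataType is an ArrayType of (item, score) structs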
Example 14: load_neighbor_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def load_neighbor_schema(self):
    """Loads neighborhood schema for the similarity matrix.

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema of type ["key", [("key", "value")]]
    """
    return stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
Example 15: test_load_users_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import ArrayType [as alias]
def test_load_users_schema(self):
    klass = self.get_target_klass()()
    expected = stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
    result = klass.load_users_schema()
    self.assertEqual(result, expected)