This article collects typical usage examples of pyspark.sql.types.FloatType in Python. If you are unsure what types.FloatType is, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore the other members of its containing module, pyspark.sql.types.

The following 15 code examples of types.FloatType are shown below, ordered by popularity by default.
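Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the session, column, and UDF names are illustrative) showing the two most common uses of FloatType: as a field type in an explicit schema and as a UDF return type.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, StringType, StructField, StructType

spark = SparkSession.builder.master("local[1]").appName("floattype-demo").getOrCreate()

# FloatType as a column type in an explicit schema (32-bit floating point).
schema = StructType([
    StructField("name", StringType(), False),
    StructField("score", FloatType(), True),
])
df = spark.createDataFrame([("a", 1.5), ("b", None)], schema=schema)

# FloatType as a UDF return type; the returned Python float is stored as a 32-bit float.
halve = udf(lambda x: x / 2.0 if x is not None else None, FloatType())
df.withColumn("half_score", halve(df["score"])).show()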
Example 1: _decodeOutputAsPredictions

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def _decodeOutputAsPredictions(self, df):
    # If we start having different weights than imagenet, we'll need to
    # move this logic to individual model building in NamedImageTransformer.
    # Also, we could put the computation directly in the main computation
    # graph or use a scala UDF for potentially better performance.
    topK = self.getOrDefault(self.topK)

    def decode(predictions):
        pred_arr = np.expand_dims(np.array(predictions), axis=0)
        decoded = decode_predictions(pred_arr, top=topK)[0]
        # convert numpy dtypes to python native types
        return [(t[0], t[1], t[2].item()) for t in decoded]

    decodedSchema = ArrayType(
        StructType([
            StructField("class", StringType(), False),
            StructField("description", StringType(), False),
            StructField("probability", FloatType(), False)
        ]))
    decodeUDF = udf(decode, decodedSchema)
    interim_output = self._getIntermediateOutputCol()
    return df \
        .withColumn(self.getOutputCol(), decodeUDF(df[interim_output])) \
        .drop(interim_output)
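For context, decode_predictions here is Keras' ImageNet helper: each decoded tuple is (class_id, description, numpy float score), and .item() converts the numpy scalar to a plain Python float so it fits the FloatType field of the UDF schema. A standalone sketch of just the decode step (the random scores and the Keras import path are assumptions for illustration):

import numpy as np
from keras.applications.imagenet_utils import decode_predictions

pred_arr = np.random.rand(1, 1000)                # one image, 1000 ImageNet class scores
decoded = decode_predictions(pred_arr, top=3)[0]  # [(class_id, description, np.float32), ...]
rows = [(c, d, p.item()) for c, d, p in decoded]  # numpy scalar -> Python float for FloatType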
Example 2: _convert_precision

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def _convert_precision(df, dtype):
    if dtype is None:
        return df

    if dtype != "float32" and dtype != "float64":
        raise ValueError("dtype {} is not supported. "
                         "Use 'float32' or 'float64'".format(dtype))

    source_type, target_type = (DoubleType, FloatType) \
        if dtype == "float32" else (FloatType, DoubleType)

    logger.warning("Converting floating-point columns to %s", dtype)
    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(target_type()))
        elif isinstance(field.dataType, ArrayType) and \
                isinstance(field.dataType.elementType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(ArrayType(target_type())))
    return df
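A hypothetical call site (the DataFrame df is assumed to already exist) would look like:

df32 = _convert_precision(df, "float32")  # DoubleType columns (and arrays of them) -> FloatType
df64 = _convert_precision(df, "float64")  # FloatType columns (and arrays of them) -> DoubleType
same = _convert_precision(df, None)       # dtype None leaves the DataFrame untouched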
Example 3: register_udfs

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def register_udfs(self, sess, sc):
    """Register UDFs to be used in SQL queries.

    :type sess: `pyspark.sql.SparkSession`
    :param sess: Session used in Spark for SQL queries.

    :type sc: `pyspark.SparkContext`
    :param sc: Spark Context to run Spark jobs.
    """
    sess.udf.register("SQUARED", self.squared, returnType=(
        stypes.ArrayType(stypes.StructType(
            fields=[stypes.StructField('sku0', stypes.StringType()),
                    stypes.StructField('norm', stypes.FloatType())]))))

    sess.udf.register('INTERSECTIONS', self.process_intersections,
        returnType=stypes.ArrayType(stypes.StructType(fields=[
            stypes.StructField('sku0', stypes.StringType()),
            stypes.StructField('sku1', stypes.StringType()),
            stypes.StructField('cor', stypes.FloatType())])))
Example 4: get_petastorm_column

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def get_petastorm_column(df_column):
    column_type = df_column.type
    column_name = df_column.name
    column_is_nullable = df_column.is_nullable
    column_array_dimensions = df_column.array_dimensions

    # Reference:
    # https://github.com/uber/petastorm/blob/master/petastorm/
    # tests/test_common.py
    petastorm_column = None
    if column_type == ColumnType.INTEGER:
        petastorm_column = UnischemaField(column_name,
                                          np.int32,
                                          (),
                                          ScalarCodec(IntegerType()),
                                          column_is_nullable)
    elif column_type == ColumnType.FLOAT:
        petastorm_column = UnischemaField(column_name,
                                          np.float64,
                                          (),
                                          ScalarCodec(FloatType()),
                                          column_is_nullable)
    elif column_type == ColumnType.TEXT:
        petastorm_column = UnischemaField(column_name,
                                          np.string_,
                                          (),
                                          ScalarCodec(StringType()),
                                          column_is_nullable)
    elif column_type == ColumnType.NDARRAY:
        petastorm_column = UnischemaField(column_name,
                                          np.uint8,
                                          column_array_dimensions,
                                          NdarrayCodec(),
                                          column_is_nullable)
    else:
        LoggingManager().log("Invalid column type: " + str(column_type),
                             LoggingLevel.ERROR)

    return petastorm_column
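For reference, the FLOAT branch above produces the same kind of field one could construct directly with petastorm. A minimal sketch, assuming petastorm is installed (the field name is illustrative):

import numpy as np
from petastorm.codecs import ScalarCodec
from petastorm.unischema import UnischemaField
from pyspark.sql.types import FloatType

# A nullable scalar float column, stored as np.float64 and encoded via a FloatType ScalarCodec.
score_field = UnischemaField("score", np.float64, (), ScalarCodec(FloatType()), True)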
Example 5: _numpy_to_spark_mapping

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order to avoid instantiation
    of multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in the variable instead of a 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })

    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk
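A hypothetical lookup against the cached mapping:

import numpy as np

_numpy_to_spark_mapping()[np.float32]  # -> FloatType()
_numpy_to_spark_mapping()[np.float64]  # -> DoubleType()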
Example 6: encode

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def encode(self, unischema_field, value):
    # Lazy loading pyspark to avoid creating pyspark dependency on data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))

    if unischema_field.shape:
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                         'to indicate a scalar. However, the actual shape is %s' %
                         (unischema_field.name, unischema_field.shape))

    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                                     sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)

    return value
Example 7: test_fromFile

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def test_fromFile(self):
    f = os.path.join(SmvSchemaTest.resourceTestDir(), "data/a.schema")
    s = SmvSchema.fromFile(f)
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.FloatType()))
Example 8: _count_expr

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def _count_expr(col: spark.Column, spark_type: DataType) -> spark.Column:
    # Special-case floating-point types because Spark's count treats NaN as a valid value,
    # whereas pandas' count doesn't include NaN.
    if isinstance(spark_type, (FloatType, DoubleType)):
        return F.count(F.nanvl(col, F.lit(None)))
    else:
        return F.count(col)
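To illustrate why the nanvl wrapper matters (the SparkSession and literal data here are assumptions, not part of the snippet): Spark's count treats NaN as a present value, so nanvl(col, lit(None)) first turns NaN into NULL, which count then skips, matching pandas.

from pyspark.sql import functions as F

df = spark.createDataFrame([(1.0,), (float("nan"),), (None,)], "x double")
df.select(
    F.count("x").alias("spark_count"),                       # 2: NaN is counted
    F.count(F.nanvl("x", F.lit(None))).alias("pandas_like")  # 1: NaN treated as missing
).show()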
Example 9: isnull

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def isnull(self):
    """
    Detect missing values.

    Return a boolean same-sized object indicating if the values are NA.
    NA values, such as None or numpy.NaN, get mapped to True values.
    Everything else gets mapped to False values. Characters such as empty strings '' or
    numpy.inf are not considered NA values
    (unless you set pandas.options.mode.use_inf_as_na = True).

    Returns
    -------
    Series : Mask of bool values for each element in Series
        that indicates whether an element is an NA value.

    Examples
    --------
    >>> ser = ks.Series([5, 6, np.NaN])
    >>> ser.isna()  # doctest: +NORMALIZE_WHITESPACE
    0    False
    1    False
    2     True
    Name: 0, dtype: bool

    >>> ser.rename("a").to_frame().set_index("a").index.isna()
    Index([False, False, True], dtype='object', name='a')
    """
    from databricks.koalas.indexes import MultiIndex

    if isinstance(self, MultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")
    if isinstance(self.spark.data_type, (FloatType, DoubleType)):
        return self._with_new_scol(
            self.spark.column.isNull() | F.isnan(self.spark.column)
        ).rename(self.name)
    else:
        return self._with_new_scol(self.spark.column.isNull()).rename(self.name)
Example 10: as_spark_type

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent Spark type.
    Accepts:
    - the built-in types in python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        # TODO: support other child types
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
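A few hypothetical calls showing the branches above, including the float-related ones:

as_spark_type(float)       # -> FloatType()
as_spark_type("float64")   # -> DoubleType()
as_spark_type("bigint")    # -> LongType()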
Example 11: load_users_schema

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def load_users_schema():
    """Loads schema with data type [user, [(sku, score), (sku, score)]]

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema specification for user -> (sku, score) data.
    """
    return stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
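A hypothetical DataFrame built against this schema (the SparkSession and the interaction values are made up for illustration):

rows = [("user_1", [("sku_a", 0.9), ("sku_b", 0.3)])]
df = spark.createDataFrame(rows, schema=load_users_schema())
df.printSchema()  # user: string, interactions: array<struct<item:string, score:float>>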
Example 12: load_neighbor_schema

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def load_neighbor_schema(self):
    """Loads neighborhood schema for similarity matrix

    :rtype: `pyspark.sql.types.StructType`
    :returns: schema of type ["key", [("key", "value")]]
    """
    return stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
Example 13: test_load_users_schema

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def test_load_users_schema(self):
    klass = self.get_target_klass()()
    expected = stypes.StructType(fields=[
        stypes.StructField("user", stypes.StringType()),
        stypes.StructField('interactions', stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField('item', stypes.StringType()),
                stypes.StructField('score', stypes.FloatType())])))])
    result = klass.load_users_schema()
    self.assertEqual(result, expected)
Example 14: test_load_neighbor_schema

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def test_load_neighbor_schema(self):
    klass = self.get_target_klass()()
    result = klass.load_neighbor_schema()
    expected = stypes.StructType(fields=[
        stypes.StructField("item", stypes.StringType()),
        stypes.StructField("similarity_items", stypes.ArrayType(
            stypes.StructType(fields=[
                stypes.StructField("item", stypes.StringType()),
                stypes.StructField("similarity", stypes.FloatType())])))])
    self.assertEqual(expected, result)
Example 15: test_spark_udf

# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import FloatType [as alias]
def test_spark_udf(spark, model_path):
    mlflow.pyfunc.save_model(
        path=model_path,
        loader_module=__name__,
        code_path=[os.path.dirname(tests.__file__)],
    )
    reloaded_pyfunc_model = mlflow.pyfunc.load_pyfunc(model_path)

    pandas_df = pd.DataFrame(data=np.ones((10, 10)), columns=[str(i) for i in range(10)])
    spark_df = spark.createDataFrame(pandas_df)

    # Test all supported return types
    type_map = {"float": (FloatType(), np.number),
                "int": (IntegerType(), np.int32),
                "double": (DoubleType(), np.number),
                "long": (LongType(), np.int),
                "string": (StringType(), None)}

    for tname, tdef in type_map.items():
        spark_type, np_type = tdef
        prediction_df = reloaded_pyfunc_model.predict(pandas_df)
        for is_array in [True, False]:
            t = ArrayType(spark_type) if is_array else spark_type
            if tname == "string":
                expected = prediction_df.applymap(str)
            else:
                expected = prediction_df.select_dtypes(np_type)
                if tname == "float":
                    expected = expected.astype(np.float32)

            expected = [list(row[1]) if is_array else row[1][0] for row in expected.iterrows()]

            pyfunc_udf = spark_udf(spark, model_path, result_type=t)
            new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
            actual = list(new_df.select("prediction").toPandas()['prediction'])
            assert expected == actual

            if not is_array:
                pyfunc_udf = spark_udf(spark, model_path, result_type=tname)
                new_df = spark_df.withColumn("prediction", pyfunc_udf(*pandas_df.columns))
                actual = list(new_df.select("prediction").toPandas()['prediction'])
                assert expected == actual