This article collects typical usage examples of pyspark.sql.types.DoubleType in Python. If you are wondering what DoubleType is for, how to use it, or are simply looking for real-world examples, the curated code samples below may help. You can also explore further usage examples from the module it belongs to, pyspark.sql.types.
A total of 15 code examples of types.DoubleType are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
Example 1: main
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def main():
    # sqlContext, inputs1, output and get_range are defined elsewhere in the original script
    temp_schema = StructType([
        StructField('StationID', StringType(), False),
        StructField('DateTime', StringType(), False),
        StructField('Observation', StringType(), False),
        StructField('DataValue', DoubleType(), False),
        StructField('MFlag', StringType(), True),
        StructField('QFlag', StringType(), True),
        StructField('SFlag', StringType(), True),
        StructField('OBSTime', StringType(), True),
    ])
    df = sqlContext.read.format('com.databricks.spark.csv').options(header='false').load(inputs1, schema=temp_schema)
    df = df.filter(df.QFlag == '')
    dfrange = get_range(df)
    result = dfrange.rdd.map(lambda r: str(r.DateTime)+' '+str(r.StationID)+' '+str(r.MaxRange))
    outdata = result.sortBy(lambda r: r[0]).coalesce(1)
    outdata.saveAsTextFile(output)
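For comparison, the same explicit-schema CSV read can be done with Spark's built-in reader. This is only a sketch under assumed names (a SparkSession called spark and a hypothetical input path weather.csv), not part of the original script:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.master("local[1]").getOrCreate()
schema = StructType([
    StructField('StationID', StringType(), False),
    StructField('DataValue', DoubleType(), False),
])
# header=False mirrors the options(header='false') call above
df = spark.read.csv('weather.csv', schema=schema, header=False)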
Example 2: _convert_precision
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def _convert_precision(df, dtype):
    if dtype is None:
        return df
    if dtype != "float32" and dtype != "float64":
        raise ValueError("dtype {} is not supported. "
                         "Use 'float32' or 'float64'.".format(dtype))
    source_type, target_type = (DoubleType, FloatType) \
        if dtype == "float32" else (FloatType, DoubleType)
    logger.warning("Converting floating-point columns to %s", dtype)
    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(target_type()))
        elif isinstance(field.dataType, ArrayType) and \
                isinstance(field.dataType.elementType, source_type):
            df = df.withColumn(col_name, df[col_name].cast(ArrayType(target_type())))
    return df
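A minimal usage sketch of the helper above, assuming it is imported from its module (together with the module-level logger it uses) and that a SparkSession named spark is active:

df = spark.createDataFrame([(1.0, [2.0, 3.0])], ['x', 'xs'])  # both columns inferred as DoubleType
converted = _convert_precision(df, 'float32')
converted.printSchema()  # x: float, xs: array<float>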
Example 3: _transform
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def _transform(self, dataset):
    if any([field.dataType == DoubleType() for field in dataset.schema]):
        logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                       "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                       "fed to input tensors of type tf.float64. To feed dataframe data to "
                       "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                       "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

    graph_def = self._optimize_for_inference()
    input_mapping = self.getInputMapping()
    output_mapping = self.getOutputMapping()

    graph = tf.Graph()
    with tf.Session(graph=graph):
        analyzed_df = tfs.analyze(dataset)
        out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
        # Load graph
        tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)
        # Feed dict maps from placeholder name to DF column name
        feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
        fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]
        out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
        # We still have to rename output columns
        for tnsr_name, new_colname in output_mapping:
            old_colname = tfx.op_name(tnsr_name, graph)
            if old_colname != new_colname:
                out_df = out_df.withColumnRenamed(old_colname, new_colname)

    return out_df
Example 4: _transform
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def _transform(self, dataset):
    inp = self.getOrDefault(self.inputCol)
    out = self.getOrDefault(self.predictionCol)
    mod_str = self.getOrDefault(self.modStr)
    use_vector_out = self.getOrDefault(self.useVectorOut)

    model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
    model_broadcast = dataset._sc.broadcast(model)

    def predict_vec(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        return Vectors.dense(model(x_data).detach().numpy().flatten())

    def predict_float(data):
        features = data.toArray().reshape((1, len(data)))
        x_data = torch.from_numpy(features).float()
        model = model_broadcast.value
        model.eval()
        raw_prediction = model(x_data).detach().numpy().flatten()
        if len(raw_prediction) > 1:
            return float(np.argmax(raw_prediction))
        return float(raw_prediction[0])

    if use_vector_out:
        udfGenerateCode = F.udf(predict_vec, VectorUDT())
    else:
        udfGenerateCode = F.udf(predict_float, DoubleType())

    return dataset.withColumn(out, udfGenerateCode(inp))
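The key pattern in this transformer is wrapping a plain Python prediction function as a DoubleType UDF. A stripped-down sketch of that pattern without PyTorch, assuming an active SparkSession named spark:

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Any Python callable that returns a float can back a DoubleType UDF
square = F.udf(lambda v: float(v) ** 2, DoubleType())
spark.range(5).withColumn('sq', square('id')).show()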
Example 5: _transform
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def _transform(self, df):
    """Private transform method of a Transformer. This serves as the batch-prediction
    method for our purposes.
    """
    output_col = self.getOutputCol()
    label_col = self.getLabelCol()
    new_schema = copy.deepcopy(df.schema)
    new_schema.add(StructField(output_col, StringType(), True))
    rdd = df.rdd.coalesce(1)
    features = np.asarray(
        rdd.map(lambda x: from_vector(x.features)).collect())
    # Note that we collect, since executing this on the rdd would require model serialization once again
    model = model_from_yaml(self.get_keras_model_config())
    model.set_weights(self.weights.value)
    predictions = rdd.ctx.parallelize(
        model.predict_classes(features)).coalesce(1)
    predictions = predictions.map(lambda x: tuple(str(x)))
    results_rdd = rdd.zip(predictions).map(lambda x: x[0] + x[1])
    results_df = df.sql_ctx.createDataFrame(results_rdd, new_schema)
    results_df = results_df.withColumn(
        output_col, results_df[output_col].cast(DoubleType()))
    results_df = results_df.withColumn(
        label_col, results_df[label_col].cast(DoubleType()))
    return results_df
Example 6: _numpy_to_spark_mapping
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def _numpy_to_spark_mapping():
    """Returns a mapping from numpy to pyspark.sql type. Caches the mapping dictionary in order to avoid
    instantiating multiple objects in each call."""

    # Refer to the attribute of the function we use to cache the map using a name in a variable instead of 'dot'
    # notation to avoid copy/paste/typo mistakes
    cache_attr_name = 'cached_numpy_to_pyspark_types_map'
    if not hasattr(_numpy_to_spark_mapping, cache_attr_name):
        import pyspark.sql.types as T
        setattr(_numpy_to_spark_mapping, cache_attr_name,
                {
                    np.int8: T.ByteType(),
                    np.uint8: T.ShortType(),
                    np.int16: T.ShortType(),
                    np.uint16: T.IntegerType(),
                    np.int32: T.IntegerType(),
                    np.int64: T.LongType(),
                    np.float32: T.FloatType(),
                    np.float64: T.DoubleType(),
                    np.string_: T.StringType(),
                    np.str_: T.StringType(),
                    np.unicode_: T.StringType(),
                    np.bool_: T.BooleanType(),
                })
    return getattr(_numpy_to_spark_mapping, cache_attr_name)


# TODO: Changing fields in this class or the UnischemaField will break reading due to the schema being pickled next to
# the dataset on disk
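A quick lookup against the cached mapping, assuming the function above is importable from its module:

import numpy as np

mapping = _numpy_to_spark_mapping()
print(mapping[np.float64])  # DoubleType()
print(mapping[np.int32])    # IntegerType()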
Example 7: test_decode_numpy_scalar_with_explicit_scalar_codec
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def test_decode_numpy_scalar_with_explicit_scalar_codec():
    """Decoding a row that has a field with the codec set explicitly"""
    MatrixSchema = Unischema('TestSchema', [UnischemaField('scalar', np.float64, (), ScalarCodec(DoubleType()), False)])
    row = {'scalar': 42.0}
    decoded_value = decode_row(row, MatrixSchema)['scalar']
    assert decoded_value == 42
    assert isinstance(decoded_value, np.float64)
Example 8: encode
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def encode(self, unischema_field, value):
    # Lazily load pyspark to avoid creating a pyspark dependency on the data reading code path
    # (currently works only with make_batch_reader). We should move all pyspark related code into a separate module.
    import pyspark.sql.types as sql_types

    # We treat ndarrays with shape=() as scalars
    unsized_numpy_array = isinstance(value, np.ndarray) and value.shape == ()
    # Validate the input to be a scalar (or an unsized numpy array)
    if not unsized_numpy_array and hasattr(value, '__len__') and (not isinstance(value, str)):
        raise TypeError('Expected a scalar as a value for field \'{}\'. '
                        'Got a non-numpy type \'{}\''.format(unischema_field.name, type(value)))
    if unischema_field.shape:
        raise ValueError('The shape field of unischema_field \'%s\' must be an empty tuple (i.e. \'()\') '
                         'to indicate a scalar. However, the actual shape is %s',
                         unischema_field.name, unischema_field.shape)
    if isinstance(self._spark_type, (sql_types.ByteType, sql_types.ShortType, sql_types.IntegerType,
                                     sql_types.LongType)):
        return int(value)
    if isinstance(self._spark_type, (sql_types.FloatType, sql_types.DoubleType)):
        return float(value)
    if isinstance(self._spark_type, sql_types.BooleanType):
        return bool(value)
    if isinstance(self._spark_type, sql_types.StringType):
        if not isinstance(value, str):
            raise ValueError(
                'Expected a string value for field {}. Got type {}'.format(unischema_field.name, type(value)))
        return str(value)
    return value
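A direct call to the codec, sketched under the assumption that petastorm's ScalarCodec and UnischemaField are available as in example 7:

import numpy as np
from petastorm.codecs import ScalarCodec
from petastorm.unischema import UnischemaField
from pyspark.sql.types import DoubleType

field = UnischemaField('scalar', np.float64, (), ScalarCodec(DoubleType()), False)
# DoubleType maps to the float branch above, so the encoded value comes back as a Python float
print(ScalarCodec(DoubleType()).encode(field, np.float64(42.0)))  # -> 42.0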
Example 9: outputDataType
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def outputDataType(self):
    return DoubleType()
Example 10: validateInputType
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def validateInputType(self, inputType):
    if inputType != DoubleType():
        raise TypeError("Bad input type: {}. ".format(inputType) +
                        "Requires Double.")
Example 11: test_unary_transformer_validate_input_type
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def test_unary_transformer_validate_input_type(self):
    shiftVal = 3
    transformer = MockUnaryTransformer(shiftVal=shiftVal)\
        .setInputCol("input").setOutputCol("output")

    # should not raise any errors
    transformer.validateInputType(DoubleType())

    with self.assertRaises(TypeError):
        # passing the wrong input type should raise an error
        transformer.validateInputType(IntegerType())
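Examples 9-11 come from a mock UnaryTransformer used in Spark's own tests. Below is a self-contained sketch of such a transformer; the class name and its shift parameter are illustrative, not the actual MockUnaryTransformer:

from pyspark.ml.base import UnaryTransformer
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.sql.types import DoubleType

class ShiftDoubleTransformer(UnaryTransformer):
    """Adds a constant shift to a DoubleType input column."""

    shift = Param(Params._dummy(), "shift", "constant added to each input value",
                  typeConverter=TypeConverters.toFloat)

    def __init__(self, shift=0.0):
        super(ShiftDoubleTransformer, self).__init__()
        self._set(shift=shift)

    def createTransformFunc(self):
        s = self.getOrDefault(self.shift)
        return lambda v: v + s

    def outputDataType(self):
        return DoubleType()

    def validateInputType(self, inputType):
        if inputType != DoubleType():
            raise TypeError("Bad input type: {}. Requires Double.".format(inputType))

Usage would follow the same setInputCol/setOutputCol pattern as in the test above, e.g. ShiftDoubleTransformer(shift=1.5).setInputCol("x").setOutputCol("y").transform(df).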
Example 12: __init__
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)]))
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
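This constructor mirrors the stock pyspark.mllib API. A usage sketch with that public class, assuming an active SparkContext named sc:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

# RDD of (score, label) pairs, both stored as doubles
scores_and_labels = sc.parallelize([(0.9, 1.0), (0.2, 0.0), (0.7, 1.0), (0.4, 0.0)])
metrics = BinaryClassificationMetrics(scores_and_labels)
print(metrics.areaUnderROC)
print(metrics.areaUnderPR)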
Example 13: test_fromString
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def test_fromString(self):
    s = SmvSchema.fromString("a:string; b:double")
    fields = s.spark_schema.fields
    assert(len(fields) == 2)
    assert(fields[0] == st.StructField('a', st.StringType()))
    assert(fields[1] == st.StructField('b', st.DoubleType()))
Example 14: load_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def load_schema(analysis_path):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }

    schema_file = os.path.join(analysis_path, 'schema.json')
    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)
Example 15: load_schema
# Required import: from pyspark.sql import types [as alias]
# Or: from pyspark.sql.types import DoubleType [as alias]
def load_schema(schema_file):
    type_map = {
        'KEY': StringType(),
        'NUMBER': DoubleType(),
        'CATEGORY': StringType(),
        'TEXT': StringType(),
        'IMAGE_URL': StringType()
    }

    schema_json = json.loads(file_io.read_file_to_string(schema_file))
    fields = [StructField(x['name'], type_map[x['type']]) for x in schema_json]
    return schema_json, StructType(fields)
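To make the mapping in examples 14 and 15 concrete, here is a hypothetical schema.json and the schema it would produce (the file contents are illustrative only):

# schema.json:
#   [{"name": "key", "type": "KEY"},
#    {"name": "price", "type": "NUMBER"},
#    {"name": "category", "type": "CATEGORY"}]
#
# load_schema(...) then returns the parsed JSON together with:
#   StructType([StructField('key', StringType()),
#               StructField('price', DoubleType()),
#               StructField('category', StringType())])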