本文整理汇总了Python中pyspark.ml.feature.VectorAssembler.write方法的典型用法代码示例。如果您正苦于以下问题:Python VectorAssembler.write方法的具体用法?Python VectorAssembler.write怎么用?Python VectorAssembler.write使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pyspark.ml.feature.VectorAssembler 的用法示例。
在下文中一共展示了VectorAssembler.write方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark.ml.feature import VectorAssembler [as 别名]
# 或者: from pyspark.ml.feature.VectorAssembler import write [as 别名]
# Train a Spark ML flight-delay model (first part of the pipeline; the page
# truncates the rest of this function).  Steps visible here: bootstrap a
# SparkSession if needed, load flight features from JSON with an explicit
# schema, derive hour-of-day columns, report null counts, then bucketize
# ArrDelay and persist the Bucketizer to disk.
#
# NOTE(review): the page formatting has stripped ALL indentation from this
# function body — restore Python indentation before running.  Code lines
# below are kept verbatim; only comments were changed.
#
# base_path: root directory that contains the data/ and models/ subtrees.
def main(base_path):
APP_NAME = "train_spark_mllib_model.py"
# If there is no SparkSession, create the environment
# (relies on `sc`/`spark` possibly existing in an interactive session;
# NameError means we are running standalone and must build them here).
try:
sc and spark
except NameError as e:
import findspark
findspark.init()
import pyspark
import pyspark.sql
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
#
# Example input record (one JSON object per line):
# {
#   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
#   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
#   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
# }
#
from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
from pyspark.sql.types import StructType, StructField
from pyspark.sql.functions import udf
# Explicit schema so spark.read.json does not have to infer types.
schema = StructType([
StructField("ArrDelay", DoubleType(), True),
StructField("CRSArrTime", TimestampType(), True),
StructField("CRSDepTime", TimestampType(), True),
StructField("Carrier", StringType(), True),
StructField("DayOfMonth", IntegerType(), True),
StructField("DayOfWeek", IntegerType(), True),
StructField("DayOfYear", IntegerType(), True),
StructField("DepDelay", DoubleType(), True),
StructField("Dest", StringType(), True),
StructField("Distance", DoubleType(), True),
StructField("FlightDate", DateType(), True),
StructField("FlightNum", StringType(), True),
StructField("Origin", StringType(), True),
StructField("Route", StringType(), True),
StructField("TailNum", StringType(), True),
StructField("EngineManufacturer", StringType(), True),
StructField("EngineModel", StringType(), True),
StructField("Manufacturer", StringType(), True),
StructField("ManufacturerYear", StringType(), True),
StructField("OwnerState", StringType(), True),
])
input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
base_path
)
features = spark.read.json(input_path, schema=schema)
# Force evaluation early so schema/parse errors surface immediately.
features.first()
#
# Add the hour of day of scheduled arrival/departure
#
from pyspark.sql.functions import hour
features_with_hour = features.withColumn(
"CRSDepHourOfDay",
hour(features.CRSDepTime)
)
features_with_hour = features_with_hour.withColumn(
"CRSArrHourOfDay",
hour(features.CRSArrTime)
)
features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
#
# Check for nulls in features before using Spark ML
# (one count() action per column — slow on large data, but explicit).
#
null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
print("\nNull Value Report")
print("-----------------")
# NOTE(review): `tabulate` is not imported in the code shown — presumably a
# file-level `from tabulate import tabulate`; confirm against the full file.
print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
#
# Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into four buckets:
# early (< -15), on-time (-15..0), slightly late (0..30), very late (> 30),
# i.e. bucket indices 0-3 (the five split points below define four buckets).
#
from pyspark.ml.feature import Bucketizer
# Setup the Bucketizer
splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
arrival_bucketizer = Bucketizer(
splits=splits,
inputCol="ArrDelay",
outputCol="ArrDelayBucket"
)
# Save the model (overwrite() replaces any previous saved bucketizer).
arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
# Apply the model
ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
#
#.........这里部分代码省略.........