本文整理汇总了Python中pyspark.ml.feature.VectorAssembler.load方法的典型用法代码示例。如果您正苦于以下问题:Python VectorAssembler.load方法的具体用法?Python VectorAssembler.load怎么用?Python VectorAssembler.load使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.ml.feature.VectorAssembler的用法示例。
在下文中一共展示了VectorAssembler.load方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark.ml.feature import VectorAssembler [as 别名]
# 或者: from pyspark.ml.feature.VectorAssembler import load [as 别名]
def main(base_path):
APP_NAME = "make_predictions_streaming.py"
# Process data every 10 seconds
PERIOD = 10
BROKERS = 'localhost:9092'
PREDICTION_TOPIC = 'flight_delay_classification_request'
try:
sc and ssc
except NameError as e:
import findspark
# Add the streaming package and initialize
findspark.add_packages(["org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0"])
findspark.init()
import pyspark
import pyspark.sql
import pyspark.streaming
conf = SparkConf().set("spark.default.parallelism", 1)
sc = SparkContext(appName="Agile Data Science: PySpark Streaming 'Hello, World!'", conf=conf)
ssc = StreamingContext(sc, PERIOD)
spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
#
# Load all models to be used in making predictions
#
# Load the arrival delay bucketizer
from pyspark.ml.feature import Bucketizer
arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
arrival_bucketizer = Bucketizer.load(arrival_bucketizer_path)
# Load all the string field vectorizer pipelines into a dict
from pyspark.ml.feature import StringIndexerModel
string_indexer_models = {}
for column in ["Carrier", "Origin", "Dest", "Route"]:
string_indexer_model_path = "{}/models/string_indexer_model_{}.bin".format(
base_path,
column
)
string_indexer_model = StringIndexerModel.load(string_indexer_model_path)
string_indexer_models[column] = string_indexer_model
# Load the numeric vector assembler
from pyspark.ml.feature import VectorAssembler
vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
vector_assembler = VectorAssembler.load(vector_assembler_path)
# Load the classifier model
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
random_forest_model_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
base_path
)
rfc = RandomForestClassificationModel.load(
random_forest_model_path
)
#
# Process Prediction Requests in Streaming
#
stream = KafkaUtils.createDirectStream(
ssc,
[PREDICTION_TOPIC],
{
"metadata.broker.list": BROKERS,
"group.id": "0",
}
)
object_stream = stream.map(lambda x: json.loads(x[1]))
object_stream.pprint()
row_stream = object_stream.map(
lambda x: Row(
FlightDate=iso8601.parse_date(x['FlightDate']),
Origin=x['Origin'],
Distance=x['Distance'],
DayOfMonth=x['DayOfMonth'],
DayOfYear=x['DayOfYear'],
UUID=x['UUID'],
DepDelay=x['DepDelay'],
DayOfWeek=x['DayOfWeek'],
FlightNum=x['FlightNum'],
Dest=x['Dest'],
Timestamp=iso8601.parse_date(x['Timestamp']),
Carrier=x['Carrier']
)
)
row_stream.pprint()
#
# Create a dataframe from the RDD-based object stream
#
def classify_prediction_requests(rdd):
#.........这里部分代码省略.........