本文整理汇总了 Python 中 pyspark.ml.feature.VectorAssembler 类的典型用法代码示例。如果您正苦于以下问题：Python feature.VectorAssembler 的具体用法？Python feature.VectorAssembler 怎么用？Python feature.VectorAssembler 使用的例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块 pyspark.ml.feature 的用法示例。
下文共展示了 feature.VectorAssembler 类的 11 个代码示例，这些示例默认按受欢迎程度排序。您可以为喜欢或者觉得有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: select_features
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    """Build a transformer that narrows a dataframe to its selected features.

    The returned callable ranks the candidate features listed under
    ``metadata['input_feature_meta']['features']`` with
    ``mjolnir.feature_engineering.select_features`` (``algo='mrmr'``), keeps
    ``num_features`` of them, and re-assembles the ``features`` vector column
    from the survivors only.

    Side effect: the selection is stored in ``metadata['wiki_features'][wiki]``
    each time the transformer runs.

    :param wiki: key under which the selection is recorded in ``metadata``
    :param num_features: number of features requested from the selector
    :param metadata: shared dict; read for the candidate features and the
        default columns, written with the selection
    :return: a function mapping a DataFrame to the narrowed DataFrame
    """
    def transform(df: DataFrame) -> DataFrame:
        # Rank the candidates and pick the "best" ones, per the mrmr metric.
        spark_context = df.sql_ctx.sparkSession.sparkContext
        candidates = metadata['input_feature_meta']['features']
        chosen = mjolnir.feature_engineering.select_features(
            spark_context, df, candidates, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = chosen

        # Keep the default columns plus the chosen ones, pack the chosen
        # columns into a fresh `features` vector, then drop the now-redundant
        # scalar columns.
        # NOTE(review): `chosen` must be a list for the `+` below — confirm
        # against mjolnir.feature_engineering.select_features.
        narrowed = df.select(*(metadata['default_cols'] + chosen))
        packer = VectorAssembler(inputCols=chosen, outputCol='features')
        return packer.transform(narrowed).drop(*chosen)

    return transform
示例2: get_features_importance
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Map each assembled feature name to its importance in the random forest stage.

    Names come from the input columns of the VectorAssembler stage; a trailing
    ``"_indexed"`` is stripped from any name that carries it. Names and
    importances are paired positionally.

    :param rf_pipeline: fitted pipeline containing the assembler and RF stages
    :param rf_index: position of the RandomForestClassifier stage in ``stages``
    :param assembler_index: position of the VectorAssembler stage in ``stages``
    :return: dict of feature name -> importance
    """
    suffix = "_indexed"

    feature_names = []
    for column in rf_pipeline.stages[assembler_index].getInputCols():
        if column.endswith(suffix):
            column = column[: -len(suffix)]
        feature_names.append(column)

    importances = rf_pipeline.stages[rf_index].featureImportances
    return dict(zip(feature_names, importances))
示例3: build_sparkml_operator_name_map
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def build_sparkml_operator_name_map():
    """
    Build the lookup from Spark ML classes to their qualified operator names.

    Every listed class is mapped to ``"<package>." + cls.__name__``. The
    package prefix is the hard-coded one of the group the class is listed in
    (``pyspark.ml.feature``, ``pyspark.ml.classification`` or
    ``pyspark.ml.regression``); it is not derived from ``cls.__module__``.

    :return: dict keyed by class object, valued by the qualified name string
    """
    res = {k: "pyspark.ml.feature." + k.__name__ for k in [
        Binarizer, BucketedRandomProjectionLSHModel, Bucketizer,
        ChiSqSelectorModel, CountVectorizerModel, DCT, ElementwiseProduct, HashingTF, IDFModel, ImputerModel,
        IndexToString, MaxAbsScalerModel, MinHashLSHModel, MinMaxScalerModel, NGram, Normalizer, OneHotEncoderModel,
        PCAModel, PolynomialExpansion, QuantileDiscretizer, RegexTokenizer,
        StandardScalerModel, StopWordsRemover, StringIndexerModel, Tokenizer, VectorAssembler, VectorIndexerModel,
        VectorSlicer, Word2VecModel
    ]}
    res.update({k: "pyspark.ml.classification." + k.__name__ for k in [
        LinearSVCModel, LogisticRegressionModel, DecisionTreeClassificationModel, GBTClassificationModel,
        RandomForestClassificationModel, NaiveBayesModel, MultilayerPerceptronClassificationModel, OneVsRestModel
    ]})
    # Each regression model appears exactly once (a duplicated
    # GBTRegressionModel entry was removed; the resulting dict is the same).
    res.update({k: "pyspark.ml.regression." + k.__name__ for k in [
        AFTSurvivalRegressionModel, DecisionTreeRegressionModel, GBTRegressionModel,
        GeneralizedLinearRegressionModel, IsotonicRegressionModel, LinearRegressionModel, RandomForestRegressionModel
    ]})
    return res
示例4: test_model_vector_assembler
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def test_model_vector_assembler(self):
    """Convert a three-column VectorAssembler to ONNX and compare its output with Spark's."""
    input_names = ["a", "b", "c"]
    assembler = VectorAssembler(inputCols=input_names, outputCol='features')
    frame = self.spark.createDataFrame([(1., 0., 3.)], input_names)

    # One [1, 1] float tensor input per assembled column.
    onnx_inputs = [(name, FloatTensorType([1, 1])) for name in input_names]
    model_onnx = convert_sparkml(assembler, 'Sparkml VectorAssembler', onnx_inputs)
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # Reference result: run the Spark transformer and expand each output
    # vector into one row of a plain array.
    spark_output = assembler.transform(frame)
    features_pd = spark_output.select("features").toPandas().features
    expected = features_pd.apply(lambda vec: pandas.Series(vec.toArray())).values

    # ONNX inputs: every column as a float32 array.
    data_np = {
        name: frame.select(name).toPandas().values.astype(numpy.float32)
        for name in input_names
    }

    paths = save_data_models(data_np, expected, assembler, model_onnx,
                             basename="SparkmlVectorAssembler")
    # The fourth returned path is taken as the saved ONNX model file.
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
示例5: _bucketize
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def _bucketize(df, input_cols):
    """Discretize ``input_cols`` into quantile buckets and pack them into ``features``.

    Each input column is replaced by a ``<name>-bucketed`` column produced by
    a JVM ``QuantileDiscretizer``; those columns are then assembled into a
    single ``features`` vector column and dropped, as are the original input
    columns.

    :param df: Spark DataFrame containing ``input_cols``
    :param input_cols: names of the columns to bucketize
    :return: DataFrame without ``input_cols`` and with a ``features`` column
    """
    def to_java_strings(values):
        # Copy a Python sequence into a java.lang.String[] through py4j.
        gateway = SparkContext._gateway
        java_array = gateway.new_array(gateway.jvm.java.lang.String, len(values))
        for position, item in enumerate(values):
            java_array[position] = item
        return java_array

    bucketed_cols = list(map('{}-bucketed'.format, input_cols))

    # The multi-column setters are only available on the scala side, pyspark
    # doesn't expose them yet, so drive the JVM object directly.
    discretizer = JavaParams._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer")
    discretizer = discretizer.setInputCols(to_java_strings(input_cols))
    discretizer = discretizer.setOutputCols(to_java_strings(bucketed_cols))
    discretizer = discretizer.setNumBuckets(254)
    discretizer = discretizer.setRelativeError(1/2550)
    discretizer = discretizer.setHandleInvalid('error')
    fitted = discretizer.fit(df._jdf)

    # Wrap the resulting JVM dataframe back into a python DataFrame and
    # discard the raw (un-bucketed) columns.
    bucketed_jdf = fitted.transform(df._jdf)
    bucketed_df = DataFrame(bucketed_jdf, df.sql_ctx).drop(*input_cols)

    # The feature selector works on vectors, so assemble the bucketized
    # values into vector form.
    packer = VectorAssembler(inputCols=bucketed_cols, outputCol='features')
    return packer.transform(bucketed_df).drop(*bucketed_cols)
示例6: to_numeric_df
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def to_numeric_df(kdf: "ks.DataFrame") -> Tuple[pyspark.sql.DataFrame, List[Tuple[str, ...]]]:
    """
    Assemble the numeric columns of a Koalas dataframe into one vector column.

    Only columns whose dtype is one of int8/16/32/64, float32/64 or bool are
    kept. The result is a Spark dataframe with a single vector column named
    by ``CORRELATION_OUTPUT_COLUMN``.

    TODO: index is not preserved currently

    :param kdf: the Koalas dataframe.
    :return: a pair of (Spark dataframe holding the vector column, list of the
             column labels that were assembled into it)

    >>> to_numeric_df(ks.DataFrame({'A': [0, 1], 'B': [1, 0], 'C': ['x', 'y']}))
    (DataFrame[__correlation_output__: vector], [('A',), ('B',)])
    """
    # TODO, it should be more robust.
    supported_dtypes = set(map(
        np.dtype,
        (np.int8, np.int16, np.int32, np.int64, np.float32, np.float64, np.bool_),
    ))

    # Labels of the columns that can be fed to the assembler.
    numeric_column_labels = []
    for label in kdf._internal.column_labels:
        if kdf[label].dtype in supported_dtypes:
            numeric_column_labels.append(label)

    numeric_sdf = kdf._internal.spark_frame.select(
        *[kdf._internal.spark_column_for(label) for label in numeric_column_labels]
    )

    assembler = VectorAssembler(
        inputCols=numeric_sdf.columns, outputCol=CORRELATION_OUTPUT_COLUMN
    )
    vector_sdf = assembler.transform(numeric_sdf).select(CORRELATION_OUTPUT_COLUMN)
    return vector_sdf, numeric_column_labels
示例7: spark_model_iris
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def spark_model_iris(iris_df):
    """Fit an assembler + logistic-regression pipeline on the iris data.

    :param iris_df: triple of (feature names, pandas dataframe, Spark dataframe)
    :return: SparkModelWithData holding the fitted pipeline model, both input
        dataframes and the model's predictions on the Spark dataframe
    """
    feature_names, iris_pandas_df, iris_spark_df = iris_df

    stages = [
        VectorAssembler(inputCols=feature_names, outputCol="features"),
        LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8),
    ]
    fitted_pipeline = Pipeline(stages=stages).fit(iris_spark_df)

    # Score the training data itself and pull the predictions to the driver.
    scored_rows = fitted_pipeline.transform(iris_spark_df).select("prediction").collect()
    preds = [row.prediction for row in scored_rows]

    return SparkModelWithData(
        model=fitted_pipeline,
        spark_df=iris_spark_df,
        pandas_df=iris_pandas_df,
        predictions=preds,
    )
示例8: spark_model_transformer
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def spark_model_transformer(iris_df):
    """Wrap a bare VectorAssembler and its output on the iris data.

    :param iris_df: triple of (feature names, pandas dataframe, Spark dataframe)
    :return: SparkModelWithData whose model is the assembler itself and whose
        predictions are the assembled ``features`` vectors
    """
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")

    # Nothing is fitted here: the assembler is applied directly.
    assembled_rows = assembler.transform(iris_spark_df).select("features").collect()
    preds = [row.features for row in assembled_rows]

    return SparkModelWithData(
        model=assembler,
        spark_df=iris_spark_df,
        pandas_df=iris_pandas_df,
        predictions=preds,
    )
示例9: spark_model_estimator
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def spark_model_estimator(iris_df, spark_context):
    """Fit a standalone logistic regression on pre-assembled iris features.

    Unlike a pipeline, the assembler is applied up front and the returned
    Spark dataframe is the already-assembled one.

    :param iris_df: triple of (feature names, pandas dataframe, Spark dataframe)
    :param spark_context: not referenced in the body
        (NOTE(review): presumably requested only so a Spark context exists
        before this runs — confirm against the caller)
    :return: SparkModelWithData holding the fitted model, the assembled Spark
        dataframe, the pandas dataframe and the predictions
    """
    feature_names, iris_pandas_df, iris_spark_df = iris_df

    packer = VectorAssembler(inputCols=feature_names, outputCol="features")
    features_df = packer.transform(iris_spark_df)

    # Train on the assembled features, then score the same data.
    estimator = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    fitted = estimator.fit(features_df)
    scored_rows = fitted.transform(features_df).select("prediction").collect()
    preds = [row.prediction for row in scored_rows]

    return SparkModelWithData(
        model=fitted,
        spark_df=features_df,
        pandas_df=iris_pandas_df,
        predictions=preds,
    )
示例10: test_model_pipeline_3_stage
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def test_model_pipeline_3_stage(self):
    """Convert an indexer -> one-hot -> assembler pipeline to ONNX and compare outputs."""
    # Locate the CSV fixture relative to the file this test is defined in.
    current_file = inspect.getfile(inspect.currentframe())
    this_script_dir = os.path.dirname(os.path.abspath(current_file))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")

    reader = self.spark.read.format('csv').options(header='true', inferschema='true')
    full_data = reader.load(input_path)

    cols = ['workclass', 'education', 'marital_status']
    subset = full_data.select(*cols).limit(1000)
    training_data, test_data = subset.randomSplit([0.9, 0.1], seed=1)

    stages = []
    for name in cols:
        stages.extend([
            StringIndexer(inputCol=name, outputCol=name + '_index', handleInvalid='skip'),
            # dropLast=False is needed here: otherwise, once the vectors are
            # assembled together (below), the features cannot be expanded
            # without difficulties.
            OneHotEncoderEstimator(
                inputCols=[name + '_index'], outputCols=[name + '_vec'], dropLast=False),
        ])
    stages.append(VectorAssembler(inputCols=[name + '_vec' for name in cols], outputCol='features'))

    model = Pipeline(stages=stages).fit(training_data)

    # One [1, 1] string tensor input per raw column.
    onnx_inputs = [(name, StringTensorType([1, 1])) for name in cols]
    model_onnx = convert_sparkml(model, 'Sparkml Pipeline', onnx_inputs)
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # Reference result from Spark, and the same inputs as arrays for ONNX.
    predicted = model.transform(test_data)
    data_np = {name: test_data.select(name).toPandas().values for name in cols}
    features_pd = predicted.toPandas().features
    expected = features_pd.apply(lambda vec: pandas.Series(vec.toArray())).values

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_3Stage")
    # The fourth returned path is taken as the saved ONNX model file.
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['features'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
示例11: test_model_pipeline_4_stage
# 需要导入模块: from pyspark.ml import feature [as 别名]
# 或者: from pyspark.ml.feature import VectorAssembler [as 别名]
def test_model_pipeline_4_stage(self):
    """Convert a full classification pipeline to ONNX and compare its outputs with Spark's.

    The pipeline indexes and one-hot encodes three categorical columns,
    assembles them into ``features``, indexes ``income`` into ``label`` and
    fits a logistic regression. The ONNX model's ``label``, ``prediction``
    and ``probability`` outputs are compared with the Spark results.
    """
    # Locate the CSV fixture relative to the file this test is defined in.
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "AdultCensusIncomeOriginal.csv")
    full_data = (
        self.spark.read.format('csv')
        .options(header='true', inferschema='true')
        .load(input_path)
    )
    cols = ['workclass', 'education', 'marital_status']
    input_names = ['income'] + cols
    training_data, test_data = (
        full_data.select('income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)
    )

    stages = []
    for name in cols:
        stages.append(StringIndexer(inputCol=name, outputCol=name + '_index', handleInvalid='skip'))
        stages.append(OneHotEncoderEstimator(
            inputCols=[name + '_index'], outputCols=[name + '_vec'], dropLast=False))
    stages.append(VectorAssembler(inputCols=[c + '_vec' for c in cols], outputCol='features'))
    stages.append(StringIndexer(inputCol='income', outputCol='label', handleInvalid='skip'))
    stages.append(LogisticRegression(maxIter=100, tol=0.0001))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)

    # One [1, 1] string tensor input per raw column, 'income' first.
    model_onnx = convert_sparkml(
        model, 'Sparkml Pipeline',
        [(name, StringTensorType([1, 1])) for name in input_names])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    # run the model
    predicted = model.transform(test_data)
    data_np = {name: test_data.select(name).toPandas().values for name in input_names}

    # Collect the scored frame to pandas once, so the three expected arrays
    # come from the same materialised result instead of three Spark runs.
    predicted_pd = predicted.toPandas()
    expected = [
        predicted_pd.label.values.astype(numpy.float32),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPipeline_4Stage")
    # The fourth returned path is taken as the saved ONNX model file.
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['label', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)