This article collects typical usage examples of Python's pyspark.ml.classification.RandomForestClassifier. If you have been wondering what classification.RandomForestClassifier is for, how to call it, or what idiomatic usage looks like, the curated code examples below should help. You can also explore the containing module, pyspark.ml.classification, for related functionality.
Three code examples of classification.RandomForestClassifier are shown below, ordered by popularity.
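Before the collected examples, here is a minimal, self-contained sketch of the class's basic fit/transform cycle; the data and parameters are invented purely for illustration:

# A minimal sketch: train a RandomForestClassifier on a toy DataFrame
# and score the training data. All values here are made up.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.0]), 0.0), (Vectors.dense([1.0, 0.0]), 1.0)],
    ["features", "label"],
)
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=10, seed=42)
model = rf.fit(df)
model.transform(df).select("features", "prediction", "probability").show()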
Example 1: get_features_importance
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
from typing import Dict

import pyspark.ml


def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Extract the feature importances from a PipelineModel containing a
    RandomForestClassifier stage.

    :param rf_pipeline: input pipeline model
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """
    # Recover the original column names by stripping the "_indexed" suffix
    # that indexed feature columns carry.
    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]
    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
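A minimal usage sketch for this helper, assuming a hypothetical two-stage pipeline (a VectorAssembler followed by a RandomForestClassifier); the column names and data are invented for illustration, and the stage indices are passed explicitly because the defaults assume a longer pipeline:

# A minimal sketch, assuming a local SparkSession and made-up columns.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
train_df = spark.createDataFrame(
    [(25.0, 1.0, 0.0), (40.0, 3.0, 1.0), (31.0, 2.0, 1.0), (52.0, 0.0, 0.0)],
    ["age", "income_indexed", "label"],
)
assembler = VectorAssembler(inputCols=["age", "income_indexed"], outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
model = Pipeline(stages=[assembler, rf]).fit(train_df)

# Only two stages here, so override the default indices.
print(get_features_importance(model, rf_index=-1, assembler_index=-2))
# e.g. {'age': 0.47, 'income': 0.53} -- note the '_indexed' suffix is stripped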
Example 2: test_random_forrest_classification
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
import inspect
import os

import numpy
import pandas
from onnxmltools import convert_sparkml
from onnxmltools.convert.common.data_types import FloatTensorType, StringTensorType
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.linalg import SparseVector, VectorUDT
# save_data_models, run_onnx_model and compare_results are helpers from the
# onnxmltools sparkml test utilities; their exact import path may vary.


def test_random_forrest_classification(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    # Truncate the feature space: keep a fixed 5-feature window of the
    # original vector (indices 125..129).
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(feature_count), x.toArray()[125:130]),
        VectorUDT(),
    )
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features",
    )
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid="keep")
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, "Sparkml RandomForest Classifier", [
        ("label", StringTensorType([1, 1])),
        ("features", FloatTensorType([1, feature_count])),
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # Run the Spark model and collect the expected outputs.
    predicted = model.transform(data)
    data_pd = data.toPandas()
    predicted_pd = predicted.toPandas()
    data_np = {
        "label": data_pd.label.values,
        "features": data_pd.features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32),
    }
    expected = [
        predicted_pd.indexedLabel.values.astype(numpy.int64),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(["indexedLabel", "prediction", "probability"], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
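For reference, a rough sketch of scoring the exported model directly with onnxruntime instead of the test helpers. This assumes the model_onnx and data_np objects from the test above; note the initial types above declare a batch dimension of 1, so a single row is fed here:

# A rough sketch, assuming onnxruntime is installed and model_onnx /
# data_np come from the test above.
import onnxruntime as rt

sess = rt.InferenceSession(model_onnx.SerializeToString(),
                           providers=["CPUExecutionProvider"])
# The declared shapes are [1, 1] and [1, feature_count], so feed one row.
single_row = {
    "label": data_np["label"][:1].reshape(1, 1),
    "features": data_np["features"][:1],
}
outputs = sess.run(None, single_row)  # None fetches all declared outputs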
Example 3: main
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
import sys
import time

from pyspark.ml.classification import (DecisionTreeClassifier, LogisticRegression,
                                       MultilayerPerceptronClassifier, RandomForestClassifier)
from pyspark.sql import SparkSession

# SparkMultiClassClassifier and datasetBalancer come from the mmtfPyspark
# project; the import path below is assumed and may need adjusting.
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer


def main(argv):
    # Name of the label (prediction target) column
    label = argv[1]
    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetClassifier") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)
    print(f"Feature count            : {featureCount}")

    classCount = int(data.select(label).distinct().count())
    print(f"Class count              : {classCount}")

    print(f"Dataset size (unbalanced): {data.count()}")
    data.groupby(label).count().show(classCount)
    data = datasetBalancer.downsample(data, label, 1)
    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier: one hidden layer of 10 units
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
        .setBlockSize(128) \
        .setSeed(1234) \
        .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print(f"Time: {end - start:f} sec.")


if __name__ == "__main__":
    main(sys.argv[1:])