This article collects typical usage examples of Python's pyspark.ml.classification.RandomForestClassifier. If you have been wondering what classification.RandomForestClassifier is for, how to call it, or what idiomatic usage looks like, the curated code examples below should help. You can also explore the containing module, pyspark.ml.classification, for related functionality.
Three code examples of classification.RandomForestClassifier are shown below, ordered by popularity.
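Before the collected examples, here is a minimal, self-contained sketch of the class's basic fit/transform cycle; the data and parameters are invented purely for illustration:

# A minimal sketch: train a RandomForestClassifier on a toy DataFrame
# and score the training data. All values here are made up.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.0]), 0.0), (Vectors.dense([1.0, 0.0]), 1.0)],
    ["features", "label"],
)
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=10, seed=42)
model = rf.fit(df)
model.transform(df).select("features", "prediction", "probability").show()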
Example 1: get_features_importance
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
from typing import Dict

import pyspark.ml


def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Extract the feature importances from a PipelineModel containing a
    RandomForestClassifier stage.

    :param rf_pipeline: input pipeline model
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """
    # Recover the original column names by stripping the "_indexed" suffix
    # that indexed feature columns carry.
    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]
    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
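A minimal usage sketch for this helper, assuming a hypothetical two-stage pipeline (a VectorAssembler followed by a RandomForestClassifier); the column names and data are invented for illustration, and the stage indices are passed explicitly because the defaults assume a longer pipeline:

# A minimal sketch, assuming a local SparkSession and made-up columns.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
train_df = spark.createDataFrame(
    [(25.0, 1.0, 0.0), (40.0, 3.0, 1.0), (31.0, 2.0, 1.0), (52.0, 0.0, 0.0)],
    ["age", "income_indexed", "label"],
)
assembler = VectorAssembler(inputCols=["age", "income_indexed"], outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
model = Pipeline(stages=[assembler, rf]).fit(train_df)

# Only two stages here, so override the default indices.
print(get_features_importance(model, rf_index=-1, assembler_index=-2))
# e.g. {'age': 0.47, 'income': 0.53} -- note the '_indexed' suffix is stripped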
Example 2: test_random_forrest_classification
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
import inspect
import os

import numpy
import pandas
from onnxmltools import convert_sparkml
from onnxmltools.convert.common.data_types import FloatTensorType, StringTensorType
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.linalg import SparseVector, VectorUDT
# save_data_models, run_onnx_model and compare_results are helpers from the
# onnxmltools sparkml test utilities; their exact import path may vary.


def test_random_forrest_classification(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    # Truncate the feature space: keep a fixed 5-feature window of the
    # original vector (indices 125..129).
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(feature_count), x.toArray()[125:130]),
        VectorUDT(),
    )
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features",
    )
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid="keep")
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, "Sparkml RandomForest Classifier", [
        ("label", StringTensorType([1, 1])),
        ("features", FloatTensorType([1, feature_count])),
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # Run the Spark model and collect the expected outputs.
    predicted = model.transform(data)
    data_pd = data.toPandas()
    predicted_pd = predicted.toPandas()
    data_np = {
        "label": data_pd.label.values,
        "features": data_pd.features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32),
    }
    expected = [
        predicted_pd.indexedLabel.values.astype(numpy.int64),
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(["indexedLabel", "prediction", "probability"], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
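For reference, a rough sketch of scoring the exported model directly with onnxruntime instead of the test helpers. This assumes the model_onnx and data_np objects from the test above; note the initial types above declare a batch dimension of 1, so a single row is fed here:

# A rough sketch, assuming onnxruntime is installed and model_onnx /
# data_np come from the test above.
import onnxruntime as rt

sess = rt.InferenceSession(model_onnx.SerializeToString(),
                           providers=["CPUExecutionProvider"])
# The declared shapes are [1, 1] and [1, feature_count], so feed one row.
single_row = {
    "label": data_np["label"][:1].reshape(1, 1),
    "features": data_np["features"][:1],
}
outputs = sess.run(None, single_row)  # None fetches all declared outputs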
Example 3: main
# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
import sys
import time

from pyspark.ml.classification import (DecisionTreeClassifier, LogisticRegression,
                                       MultilayerPerceptronClassifier, RandomForestClassifier)
from pyspark.sql import SparkSession

# SparkMultiClassClassifier and datasetBalancer come from the mmtfPyspark
# project; the import path below is assumed and may need adjusting.
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer


def main(argv):
    # Name of the label (prediction target) column
    label = argv[1]
    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetClassifier") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)
    print(f"Feature count            : {featureCount}")

    classCount = int(data.select(label).distinct().count())
    print(f"Class count              : {classCount}")

    print(f"Dataset size (unbalanced): {data.count()}")
    data.groupby(label).count().show(classCount)
    data = datasetBalancer.downsample(data, label, 1)
    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier: one hidden layer of 10 units
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
        .setBlockSize(128) \
        .setSeed(1234) \
        .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")

    end = time.time()
    print(f"Time: {end - start:f} sec.")


if __name__ == "__main__":
    main(sys.argv[1:])