

Python classification.RandomForestClassifier Method Code Examples

This article collects typical usage examples of the Python method pyspark.ml.classification.RandomForestClassifier, drawn from open-source projects. If you are unsure what classification.RandomForestClassifier does, how to use it, or want to see it in real code, the curated examples below should help. You can also explore further usage examples from the containing module, pyspark.ml.classification.


The following presents 3 code examples of classification.RandomForestClassifier, ordered by popularity by default.
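Before working through the examples, here is a minimal, self-contained sketch of the basic RandomForestClassifier workflow; the toy DataFrame below is made up purely for illustration:

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("rf-demo").getOrCreate()

# Toy two-class dataset: (label, features)
df = spark.createDataFrame(
    [(0.0, Vectors.dense(0.0, 1.1)),
     (1.0, Vectors.dense(2.0, 1.0)),
     (0.0, Vectors.dense(0.1, 1.3)),
     (1.0, Vectors.dense(2.2, 0.9))],
    ["label", "features"])

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, seed=42)
model = rf.fit(df)
model.transform(df).select("label", "prediction", "probability").show()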

Example 1: get_features_importance

# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
from typing import Dict

import pyspark.ml


def get_features_importance(
    rf_pipeline: pyspark.ml.PipelineModel, rf_index: int = -2, assembler_index: int = -3
) -> Dict[str, float]:
    """
    Extract the feature importances from a PipelineModel containing a RandomForestClassifier stage.

    :param rf_pipeline: Input pipeline
    :param rf_index: index of the RandomForestClassifier stage
    :param assembler_index: index of the VectorAssembler stage
    :return: feature importance for each feature in the RF model
    """

    # The VectorAssembler's input columns give the feature names; strip the
    # "_indexed" suffix added by StringIndexer stages, if present.
    feature_names = [
        x[: -len("_indexed")] if x.endswith("_indexed") else x
        for x in rf_pipeline.stages[assembler_index].getInputCols()
    ]

    # featureImportances is a Vector aligned with the assembler's input order.
    return dict(zip(feature_names, rf_pipeline.stages[rf_index].featureImportances))
Author: broadinstitute | Project: gnomad_methods | Lines: 20 | Source: random_forest.py
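
For context, a sketch of how this helper might be called. The pipeline below is hypothetical, and train_df stands in for a DataFrame with the referenced columns; since this pipeline ends with the RF stage, the default indices are overridden explicitly:

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Hypothetical stages: index a categorical feature, assemble, then fit the RF
feat_indexer = StringIndexer(inputCol="feat_b", outputCol="feat_b_indexed")
assembler = VectorAssembler(inputCols=["feat_a", "feat_b_indexed"], outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)

pipeline = Pipeline(stages=[feat_indexer, assembler, rf])
model = pipeline.fit(train_df)  # train_df: hypothetical DataFrame with feat_a, feat_b, label

# RF is the last stage and the assembler second to last, so pass explicit
# indices; the '_indexed' suffix is stripped, e.g. {'feat_a': ..., 'feat_b': ...}
importances = get_features_importance(model, rf_index=-1, assembler_index=-2)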

Example 2: test_random_forrest_classification

# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
# Imports as used in the onnxmltools test suite; the save_data_models,
# run_onnx_model and compare_results helpers live in the repository's tests.
import inspect
import os

import numpy
import pandas
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.linalg import SparseVector, VectorUDT
from onnxmltools import convert_sparkml
from onnxmltools.convert.common.data_types import FloatTensorType, StringTensorType


def test_random_forrest_classification(self):
    this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)

    # Truncate the features to a small, fixed number of columns
    feature_count = 5
    self.spark.udf.register("truncateFeatures",
                            lambda x: SparseVector(feature_count, range(0, feature_count), x.toArray()[125:130]),
                            VectorUDT())
    data = original_data.selectExpr("cast(label as string) as label", "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                                    maxCategories=10, handleInvalid='keep')

    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml RandomForest Classifier', [
        ('label', StringTensorType([1, 1])),
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # Run the Spark model and collect the expected outputs
    predicted = model.transform(data)
    data_np = {
        'label': data.toPandas().label.values,
        'features': data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]

    # Save, reload and score the ONNX model, then compare against Spark
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction', 'probability'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
Author: onnx | Project: onnxmltools | Lines: 42 | Source: test_random_forest_classifier.py
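
Outside the test suite, the saved ONNX model can be scored with onnxruntime directly rather than through the repository's run_onnx_model helper. A sketch, assuming onnxruntime is installed and reusing data_np from the test above; the file name is hypothetical, and the exact input/output shapes depend on the converter version:

import onnxruntime as rt

sess = rt.InferenceSession("SparkmlRandomForestClassifier.onnx",
                           providers=["CPUExecutionProvider"])

# Input names must match those declared in the convert_sparkml call;
# the string label input is fed as a column vector
feeds = {
    "label": data_np["label"].reshape(-1, 1),
    "features": data_np["features"],
}
outputs = sess.run(["indexedLabel", "prediction", "probability"], feeds)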

Example 3: main

# Required import: from pyspark.ml import classification [as alias]
# Or: from pyspark.ml.classification import RandomForestClassifier [as alias]
import time

from pyspark.ml.classification import (
    DecisionTreeClassifier,
    LogisticRegression,
    MultilayerPerceptronClassifier,
    RandomForestClassifier,
)
from pyspark.sql import SparkSession
from mmtfPyspark.ml import SparkMultiClassClassifier, datasetBalancer  # mmtf-pyspark helpers


def main(argv):

    # Name of the prediction (label) column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
                        .master("local[*]") \
                        .appName("datasetClassifier") \
                        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)

    print(f"Feature count    : {featureCount}")
    classCount = int(data.select(label).distinct().count())
    print(f"Class count    : {classCount}")
    print(f"Dataset size (unbalanced)    : {data.count()}")
    data.groupby(label).count().show(classCount)

    data = datasetBalancer.downsample(data, label, 1)

    print(f"Dataset size (balanced)  : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # DecisionTree
    dtc = DecisionTreeClassifier()
    mcc = SparkMultiClassClassifier(dtc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items(): print(f"{k}\t{v}")

    # RandomForest
    rfc = RandomForestClassifier()
    mcc = SparkMultiClassClassifier(rfc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items(): print(f"{k}\t{v}")

    # LogisticRegression
    lr = LogisticRegression()
    mcc = SparkMultiClassClassifier(lr, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items(): print(f"{k}\t{v}")

    # MultilayerPerceptronClassifier
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
                                          .setBlockSize(128) \
                                          .setSeed(1234) \
                                          .setMaxIter(200)
    mcc = SparkMultiClassClassifier(mpc, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items(): print(f"{k}\t{v}")

    end = time.time()
    print("Time: %f  sec." %(end-start)) 
Author: sbl-sdsc | Project: mmtf-pyspark | Lines: 63 | Source: datasetClassifier.py
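
SparkMultiClassClassifier and datasetBalancer come from mmtf-pyspark's mmtfPyspark.ml package. As a rough, standalone sketch of what the split/fit/evaluate loop inside SparkMultiClassClassifier amounts to, using only pyspark.ml (an illustration, not the mmtf-pyspark implementation):

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer

def fit_and_score(classifier, data, label, test_fraction, seed):
    # Index the string label, split, train, and score on the held-out set
    indexed = StringIndexer(inputCol=label, outputCol="indexedLabel").fit(data).transform(data)
    train, test = indexed.randomSplit([1.0 - test_fraction, test_fraction], seed=seed)
    model = classifier.setLabelCol("indexedLabel").setFeaturesCol("features").fit(train)
    predictions = model.transform(test)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    return {m: evaluator.evaluate(predictions, {evaluator.metricName: m})
            for m in ("f1", "weightedPrecision", "weightedRecall", "accuracy")}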


Note: The pyspark.ml.classification.RandomForestClassifier examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from community open-source projects; copyright remains with the original authors, and distribution or use should follow each project's license. Do not republish without permission.