

Python evaluation.MulticlassClassificationEvaluator Class Code Examples

This article collects typical usage examples of the Python class pyspark.ml.evaluation.MulticlassClassificationEvaluator. If you have been wondering how the MulticlassClassificationEvaluator class is used in practice, what it is for, or what working examples of it look like, the hand-picked class code examples below should help.


The following presents 15 code examples of the MulticlassClassificationEvaluator class, sorted by popularity by default.
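Before the project-specific examples, here is a minimal sketch of the class's typical usage. It assumes a Spark 2.x+ SparkSession; the toy DataFrame and column names are illustrative stand-ins for the output of model.transform(testData), not code taken from any of the examples below.

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName("evaluator_sketch").getOrCreate()

# Toy (prediction, label) pairs standing in for the output of model.transform(testData).
predictions = spark.createDataFrame(
    [(0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (2.0, 2.0)],
    ["prediction", "label"])

# "f1" is the default metric; Spark 2.x also accepts "accuracy", "weightedPrecision"
# and "weightedRecall" (Spark 1.x additionally accepted "precision" and "recall",
# which several of the examples below still use).
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("Accuracy = %g" % evaluator.evaluate(predictions))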

Example 1: RunRandomForest

def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is given as a list of column names.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print 'Accuracy of RandomForest = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
Author: Sunhick, Project: music-cognita, Lines: 30, Source: genre_classification.py

Example 2: testLogisticMLPipeline1

 def testLogisticMLPipeline1(self):
     training = sqlCtx.createDataFrame([
         ("a b c d e spark", 1.0),
         ("b d", 2.0),
         ("spark f g h", 1.0),
         ("hadoop mapreduce", 2.0),
         ("b spark who", 1.0),
         ("g d a y", 2.0),
         ("spark fly", 1.0),
         ("was mapreduce", 2.0),
         ("e spark program", 1.0),
         ("a e c l", 2.0),
         ("spark compile", 1.0),
         ("hadoop software", 2.0)
         ], ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
     lr = LogisticRegression(sqlCtx)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([
         ("spark i j k", 1.0),
         ("l m n", 2.0),
         ("mapreduce spark", 1.0),
         ("apache hadoop", 2.0)], ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     self.failUnless(score == 1.0)
Author: d-behi, Project: incubator-systemml, Lines: 30, Source: test_mllearn.py

Example 3: main

def main(sc, spark):
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)

    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)

    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])

    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    # Create the model
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)

    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    gbtModel = model.stages[2]
    print(gbtModel)  # summary only
Author: yokeyong, Project: atap, Lines: 34, Source: sc_classification.py

Example 4: textPredict

def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    return render(request,{'resultList':resultList})
Author: JallyHe, Project: networkPublicOpinionAnalysisSystem, Lines: 58, Source: views.py

Example 5: sparking_your_interest

def sparking_your_interest():
	df = SQLContext.read.json('speeches_dataset.json')
	df_fillna=df.fillna("")
	print(df_fillna.count())
	print(df_fillna.printSchema())

	df_utf=call_utf_encoder(df)
	df_cleaned=call_para_cleanup(df_utf)
	print(df_cleaned)
	df_with_bigrams = call_ngrams(df_cleaned, 2)
	df_with_trigrams = call_ngrams(df_with_bigrams, 3)
	df_with_4grams = call_ngrams(df_with_trigrams, 4)
	df_with_5grams = call_ngrams(df_with_4grams, 5)
	df_with_6grams = call_ngrams(df_with_5grams, 6)
	df_with_vocab_score = call_speech_vocab(df_with_6grams)

	df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score,100,'2grams')
	df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors,100,'3grams')
	df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors,100,'4grams')
	assembler = VectorAssembler(
	    inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
	    outputCol="features")
	assembler_output = assembler.transform(df_with_4grams_idf_vectors)
	output = assembler_output.selectExpr('speaker','speech_id','para_cleaned_text','features')
	print(output.show())
	print(output.count())

	output_tordd = output.rdd
	train_rdd,test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
	train_df = train_rdd.toDF()
	test_df = test_rdd.toDF()
	print(train_df)
	print(test_df)

	print('Train DF - Count: ')
	print(train_df.count())
	print('Test DF - Count: ')
	print(test_df.count())

	print("Initializing RF Model")
	labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)       
	rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features",numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
	pipeline = Pipeline(stages=[labelIndexer,rf])
	model = pipeline.fit(output)
	print("Completed RF Model")

	predictions = model.transform(test_df)
	evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	accuracy = evaluator.evaluate(predictions)
	print("Test Error = %g" % (1.0 - accuracy))
	rfModel = model.stages[1]
	print(rfModel)  # summary only
	print("Predictions: ")
	print(predictions.show())
Author: vikaasa, Project: Spark_Workshop, Lines: 54, Source: sparking_your_interest.py

Example 6: model

def model(classifier, ftrain, fvalid, fprediction):

    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    classifiers = {
        "RandomForestClassifier" : RFC
    }

    clf = classifiers[classifier]()

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")

    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)

    predictions = model.transform(valid)

    # write to file:

    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")

    output = (subsetValidData
               .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
                    .drop("index")
                    .drop("index"))

    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
       labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)

    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
Author: KiprasKancys, Project: DMWMAnalytics, Lines: 53, Source: pyspark_ml.py

Example 7: build_decision_tree

def build_decision_tree(sqlContext, features, interested):
	print '-----------------------------------------'
	data = sqlContext.createDataFrame(
			[Row(label=interested[i],features=Vectors.dense(features[i])) for i in xrange(len(features))])
	data.printSchema()
	data.show(5)
	print 'created data frame'

	# Index the label column & adding metadata.
	labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
	print 'created label indexer'

	# Mark the features with < 4 distinct values as categorical
	featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

	# Split the data into training and test sets
	(trainingData, testData) = data.randomSplit([0.8, 0.2])

	# Train a DecisionTree model
	dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

	# Chain the indexers together with DecisionTree
	pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

	# Train the model
	model = pipeline.fit(trainingData)

	# Make predictions
	predictions = model.transform(testData)

	predictions.select("prediction", "indexedLabel", "features").show(5)

	# Select (prediction, true label) & compute test error
	evaluator = MulticlassClassificationEvaluator(
			labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	precision = evaluator.evaluate(predictions)

	treeModel = model.stages[2]
	return (1 - precision, model)
Author: ngthanhtrung23, Project: Personalized-Events-Recommendation, Lines: 41, Source: decision_tree.py

Example 8: naiveBayeseian

def naiveBayeseian():

    def parseLine(line):
        keys  = [float(x) for x in line.split(",")]
        #return LabeledPoint(keys[0],keys[1:])
        return keys
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data= scdata1.map(parseLine)
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
Author: vikaspalkar, Project: Parallel-Music-Genre-Classification, Lines: 21, Source: SparkClassn.py

Example 9: price_predict

def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # handle open data
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
Author: WarnWang, Project: Dissertation, Lines: 23, Source: neutral_network_MLP.py

Example 10: print_evaluation_metrics

def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
    """
    Prints evaluation metrics.
    :param model: The fitted model used to generate predictions.
    :param test_df: DataFrame containing the test data.
    :param labelCol: Name of the label column.
    :param featuresCol: Name of the features column.
    :return: None; the evaluation metrics are printed.
    """
    predictions = model.transform(test_df)


    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, predictionCol="prediction",)
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    print "Accuracy:", accuracy
    print "f1:", f1
    print "Precision:", weighted_precision
    print "Recall:", weighted_recall
Author: rhasan, Project: machine-learning, Lines: 23, Source: Quora.py

Example 11: calculate_accuracy_metrics

def calculate_accuracy_metrics(predictions):

    """
    Calculates accuracy metrics for a Prediction DataFrame

    :param predictions: prediction DataFrame with "indexedLabel" and "prediction" columns
    :return: list of [accuracy, recall, positive count, negative count, false-positive count, false-negative count]
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)

    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)
    false_positive_cases = negative_cases.filter(positive_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)

    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
Author: USF-ML2, Project: SKYNET-, Lines: 24, Source: modeling_utils.py

Example 12: display

# Select results to view
display(predictions.select("label", "prediction", "probability"))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Model Evaluation
# MAGIC 
# MAGIC To evaluate our model, we will make use of the MulticlassClassificationEvaluator. Note that f1-score is the default metric for the MulticlassClassificationEvaluator.

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="precision")
accuracy = evaluator.evaluate(predictions)
print "Model Accuracy: ", accuracy

# COMMAND ----------

# MAGIC %md
# MAGIC The Evaluator is able to use a few metrics such as f1-score, precision, recall, weightedPrecision and weightedRecall.
# MAGIC 
# MAGIC evaluator.setMetricName("insert_metric_here") can be used to change the metric used to evaluate models (a short sketch follows after this example).

# COMMAND ----------

evaluator.explainParam("metricName")

# COMMAND ----------
Author: Inscrutive, Project: spark, Lines: 31, Source: NaiveBayes.py
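
Following up on the note in Example 12 about changing metrics, here is a small hedged sketch of the two usual ways to switch the metric on an evaluator. The DataFrame predictions is assumed to already exist (any DataFrame with "prediction" and "label" columns); the metric names are those supported by Spark 2.x, where "accuracy" replaces the older "precision".

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# `predictions` is assumed to be an existing DataFrame with "prediction" and "label" columns.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Option 1: change the metric on the evaluator itself.
evaluator.setMetricName("weightedRecall")
weighted_recall = evaluator.evaluate(predictions)

# Option 2: override the metric per call with a param map, leaving the default ("f1") intact.
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
print("f1 = %g, accuracy = %g, weightedRecall = %g" % (f1, accuracy, weighted_recall))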

Example 13: LogisticRegression

    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")

    # compute the classification error on test data.
    precision = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - precision))
    # $example off$

    spark.stop()
Author: 15652101501, Project: spark, Lines: 29, Source: one_vs_rest_example.py

Example 14: HashingTF

)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

print "Grid is build"

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

print "CV Estimator is defined"

cv_model = cv.fit(dfTrain)

print "Model is fitted"

df_test_pred = cv_model.transform(dfTest)
Author: pifouuu, Project: ProjetBigData, Lines: 31, Source: script1.py

Example 15: time

print "Done in {} second".format(round(tt,3))


# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on selected features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()

dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
Author: pifouuu, Project: ProjetBigData, Lines: 31, Source: script3_ter.py


Note: The pyspark.ml.evaluation.MulticlassClassificationEvaluator class examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective authors, and copyright of the source code remains with the original authors; please consult the corresponding project's license before distributing or reusing the code. Do not reproduce this article without permission.