本文整理汇总了Python中pyspark.ml.evaluation.MulticlassClassificationEvaluator.evaluate方法的典型用法代码示例。如果您正苦于以下问题:Python MulticlassClassificationEvaluator.evaluate方法的具体用法?Python MulticlassClassificationEvaluator.evaluate怎么用?Python MulticlassClassificationEvaluator.evaluate使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.ml.evaluation.MulticlassClassificationEvaluator
的用法示例。
在下文中一共展示了MulticlassClassificationEvaluator.evaluate方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def main(sc, spark):
    """Train and evaluate a multinomial logistic-regression text classifier.

    :param sc: active SparkContext, used to load the corpus.
    :param spark: active SparkSession, used to load the corpus.
    """
    # Load and vectorize the corpus
    corpus = load_corpus(sc, spark)
    vector = make_vectorizer().fit(corpus)
    # Index the labels of the classification
    labelIndex = StringIndexer(inputCol="label", outputCol="indexedLabel")
    labelIndex = labelIndex.fit(corpus)
    # Split the data into training and test sets
    training, test = corpus.randomSplit([0.8, 0.2])
    # Create the classifier
    clf = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")
    # Fit the full vectorize -> index -> classify pipeline on the training split
    model = Pipeline(stages=[
        vector, labelIndex, clf
    ]).fit(training)
    # Make predictions
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    # FIX: stage 2 is the fitted LogisticRegressionModel -- the old name
    # `gbtModel` was misleading (there is no GBT in this pipeline).
    lrModel = model.stages[2]
    print(lrModel)  # summary only
示例2: testLogisticMLPipeline1
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def testLogisticMLPipeline1(self):
    """Fit a tokenizer / hashing-TF / logistic-regression pipeline on a tiny
    labeled text corpus and check that the held-out rows are classified
    perfectly (score == 1.0)."""
    training = sqlCtx.createDataFrame([
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0)
    ], ["text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    # BUG FIX: pyspark.ml's LogisticRegression constructor is keyword-only
    # and does not take a SQLContext -- `LogisticRegression(sqlCtx)` raises
    # a TypeError.  The estimator needs no context argument at all.
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    test = sqlCtx.createDataFrame([
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0)], ["text", "label"])
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)
    # FIX: `failUnless` is a long-deprecated unittest alias; assertEqual
    # also reports the actual score on failure.
    self.assertEqual(score, 1.0)
示例3: RunRandomForest
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def RunRandomForest(tf, ctx):
sqlContext = SQLContext(ctx)
rdd = tf.map(parseForRandomForest)
# The schema is encoded in a string.
schema = ['genre', 'track_id', 'features']
# Apply the schema to the RDD.
songDF = sqlContext.createDataFrame(rdd, schema)
# Register the DataFrame as a table.
songDF.registerTempTable("genclass")
labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)
trainingData, testData = songDF.randomSplit([0.8, 0.2])
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
#rfc = SVMModel([.5, 10, 20], 5)
#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")
pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.show()
evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
accuracy = evaluator.evaluate(predictions)
print 'Accuracy of RandomForest = ', accuracy * 100
print "Test Error = ", (1.0 - accuracy) * 100
示例4: textPredict
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def textPredict(request):
    """Text classification / popularity prediction for a single news title
    (Django view).

    Reads 'label' and 'title' from the POST body, trains a decision-tree
    pipeline over a parquet corpus of news titles, scores the submitted
    title with the fitted pipeline, and renders the result.

    :param request: Django HttpRequest with POST fields 'label' and 'title'.

    NOTE(review): metricName="precision" is the Spark 1.x name and was
    removed from MulticlassClassificationEvaluator in Spark 2.x -- confirm
    the targeted Spark version.
    NOTE(review): render() is called without a template name, which does not
    match Django's render(request, template_name, context) signature --
    verify against the urlconf/templates.
    """
    label = request.POST['label']
    title = request.POST['title']
    # A dedicated SparkContext is created per request and stopped below.
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    # Tokenize the corpus titles and build TF-IDF feature vectors.
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    # Train the decision-tree pipeline: label indexer + feature indexer + tree.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    # Score the held-out split with the fitted model.
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    # Push the single user-submitted (label, title) pair through the same
    # tokenizer / hashing-TF / fitted-IDF transformations, then the model.
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)
    """模型评估"""
    # Test error is computed on the held-out predictions, not the single item.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    treeModel = model.stages[2]
    print(treeModel)
    sc.stop()
    return render(request,{'resultList':resultList})
示例5: sparking_your_interest
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def sparking_your_interest():
    """Train a RandomForest speaker classifier over n-gram TF features and a
    vocabulary score computed from a speeches dataset."""
    # NOTE(review): SQLContext.read is accessed on the class, not an
    # instance -- an initialized SQLContext is required here; confirm how
    # the caller provides it.
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    print(df_fillna.printSchema())
    df_utf = call_utf_encoder(df)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)
    # Build 2- through 6-gram columns.
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    # BUG FIX: the 5-gram and 6-gram columns were both built with n=4
    # (copy-paste error); pass 5 and 6 to match the variable names.
    df_with_5grams = call_ngrams(df_with_4grams, 5)
    df_with_6grams = call_ngrams(df_with_5grams, 6)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)
    # TF feature vectors (top 100) for the 2-, 3- and 4-gram columns.
    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')
    # BUG FIX: "2gramsfeatures" was listed three times; assemble the 2-, 3-
    # and 4-gram feature columns instead (assumes tf_feature_vectorizer
    # names its output "<col>features" -- TODO confirm).
    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "3gramsfeatures", "4gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    print(output.show())
    print(output.count())
    # 80/20 train/test split with a fixed seed.
    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())
    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=1000, featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    # BUG FIX: the pipeline was fitted on `output` (the full dataset),
    # leaking the test rows into training; fit on the training split only.
    model = pipeline.fit(train_df)
    print("Completed RF Model")
    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    print(predictions.show())
示例6: model
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def model(classifier, ftrain, fvalid, fprediction):
    """Fit the named classifier on Spark, predict the validation set, and
    save predictions (dir 'output') and timing info (dir 'timing') as text.

    :param classifier: key into the local `classifiers` dict (currently only
        "RandomForestClassifier").
    :param ftrain: path of the training data file.
    :param fvalid: path of the validation data file.
    :param fprediction: not referenced in this body -- TODO confirm whether
        it should name the prediction output path instead of 'output'.
    """
    startTime = time.time()
    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')
    # load and prepare training and validation data
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)
    # is needed to join columns
    valid = indexData(valid)
    rawValid = indexData(rawValid)
    # Dispatch table mapping classifier names to estimator factories.
    classifiers = {
        "RandomForestClassifier" : RFC
    }
    clf = classifiers[classifier]()
    # NOTE(review): the indexers emit "indexed"/"indexedFeatures", but the
    # classifier and the evaluator below use the raw "label" column --
    # verify the indexed columns are actually consumed.
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
    # train and predict
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
    model = pipeline.fit(train)
    predictions = model.transform(valid)
    # write to file:
    subsetPrediction = predictions.select("prediction", "index")
    subsetValidData = rawValid.select("dataset", "index")
    # Join predictions back to the raw rows on "index", then drop both
    # duplicated "index" columns (one from each side of the join).
    output = (subsetValidData
        .join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
        .drop("index")
        .drop("index"))
    lines = output.map(toCSVLine)
    lines.saveAsTextFile('output')
    # NOTE(review): metricName "precision" is the Spark 1.x name (removed in
    # Spark 2.x); this code targets the old evaluator API.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print "Test Error = %g" % (1.0 - accuracy)
    # Persist wall-clock timing as "<classifier>,<seconds>".
    executionTime = time.time() - startTime
    row=classifier+','+str(executionTime)
    ctx.parallelize([row]).saveAsTextFile("timing")
示例7: price_predict
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train multilayer-perceptron classifiers on open/close price history
    windows and print each model's test precision.

    :param path: path of the price-history input file.
    :param windows: history window size in days.
    :param spark_contest: optional SparkContext; created when None.
    :param sql_context: optional SQLContext; created together with the context.
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    # BUG FIX: metricName was set to the PREDICTION column constant, which
    # is not a valid metric name and makes evaluate() raise.  "precision"
    # matches the printed label -- TODO confirm the intended metric.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")

    def _fit_and_report(max_iter, train_df, test_df):
        # One MLP per price series; layers/blockSize/seed mirror the
        # original configuration.
        trainer = MultilayerPerceptronClassifier(maxIter=max_iter, layers=[4, 5, 4, 3], blockSize=128,
                                                 featuresCol=FEATURES, labelCol=LABEL, seed=1234)
        result = trainer.fit(train_df).transform(test_df)
        prediction_labels = result.select(PREDICTION, LABEL)
        print("Precision:" + str(evaluator.evaluate(prediction_labels)))

    # handle open data
    # NOTE(review): maxIter=1 barely trains -- confirm this asymmetry with
    # the close-price model (maxIter=100) is intentional.
    _fit_and_report(1, open_train_df, open_test_df)
    # handle close data
    _fit_and_report(100, close_train_df, close_test_df)
示例8: print_evaluation_metrics
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
    """
    Prints evaluation metrics (accuracy, f1, weighted precision and recall)
    for a fitted model applied to the given test data.
    :param model: fitted pyspark.ml model; its transform() produces the
        "prediction" column.
    :param test_df: dataframe containing test data.
    :param labelCol: label column.
    :param featuresCol: features column (accepted but not referenced here).
    :return: None -- the four metrics are printed, not returned.
    """
    predictions = model.transform(test_df)
    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, predictionCol="prediction",)
    # Re-evaluate the same predictions under four metrics by overriding
    # metricName per call instead of building four evaluators.
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    print "Accuracy:", accuracy
    print "f1:", f1
    print "Precision:", weighted_precision
    print "Recall:", weighted_recall
示例9: calculate_accuracy_metrics
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def calculate_accuracy_metrics(predictions):
    """
    Calculates accuracy metrics for a prediction DataFrame.
    :param predictions: DataFrame with binary "indexedLabel" (1.0/0.0) and
        "prediction" columns.
    :return: [precision, recall, positive count, negative count,
              false-positive count, false-negative count]
    """
    # NOTE(review): "precision"/"recall" are Spark 1.x metric names that
    # were removed in Spark 2.x.
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)
    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)
    # BUG FIX: the false-positive filter referenced positive_cases' column
    # while filtering negative_cases -- a false positive is a NEGATIVE case
    # predicted as 1.0, so the condition must use negative_cases' column.
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)
    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
示例10: build_decision_tree
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def build_decision_tree(sqlContext, features, interested):
print '-----------------------------------------'
data = sqlContext.createDataFrame(
[Row(label=interested[i],features=Vectors.dense(features[i])) for i in xrange(len(features))])
data.printSchema()
data.show(5)
print 'created data frame'
# Index the label column & adding metadata.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
print 'created label indexer'
# Mark the features with < 4 distinct values as categorical
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.8, 0.2])
# Train a DecisionTree model
dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
# dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
# dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
# Chain the indexers together with DecisionTree
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
# Train the model
model = pipeline.fit(trainingData)
# Make predictions
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)
# Select (prediction, true label) & compute test error
evaluator = MulticlassClassificationEvaluator(
labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
precision = evaluator.evaluate(predictions)
treeModel = model.stages[2]
return (1 - precision, model)
示例11: naiveBayeseian
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
def naiveBayeseian():
    """Train a classifier on a two-class CSV feature set and print its test
    precision.

    NOTE(review): despite the name, this trains a
    MultilayerPerceptronClassifier, not a naive Bayes model.
    NOTE(review): pyspark.ml estimators' fit() expects a DataFrame, but
    `train` here is an RDD of float lists -- verify this runs as written.
    Relies on a module-level SparkContext `sc`.
    """
    def parseLine(line):
        # Parse one CSV line into floats; by the layers shape below, the
        # row presumably holds label + 30 features -- TODO confirm layout.
        keys = [float(x) for x in line.split(",")]
        #return LabeledPoint(keys[0],keys[1:])
        return keys
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data= scdata1.map(parseLine)
    # 80/20 split with a fixed seed for reproducibility.
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    # Network topology: 30 inputs, two hidden layers of 20, 2 output classes.
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    # NOTE(review): metricName "precision" was removed in Spark 2.x.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
示例12: generateROC
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
# NOTE(review): notebook-cell fragment -- ax0/ax1/axList, fig, display,
# generateROC, labelsAndScores(2) and irisTestPredictions(2) are defined in
# earlier cells not visible here.
ax0.set_title('First Model', color='#999999')
ax1.set_title('Second Model', color='#999999')
generateROC(axList[0], labelsAndScores)
generateROC(axList[1], labelsAndScores2)
display(fig)
# COMMAND ----------
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare both models under the Spark 1.x "precision" metric (equal to
# overall accuracy for multiclass problems; removed in Spark 2.x).
metric = 'precision'
multiclassEval = MulticlassClassificationEvaluator()
multiclassEval.setMetricName(metric)
print 'Model one {0}: {1:.3f}'.format(metric, multiclassEval.evaluate(irisTestPredictions))
print 'Model two {0}: {1:.3f}\n'.format(metric, multiclassEval.evaluate(irisTestPredictions2))
# COMMAND ----------
# Dump the evaluator's source for reference.
import inspect
print inspect.getsource(MulticlassClassificationEvaluator)
# COMMAND ----------
# MAGIC %md
# MAGIC #### Using MLlib instead of ML
# MAGIC
# MAGIC We've been using `ml` transformers, estimators, pipelines, and evaluators. How can we accomplish the same things with MLlib?
# COMMAND ----------
示例13: LogisticRegression
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
# $example on$
# Read the multiclass sample dataset in LIBSVM format.
dataset = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# Hold out 20% of the rows for testing.
trainData, testData = dataset.randomSplit([0.8, 0.2])

# A binary logistic-regression base learner, wrapped in One-vs-Rest so it
# can handle more than two classes.
base_lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
one_vs_rest = OneVsRest(classifier=base_lr)

# Fit on the training rows, then score the held-out rows.
fitted = one_vs_rest.fit(trainData)
scored = fitted.transform(testData)

# Report the classification error on the test data.
evaluator = MulticlassClassificationEvaluator(metricName="precision")
precision = evaluator.evaluate(scored)
print("Test Error : " + str(1 - precision))
# $example off$
spark.stop()
示例14: MultilayerPerceptronClassifier
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
# $example on$
# Read the multiclass sample dataset in LIBSVM format.
dataset = spark.read.format("libsvm")\
    .load("data/mllib/sample_multiclass_classification_data.txt")

# Hold out 40% of the rows for testing (fixed seed for reproducibility).
trainData, testData = dataset.randomSplit([0.6, 0.4], 1234)

# Network topology: 4 input features, two hidden layers of 5 and 4 units,
# and 3 output classes.
layer_sizes = [4, 5, 4, 3]

# Configure the multilayer-perceptron trainer.
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layer_sizes, blockSize=128, seed=1234)

# Fit on the training rows, then score the held-out rows.
fitted = trainer.fit(trainData)
scored = fitted.transform(testData)
predictionAndLabels = scored.select("prediction", "label")

# Report overall accuracy on the test set.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
spark.stop()
示例15: Pipeline
# 需要导入模块: from pyspark.ml.evaluation import MulticlassClassificationEvaluator [as 别名]
# 或者: from pyspark.ml.evaluation.MulticlassClassificationEvaluator import evaluate [as 别名]
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])
# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************
evaluator = MulticlassClassificationEvaluator(
predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
print "Grid is build"
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)
print "CV Estimator is defined"
cv_model = cv.fit(dfTrain)
print "Model is fitted"
df_test_pred = cv_model.transform(dfTest)
print "Labels are predicted"
print evaluator.evaluate(df_test_pred)