本文整理汇总了 Python 中 pyspark.ml.evaluation.MulticlassClassificationEvaluator 类的典型用法代码示例。如果您正苦于以下问题：Python MulticlassClassificationEvaluator 类的具体用法？Python MulticlassClassificationEvaluator 怎么用？Python MulticlassClassificationEvaluator 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了 MulticlassClassificationEvaluator 类的 15 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: RunRandomForest
def RunRandomForest(tf, ctx):
sqlContext = SQLContext(ctx)
rdd = tf.map(parseForRandomForest)
# The schema is encoded in a string.
schema = ['genre', 'track_id', 'features']
# Apply the schema to the RDD.
songDF = sqlContext.createDataFrame(rdd, schema)
# Register the DataFrame as a table.
songDF.registerTempTable("genclass")
labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)
trainingData, testData = songDF.randomSplit([0.8, 0.2])
labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)
rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
#rfc = SVMModel([.5, 10, 20], 5)
#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")
pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.show()
evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
accuracy = evaluator.evaluate(predictions)
print 'Accuracy of RandomForest = ', accuracy * 100
print "Test Error = ", (1.0 - accuracy) * 100
示例2: testLogisticMLPipeline1
def testLogisticMLPipeline1(self):
    """Check that a tokenizer / hashing-TF / logistic-regression pipeline scores 1.0.

    The pipeline is fitted on twelve labelled sentences, applied to four test
    sentences, and the default metric of ``MulticlassClassificationEvaluator``
    on the (prediction, label) pairs must equal exactly 1.0.
    """
    training = sqlCtx.createDataFrame([
        ("a b c d e spark", 1.0),
        ("b d", 2.0),
        ("spark f g h", 1.0),
        ("hadoop mapreduce", 2.0),
        ("b spark who", 1.0),
        ("g d a y", 2.0),
        ("spark fly", 1.0),
        ("was mapreduce", 2.0),
        ("e spark program", 1.0),
        ("a e c l", 2.0),
        ("spark compile", 1.0),
        ("hadoop software", 2.0)
    ], ["text", "label"])
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
    # NOTE(review): the classifier is constructed with the SQL context as its
    # only argument, so this is presumably not pyspark.ml's LogisticRegression
    # - confirm which class is imported.
    lr = LogisticRegression(sqlCtx)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    model = pipeline.fit(training)
    test = sqlCtx.createDataFrame([
        ("spark i j k", 1.0),
        ("l m n", 2.0),
        ("mapreduce spark", 1.0),
        ("apache hadoop", 2.0)], ["text", "label"])
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator()
    score = evaluator.evaluate(predictionAndLabels)
    # assertEqual replaces the deprecated failUnless alias (removed in
    # Python 3.12) and reports the actual score when the check fails.
    self.assertEqual(score, 1.0)
示例3: main
def main(sc, spark):
    """Train a multinomial logistic regression on the corpus and report test error.

    The corpus comes from ``load_corpus(sc, spark)``.  The vectorizer and the
    label indexer are both fitted on the full corpus, the classifier pipeline
    is fitted on an 80% split, and the error (1 - accuracy) on the remaining
    20% is printed, followed by the fitted classifier stage.
    """
    corpus = load_corpus(sc, spark)

    # Both feature and label transformers are fitted before the split.
    vectorizer_model = make_vectorizer().fit(corpus)
    label_indexer_model = StringIndexer(
        inputCol="label", outputCol="indexedLabel"
    ).fit(corpus)

    train_set, test_set = corpus.randomSplit([0.8, 0.2])

    classifier = LogisticRegression(
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
        family="multinomial",
        labelCol="indexedLabel",
        featuresCol="tfidf",
    )

    stages = [vectorizer_model, label_indexer_model, classifier]
    fitted = Pipeline(stages=stages).fit(train_set)

    scored = fitted.transform(test_set)
    scored.select("prediction", "indexedLabel", "tfidf").show(5)

    scorer = MulticlassClassificationEvaluator(
        labelCol="indexedLabel",
        predictionCol="prediction",
        metricName="accuracy",
    )
    accuracy = scorer.evaluate(scored)
    print("Test Error = %g" % (1.0 - accuracy))

    # Third pipeline stage, i.e. the fitted classifier; printed as a summary only.
    print(fitted.stages[2])
示例4: textPredict
def textPredict(request):
    """6. Text clustering and popularity prediction.

    Reads ``label`` and ``title`` from ``request.POST``, trains a
    decision-tree pipeline on TF-IDF features of the titles stored in
    ``data/roll_news_sina_com_cn.parquet``, predicts for the submitted title,
    prints the error on the held-out split and renders the prediction rows
    produced by ``convertDfToList``.

    The SparkContext is stopped in a ``finally`` clause so it is released even
    when loading, training or prediction raises.

    :param request: request object exposing ``POST['label']`` and ``POST['title']``.
    :return: whatever ``render`` returns for the context ``{'resultList': ...}``.
    :raises KeyError: if ``label`` or ``title`` is missing from ``request.POST``
        (assuming ``POST`` behaves like a mapping).
    """
    label = request.POST['label']
    title = request.POST['title']
    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    try:
        sqlContext = SQLContext(sc)

        # Process the data set and build the feature vectors.
        dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
        print(dfTitles.dtypes)
        tokenizer = Tokenizer(inputCol="title", outputCol="words")
        wordsData = tokenizer.transform(dfTitles)
        hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(wordsData)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        for features_label in rescaledData.select("features", "rawFeatures").take(3):
            print(features_label)

        # Train the decision-tree model.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
        (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        model = pipeline.fit(trainingData)

        # Test the model on the held-out split.
        predictions = model.transform(testData)
        predictions.show()
        predictions.select("prediction", "indexedLabel", "features").show(5)

        # Predict for the single news item submitted by the user.  The same
        # tokenizer and hashing transformer are reused so the user row goes
        # through exactly the feature pipeline the model was trained with.
        sentenceData = sqlContext.createDataFrame([
            (label, title),
        ], ['label', "title"])
        userWords = tokenizer.transform(sentenceData)
        userFeaturized = hashingTF.transform(userWords)
        userRescaled = idfModel.transform(userFeaturized)
        myprediction = model.transform(userRescaled)
        print("==================================================")
        myprediction.show()
        # Must run before the context is stopped: it reads the DataFrame.
        resultList = convertDfToList(myprediction)

        # Evaluate the model.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g " % (1.0 - accuracy))
        treeModel = model.stages[2]
        print(treeModel)
    finally:
        sc.stop()
    # NOTE(review): render() receives the context dict as its second argument;
    # if this is django.shortcuts.render, a template name is expected there -
    # confirm and add the template.
    return render(request, {'resultList': resultList})
示例5: sparking_your_interest
def sparking_your_interest():
    """Train and evaluate a random-forest speaker classifier on speech paragraphs.

    Loads ``speeches_dataset.json``, fills nulls with empty strings, runs the
    project's cleanup / n-gram / vocabulary / TF feature helpers, assembles a
    ``features`` vector, splits the rows 80/20 (seed 123), fits a
    StringIndexer + RandomForest pipeline on the training split and prints
    the error on the test split along with the model and its predictions.

    Changes compared with the earlier version of this function:
    * the pipeline is fitted on ``train_df`` only; it used to be fitted on the
      full data set, so the test rows leaked into training;
    * the null-filled frame is the one passed on to the feature helpers; the
      ``fillna`` result used to be discarded;
    * ``printSchema()`` / ``show()`` are called directly instead of being
      wrapped in ``print`` (they print themselves, the wrapper printed ``None``).
    """
    # NOTE(review): ``read`` is accessed on the name ``SQLContext`` itself;
    # this presumably relies on that name being bound to a context instance
    # elsewhere - confirm.
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    df_fillna.printSchema()

    df_utf = call_utf_encoder(df_fillna)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)

    # NOTE(review): the "5grams" and "6grams" steps both pass n=4, and the
    # assembler below lists "2gramsfeatures" three times while the 3- and
    # 4-gram vectors are never referenced - looks like copy-paste; confirm
    # the intended sizes and columns before changing them.
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    df_with_5grams = call_ngrams(df_with_4grams, 4)
    df_with_6grams = call_ngrams(df_with_5grams, 4)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "2gramsfeatures", "2gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    output.show()
    print(output.count())

    # Split at the RDD level with a fixed seed, then go back to DataFrames.
    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)
    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(
        labelCol="indexedLabel", featuresCol="features", numTrees=1000,
        featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    # Fit on the training split only so the test split stays unseen.
    model = pipeline.fit(train_df)
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    predictions.show()
示例6: model
def model(classifier, ftrain, fvalid, fprediction):
startTime = time.time()
ctx = SparkContext(appName="model_on_Spark")
sqlContext = SQLContext(ctx)
logger = SparkLogger(ctx)
logger.set_level('ERROR')
# load and prepare training and validation data
rawTrain, train = prepData(sqlContext, ctx, ftrain)
rawValid, valid = prepData(sqlContext, ctx, fvalid)
# is needed to join columns
valid = indexData(valid)
rawValid = indexData(rawValid)
classifiers = {
"RandomForestClassifier" : RFC
}
clf = classifiers[classifier]()
labelIndexer = StringIndexer(inputCol="label", outputCol="indexed")
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
# train and predict
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, clf])
model = pipeline.fit(train)
predictions = model.transform(valid)
# write to file:
subsetPrediction = predictions.select("prediction", "index")
subsetValidData = rawValid.select("dataset", "index")
output = (subsetValidData
.join(subsetPrediction, subsetPrediction.index == subsetValidData.index)
.drop("index")
.drop("index"))
lines = output.map(toCSVLine)
lines.saveAsTextFile('output')
evaluator = MulticlassClassificationEvaluator(
labelCol="label", predictionCol="prediction", metricName="precision")
accuracy = evaluator.evaluate(predictions)
print "Test Error = %g" % (1.0 - accuracy)
executionTime = time.time() - startTime
row=classifier+','+str(executionTime)
ctx.parallelize([row]).saveAsTextFile("timing")
示例7: build_decision_tree
def build_decision_tree(sqlContext, features, interested):
print '-----------------------------------------'
data = sqlContext.createDataFrame(
[Row(label=interested[i],features=Vectors.dense(features[i])) for i in xrange(len(features))])
data.printSchema()
data.show(5)
print 'created data frame'
# Index the label column & adding metadata.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
print 'created label indexer'
# Mark the features with < 4 distinct values as categorical
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.8, 0.2])
# Train a DecisionTree model
dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
# dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
# dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)
# Chain the indexers together with DecisionTree
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
# Train the model
model = pipeline.fit(trainingData)
# Make predictions
predictions = model.transform(testData)
predictions.select("prediction", "indexedLabel", "features").show(5)
# Select (prediction, true label) & compute test error
evaluator = MulticlassClassificationEvaluator(
labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
precision = evaluator.evaluate(predictions)
treeModel = model.stages[2]
return (1 - precision, model)
示例8: naiveBayeseian
def naiveBayeseian():
    """Train a multilayer perceptron on a CSV file and print its precision.

    Despite the function name, the model that is trained is a
    ``MultilayerPerceptronClassifier`` with layers [30, 20, 20, 2].  The file
    ``/home/ubantu/TwoClassfeatureSet.csv`` is read through the module-level
    ``sc`` and split 80/20 with seed 1234.
    """
    def parseLine(line):
        # One float per comma-separated field.
        fields = line.split(",")
        return [float(field) for field in fields]

    raw_lines = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    parsed = raw_lines.map(parseLine)
    train, test = parsed.randomSplit([0.8, 0.2], 1234)

    # NOTE(review): ``train`` and ``test`` are RDDs of plain float lists here,
    # while the code below selects "prediction" and "label" columns from the
    # result - confirm whether a conversion to a labelled DataFrame is missing.
    layer_sizes = [30, 20, 20, 2]
    perceptron = MultilayerPerceptronClassifier(
        maxIter=100, layers=layer_sizes, blockSize=128, seed=1234)
    fitted = perceptron.fit(train)

    # Compute precision on the test set.
    scored = fitted.transform(test)
    prediction_and_label = scored.select("prediction", "label")
    scorer = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(scorer.evaluate(prediction_and_label)))
示例9: price_predict
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train perceptron classifiers on open and close price data and print precision.

    :param path: data location handed to ``DataParser``.
    :param windows: window size handed to ``DataParser``.
    :param spark_contest: Spark context; when ``None`` both contexts are
        obtained from ``load_spark_context()``.
    :param sql_context: SQL context paired with ``spark_contest``.

    For the open data and then the close data a
    ``MultilayerPerceptronClassifier`` is fitted on the training frame and the
    weighted precision on the test frame is printed.
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)

    # PREDICTION and LABEL are column names (they are used in select() below),
    # so they belong in predictionCol / labelCol.  Passing PREDICTION as
    # metricName was a mix-up; "weightedPrecision" is a metric name the
    # evaluator supports and matches the "Precision:" output.
    evaluator = MulticlassClassificationEvaluator(
        predictionCol=PREDICTION, labelCol=LABEL, metricName="weightedPrecision")

    # handle open data
    # NOTE(review): maxIter is 1 here but 100 for the close data - confirm
    # that the difference is intended.
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
示例10: print_evaluation_metrics
def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
"""
Prints evaluation metrics.
:param model: Used model.
:param test_df: dataframe containing test data.
:param labelCol: label column.
:param featuresCol: features column.
:return: A DataFrame.
"""
predictions = model.transform(test_df)
# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
labelCol=labelCol, predictionCol="prediction",)
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
print "Accuracy:", accuracy
print "f1:", f1
print "Precision:", weighted_precision
print "Recall:", weighted_recall
示例11: calculate_accuracy_metrics
def calculate_accuracy_metrics(predictions):
    """
    Calculates accuracy metrics for a prediction DataFrame.

    :param predictions: DataFrame with ``indexedLabel`` and ``prediction``
        columns; labels 1.0 and 0.0 are treated as positive and negative.
    :return: list of ``[precision, recall, positive count, negative count,
        false-positive count, false-negative count]``; precision and recall
        are rounded to two decimals.
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)

    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)

    # Each filter refers to a column of the frame it is applied to; the
    # false-positive filter previously referenced positive_cases by mistake.
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)

    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
示例12: display
# Select results to view
# Shows the label, prediction and probability columns of `predictions`.
display(predictions.select("label", "prediction", "probability"))
# COMMAND ----------
# MAGIC %md
# MAGIC #### Model Evaluation
# MAGIC
# MAGIC To evaluate our model, we will be making use of the Evaluator in MulticlassClassification. Note that f1-score is the default metric for the MulticlassClassificationEvaluator.
# COMMAND ----------
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare the "label" column with the "prediction" column using the
# "precision" metric (overrides the default metric mentioned above).
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
metricName="precision")
# The score is stored as `accuracy` and printed under the heading "Model Accuracy".
accuracy = evaluator.evaluate(predictions)
print "Model Accuracy: ", accuracy
# COMMAND ----------
# MAGIC %md
# MAGIC The Evaluator is able to use a few metrics such as f1-score, precision, recall, weightedPrecision and weightedRecall.
# MAGIC
# MAGIC evaluator.setMetricName("insert_metric_here") can be used to change the metric used to evaluate models.
# COMMAND ----------
# Ask the evaluator to describe its metricName parameter; as the last
# expression of the cell its result is what the notebook displays.
evaluator.explainParam("metricName")
# COMMAND ----------
示例13: LogisticRegression
# $example on$
# One-vs-rest multiclass classification with logistic regression as the base
# classifier, scored by accuracy on a random 20% hold-out.

# load data file.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train)

# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
# This script uses the SparkSession API (`spark.read`), i.e. Spark 2.0 or
# later, where the evaluator no longer accepts "precision"; "accuracy" is the
# supported metric for the overall fraction of correct predictions.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))
# $example off$

spark.stop()
示例14: HashingTF
)
# Feature stages: term frequencies of the tokenizer output, then IDF weighting.
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
# Label stage: "label" is indexed into "target_indexed".
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
# Decision tree trained on the TF-IDF column against the indexed label.
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])
# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************
evaluator = MulticlassClassificationEvaluator(
predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
# Parameter grid: the metric name is held fixed at "precision" while the tree
# depth is tried at 10 and 20.
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
print "Grid is build"
# Cross-validate the whole pipeline over the grid with the evaluator above.
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)
print "CV Estimator is defined"
cv_model = cv.fit(dfTrain)
print "Model is fitted"
# Apply the fitted cross-validation model to the test frame.
df_test_pred = cv_model.transform(dfTest)
示例15: time
# Report the elapsed time `tt` (set before this line), rounded to 3 decimals.
print "Done in {} second".format(round(tt,3))
# In[18]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
print "Fitting the classifier on selected features"
t0 = time()
# "label" is indexed into "target_indexed", which the classifier and the
# evaluator both use as their label column.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
# Fit the indexer on the training frame, cache the indexed frame, then fit
# the logistic regression on it.
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)
tt = time() - t0
print "Done in {} second".format(round(tt,3))
# In[19]:
print "Testing precision of the model"
t0 = time()
# Map the validation data through vectorizeBi with the broadcast dictionary
# value and rebuild a cached DataFrame with the columns the classifier expects.
dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()