This article collects typical usage examples of the Python method pyspark.mllib.util.MLUtils.loadLibSVMFile. If you have been wondering what MLUtils.loadLibSVMFile does and how to use it, the curated code examples below may help. You can also read further about the containing class, pyspark.mllib.util.MLUtils.
The following presents 15 code examples of MLUtils.loadLibSVMFile, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
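Before diving into the examples, here is a minimal sketch of what loadLibSVMFile returns: an RDD of LabeledPoint whose features are stored as SparseVectors. The path below points at the sample file shipped with Spark distributions and is an assumption; substitute your own.

from pyspark import SparkContext
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="LoadLibSVMFileSketch")
# Assumed path: the sample data file bundled with a Spark distribution.
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
first = data.first()
print(first.label, first.features.size)  # a LabeledPoint: float label + SparseVector features
sc.stop()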
Example 1: Random_Forest
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

def Random_Forest(filename, sc):
    # Note: the filename argument is overridden by this hardcoded path.
    filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    # Evaluate the model on test instances and compute the test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save and load model
    #model.save(sc, "target/tmp/myRandomForestClassificationModel")
    #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
Example 2: split_data
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
import sys
import traceback

def split_data():
    # Assumes sc (SparkContext) and home_folder are defined in the enclosing script.
    try:
        #pat_proc = sc.textFile("hdfs://master:54310/bibudh/healthcare/data/cloudera_challenge/pat_proc_libsvm_format")
        #sqlContext.createDataFrame(pat_proc.map(lambda x: custom_encode(x)).take(10000)).foreach(check_for_ascending)
        #map(lambda w: check_for_ascending(w), pat_proc.map(lambda x: custom_encode(x)).take(10000000))
        #pat_proc = sqlContext.read.format("libsvm").load(home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*') #This gives a DataFrame
        pat_proc = MLUtils.loadLibSVMFile(sc, home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*').toDF()  # Naive Bayes expects
        # data as an RDD of LabeledPoint
        print("pat_proc.count() = " + str(pat_proc.count()))  # 150,127 rows; the two columns are ['label', 'features']
        anom = pat_proc.filter(pat_proc.label == 1)  # Possible because toDF() was called on the output of loadLibSVMFile()
        benign = pat_proc.filter(pat_proc.label == 0)
        n_benign = benign.count()
        # Take a random sample of 50K from benign
        frac = 50000.0 / n_benign  # float division, so the fraction is not truncated to 0 under Python 2
        (into_model, for_finding_more) = benign.randomSplit([frac, 1 - frac])
        print("into_model.count() = " + str(into_model.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
        for_modeling = anom.unionAll(into_model)
        #for_modeling = for_modeling.rdd #LogisticRegressionWithSGD works on RDD of LabeledPoint objects
        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        ret_obj = {'train': train, 'test': test, 'for_finding_more': for_finding_more}
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return ret_obj
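A hedged sketch of feeding the returned splits to a classifier: the comment in the function notes that NaiveBayes expects an RDD of LabeledPoint, so one option is mapping the DataFrame rows back. The names below mirror the dictionary keys returned by split_data; nothing here is from the original author.

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

splits = split_data()
# Rows carry ['label', 'features']; rebuild LabeledPoints for the RDD-based mllib API.
train_lp = splits['train'].rdd.map(lambda row: LabeledPoint(row.label, row.features))
nb_model = NaiveBayes.train(train_lp, lambda_=1.0)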
Example 3: LinearRegression
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark.mllib.regression import LinearRegressionWithSGD

def LinearRegression(trainFile, testFile, taskid, sc):
    # filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
    # data = sc.textFile(filename)
    # parsedData = data.map(parsePoint)
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)
    # Train the model.
    model = LinearRegressionWithSGD.train(trainData)
    # Evaluate the model on the test data.
    # predictionAndLabels = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = predictionAndLabels.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / predictionAndLabels.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")
    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
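The commented-out lines reference a parsePoint helper for the plain-text lpsa.data file. A plausible definition, assuming the comma-separated-label format used in Spark's own linear regression example:

from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    # lpsa.data lines look like "label,f1 f2 f3 ..."; normalize separators, then split.
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])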
Example 4: npmat_to_rdd_wreadwrite
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
import os
import sklearn.datasets

def npmat_to_rdd_wreadwrite(sc, X, Y, f_name, delete_file=False):
    """
    Takes data prepared for a scikit-learn model -- X as a numpy matrix and Y as a
    one-dimensional numpy array -- writes it in libsvm format to the file named by
    the string f_name (optionally deleting the file afterwards), then reads it back
    directly into a Spark RDD object for the given SparkContext sc.
    """
    sklearn.datasets.dump_svmlight_file(X, Y, f_name, zero_based=False)
    read_rdd = MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
        os.remove(f_name)
    return read_rdd
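A quick usage sketch with synthetic data; the shapes, the file name, and an already-created SparkContext sc are assumptions:

import numpy as np

X = np.random.rand(100, 5)                # 100 samples, 5 features
Y = np.random.randint(0, 2, 100)          # binary labels
rdd = npmat_to_rdd_wreadwrite(sc, X, Y, "tmp_libsvm.txt", delete_file=True)
print(rdd.first())                        # a LabeledPoint with SparseVector features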
Example 5: Gradient_BoostedTrees
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark.mllib.tree import GradientBoostedTrees

def Gradient_BoostedTrees(filename, sc):
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")
    # Split the data into training and test sets (30% held out for testing).
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={}, numIterations=3)
    # Evaluate the model on test instances and compute the test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())
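Unlike Example 1, this snippet omits persistence. A hedged sketch of the usual save/load pattern, assuming the model and sc from the function above; the path is illustrative:

from pyspark.mllib.tree import GradientBoostedTreesModel

model.save(sc, "target/tmp/myGBTClassificationModel")
sameModel = GradientBoostedTreesModel.load(sc, "target/tmp/myGBTClassificationModel")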
Example 6: SparkContext
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# $example off$
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="MultiClassMetricsExample")
    # Several of the methods available in Scala are currently missing from pyspark
    # $example on$
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()
    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)
    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # Overall statistics
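The snippet is cut off right after "# Overall statistics". A hedged continuation following the pattern of the official multiclass metrics example (these accessors exist on MulticlassMetrics, though some are deprecated in favor of accuracy in newer Spark releases):

    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)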
Example 7: SparkContext
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from math import log, exp
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
sc = SparkContext()
sqlContext = SQLContext(sc)
data = MLUtils.loadLibSVMFile(sc, "hdfs:///hndata/docvecs")
data = data.map(lambda lp: LabeledPoint(exp(lp.label)-1.0, lp.features))
# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
numTrees=5, featureSubsetStrategy="auto",
impurity='variance', maxDepth=4, maxBins=32)
predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest rr:')
Example 8: print
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
import sys
sys.path.append("/path/to/spark/python")
try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.tree import RandomForest, RandomForestModel
    from pyspark.mllib.util import MLUtils
    print("Successfully imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

if __name__ == "__main__":
    conf = SparkConf().setAppName("RandomForest_Iris")
    sc = SparkContext(conf=conf)
    print("Loading data...")
    data = MLUtils.loadLibSVMFile(sc, '../../data/iris/iris.scale')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    model = RandomForest.trainClassifier(trainingData, numClasses=4,
                                         categoricalFeaturesInfo={},
                                         numTrees=5, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    # Evaluate the model on test instances and compute the test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save model
    model.save(sc, "model")
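The saved model can be restored later with the RandomForestModel class already imported above; a hedged one-liner using the same path:

    sameModel = RandomForestModel.load(sc, "model")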
Example 9: SparkContext
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from __future__ import print_function
import sys
from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'deathproject/datanew2.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    print('Starting...')
    model = RandomForest.trainClassifier(trainingData, numClasses=8,
                                         categoricalFeaturesInfo={0: 4, 1: 19, 2: 9, 3: 2, 4: 6, 6: 8, 7: 4, 8: 3, 9: 16, 10: 8, 11: 11},
                                         numTrees=8, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    #model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={1: 4, 1},
    #                                     numTrees=10, featureSubsetStrategy="auto",
    #                                     impurity='gini', maxDepth=4, maxBins=32)
Example 10: GBTRegressor
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.util import MLUtils

def testRegression(train, test):
    # Evaluate a GBT regressor on the indexed-label data (called from the main block below).
    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")
    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))
    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)

if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)
    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()
    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()
Example 11: logsreg
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

def logsreg(loadTrainingFilePath, sc):
    # Load training data in LIBSVM format (note: the argument is overridden by this hardcoded path)
    loadTrainingFilePath = '/Users/Jacob/repository/SparkService/data/sample_libsvm_data.txt'
    data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath)
    # Split data into training (60%) and test (40%)
    traindata, testdata = data.randomSplit([0.6, 0.4], seed=11)
    traindata.cache()
    # Load testing data in LIBSVM format
    #testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)
    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)
    # Compute raw scores on the test set
    predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    Json.generateJson("LogisticRegression", "12345678", traindata, predictionAndLabels)
    print('Completed.')
    # Instantiate metrics object
    # metrics = MulticlassMetrics(predictionAndLabels)
    # # Overall statistics
    # precision = metrics.precision()
    # recall = metrics.recall()
    # f1Score = metrics.fMeasure()
    # #confusion_matrix = metrics.confusionMatrix().toArray()
    # print("Summary Stats")
    # print("Precision = %s" % precision)
    # print("Recall = %s" % recall)
    # print("F1 Score = %s" % f1Score)
    # # Statistics by class
    # labels = traindata.map(lambda lp: lp.label).distinct().collect()
    # for label in sorted(labels):
    #     print("Class %s precision = %s" % (label, metrics.precision(label)))
    #     print("Class %s recall = %s" % (label, metrics.recall(label)))
    #     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # # Weighted stats
    # print("Weighted recall = %s" % metrics.weightedRecall)
    # print("Weighted precision = %s" % metrics.weightedPrecision)
    # print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    # print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    # print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    # #return model parameters
    # res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
    #        ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
    #        ('3','Yes','Precision', metrics.precision(0.0)),
    #        ('4','Yes','Recall', metrics.recall(0.0)),
    #        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
    #        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
    #        ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
    #        ('3','Yes','Precision', metrics.precision(1.0)),
    #        ('4','Yes','Recall', metrics.recall(1.0)),
    #        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
    #        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
    #        ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
    #        ('3','Yes','Precision', metrics.precision(2.0)),
    #        ('4','Yes','Recall', metrics.recall(2.0)),
    #        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]
    # #save output file path as JSON and dump into dumpFilePath
    # rdd = sc.parallelize(res)
    # SQLContext.createDataFrame(rdd).collect()
    # df = SQLContext.createDataFrame(rdd, ['Order', 'CLass', 'Name', 'Value'])
    #tempDumpFilePath = dumpFilePath + "/part-00000"
    #if os.path.exists(tempDumpFilePath):
    #    os.remove(tempDumpFilePath)
    #df.toJSON().saveAsTextFile(hdfsFilePath)
    #tmpHdfsFilePath = hdfsFilePath + "/part-00000"
    #subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])
    # Save and load model
    #clusters.save(sc, "myModel")
    #sameModel = KMeansModel.load(sc, "myModel")
Example 12: SparkContext
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark import SparkContext
# $example on$
import math
from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonIsotonicRegressionExample")
    # $example on$
    # Load and parse the data
    def parsePoint(labeledData):
        return (labeledData.label, labeledData.features[0], 1.0)

    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt")
    # Create (label, feature, weight) tuples from the input data, with the weight set to the default value 1.0.
    parsedData = data.map(parsePoint)
    # Split data into training (60%) and test (40%) sets.
    training, test = parsedData.randomSplit([0.6, 0.4], 11)
    # Create an isotonic regression model from the training data.
    # The isotonic parameter defaults to true, so it is mentioned here only for demonstration.
    model = IsotonicRegression.train(training)
    # Create tuples of predicted and real labels.
    predictionAndLabel = test.map(lambda p: (model.predict(p[1]), p[0]))
    # Calculate the mean squared error between predicted and real labels.
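The snippet stops at the MSE comment; a hedged completion consistent with Spark's isotonic regression example (math was imported above):

    meanSquaredError = predictionAndLabel.map(lambda pl: math.pow(pl[0] - pl[1], 2)).mean()
    print("Mean Squared Error = " + str(meanSquaredError))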
Example 13:
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
# MAGIC Upon completing this lab you should understand how to read from and write to files in Spark, convert between `RDDs` and `DataFrames`, and build a model using both the ML and MLlib APIs.
# COMMAND ----------
# MAGIC %md
# MAGIC #### Loading the data
# MAGIC
# MAGIC First, we need to load data into Spark. We'll use a built-in utility to load a [libSVM file](www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html), which is stored in an S3 bucket on AWS. We'll use `MLUtils.loadLibSVMFile` to load our file. Here are the [Python](http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils.loadLibSVMFile) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) APIs.
# COMMAND ----------
from pyspark.mllib.util import MLUtils
baseDir = '/mnt/ml-amsterdam/'
irisPath = baseDir + 'iris.scale'
irisRDD = MLUtils.loadLibSVMFile(sc, irisPath, minPartitions=20).cache()
# We get back an RDD of LabeledPoints. Note that the libSVM format uses SparseVectors.
irisRDD.take(5)
# COMMAND ----------
# MAGIC %md
# MAGIC What if we wanted to see the first few lines of the libSVM file to see what the format looks like?
# COMMAND ----------
sc.textFile(irisPath).take(5)
# COMMAND ----------
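The lab above also promises conversion between RDDs and DataFrames; a hedged sketch of one direction, assuming the notebook has a SQLContext/SparkSession wired up so that toDF() works on an RDD of LabeledPoint:

irisDF = irisRDD.toDF()   # infers the ['label', 'features'] schema from LabeledPoint
irisDF.show(5)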
Example 14: len
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'
    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())
    # Statistics (correlations)
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))
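Statistics.corr can also produce the full feature-by-feature correlation matrix in one call when handed an RDD of Vectors; a hedged addition to the loop above:

    featureMatrixCorr = Statistics.corr(points.map(lambda lp: lp.features), method=corrType)
    print('Feature correlation matrix:')
    print(featureMatrixCorr)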
Example 15: sets
# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
impurity='entropy', maxDepth=5, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
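To round out the API: MLUtils can also write an RDD of LabeledPoint back out in LIBSVM format, the natural inverse of loadLibSVMFile. A hedged sketch reusing the data and sc from the example above; the output directory is illustrative:

MLUtils.saveAsLibSVMFile(data, "myLibSVMOutput")
reloaded = MLUtils.loadLibSVMFile(sc, "myLibSVMOutput")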