This article collects typical usage examples of the pyspark.mllib.util.MLUtils class in Python. If you have been wondering what the MLUtils class is for, how to use it, or want to see it in real code, the curated class examples below may help.
The following shows 15 code examples of the MLUtils class, sorted by popularity by default.
Example 1: Random_Forest
def Random_Forest(filename, sc):
    filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"  # overrides the filename argument with a hardcoded path
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Tuple unpacking in lambdas is Python 2 only; index into the (label, prediction) pair instead
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save and load model
    #model.save(sc, "target/tmp/myRandomForestClassificationModel")
    #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
Example 2: test_append_bias_with_sp_vector
def test_append_bias_with_sp_vector(self):
    data = Vectors.sparse(3, {0: 2.0, 2: 2.0})
    expected = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
    # Returned value must be SparseVector
    ret = MLUtils.appendBias(data)
    self.assertEqual(ret, expected)
    self.assertEqual(type(ret), SparseVector)
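For comparison, a minimal sketch of the dense-vector case (not part of the original test; it assumes appendBias simply appends a 1.0 bias term to the input vector):
def test_append_bias_with_dense_vector(self):
    data = Vectors.dense([2.0, 0.0, 2.0])
    expected = Vectors.dense([2.0, 0.0, 2.0, 1.0])
    # appendBias should return the same values with a trailing bias of 1.0
    ret = MLUtils.appendBias(data)
    self.assertEqual(ret, expected)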
Example 3: split_data
def split_data():
    ret_obj = None  # ensure the name exists even if an exception is raised below
    try:
        #pat_proc = sc.textFile("hdfs://master:54310/bibudh/healthcare/data/cloudera_challenge/pat_proc_libsvm_format")
        #sqlContext.createDataFrame(pat_proc.map(lambda x: custom_encode(x)).take(10000)).foreach(check_for_ascending)
        #map(lambda w: check_for_ascending(w), pat_proc.map(lambda x: custom_encode(x)).take(10000000))
        #pat_proc = sqlContext.read.format("libsvm").load(home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*') #This gives a DataFrame
        # Naive Bayes expects data as an RDD of LabeledPoint
        pat_proc = MLUtils.loadLibSVMFile(sc, home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*').toDF()
        print("pat_proc.count() = " + str(pat_proc.count())) #150,127 rows, the two columns are ['label', 'features']
        anom = pat_proc.filter(pat_proc.label == 1) #This can be done since we have called toDF() on output of loadLibSVMFile()
        benign = pat_proc.filter(pat_proc.label == 0)
        n_benign = benign.count()
        #Take a random sample of 50K from benign
        frac = 50000.0/n_benign  # float division so the split fraction is not truncated to 0 under Python 2
        (into_model, for_finding_more) = benign.randomSplit([frac, 1 - frac])
        print("into_model.count() = " + str(into_model.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
        for_modeling = anom.unionAll(into_model)
        #for_modeling = for_modeling.rdd #LogisticRegressionWithSGD works on RDD of LabeledPoint objects
        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        ret_obj = {'train': train, 'test': test, 'for_finding_more': for_finding_more}
    except Exception:
        print("Exception in user code:")
        traceback.print_exc(file = sys.stdout)
    return ret_obj
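The comments above note that Naive Bayes expects an RDD of LabeledPoint; a rough sketch of how the returned splits might be consumed (the DataFrame-to-LabeledPoint conversion and the smoothing parameter are assumptions, not part of the original):
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint

splits = split_data()
# Convert DataFrame rows back to LabeledPoint before handing them to NaiveBayes
train_lp = splits['train'].rdd.map(lambda row: LabeledPoint(row.label, row.features))
nb_model = NaiveBayes.train(train_lp, 1.0)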
Example 4: LinearRegression
def LinearRegression(trainFile, testFile, taskid, sc):
    # filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
    # data = sc.textFile(filename)
    # parsedData = data.map(parsePoint)
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)
    # train the model
    model = LinearRegressionWithSGD.train(trainData)
    # Evaluate the model on test data
    # predictionAndLabels = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = predictionAndLabels.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / predictionAndLabels.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")
    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
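As a side note (not in the original example), the same error could also be obtained from pyspark.mllib.evaluation.RegressionMetrics, continuing from the predictionAndLabels RDD inside the function:
from pyspark.mllib.evaluation import RegressionMetrics

# RegressionMetrics expects (prediction, observation) pairs, so swap the (label, prediction) order
metrics = RegressionMetrics(predictionAndLabels.map(lambda vp: (vp[1], vp[0])))
print("MSE = %s, RMSE = %s" % (metrics.meanSquaredError, metrics.rootMeanSquaredError))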
Example 5: npmat_to_rdd_wreadwrite
import os
import sklearn.datasets  # needed for dump_svmlight_file below

def npmat_to_rdd_wreadwrite(sc, X, Y, f_name, delete_file=False):
    """
    Takes data prepared for a scikit-learn model: X as a numpy matrix and Y as a one-dimensional numpy array.
    Writes them to a file in libsvm format under the given filename f_name (optionally deleting the file afterwards),
    then reads the file directly into a Spark RDD for the given SparkContext sc.
    """
    sklearn.datasets.dump_svmlight_file(X, Y, f_name, zero_based=False)
    read_rdd = MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
        os.remove(f_name)
    return read_rdd
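A small usage sketch for the helper above (the file name and toy data are made up for illustration):
import numpy as np

X = np.array([[1.0, 0.0], [0.0, 2.0]])
Y = np.array([0, 1])
# Keep delete_file=False here: the returned RDD is lazy, so removing the file too early
# can make later actions fail when Spark re-reads the input
rdd = npmat_to_rdd_wreadwrite(sc, X, Y, "tmp_libsvm.txt", delete_file=False)
print(rdd.take(2))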
Example 6: Gradient_BoostedTrees
def Gradient_BoostedTrees(filename, sc):
    # Load and parse the data file.
    data = MLUtils.loadLibSVMFile(sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")  # hardcoded path; the filename argument is unused
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a GradientBoostedTrees model.
    # Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
    #        (b) Use more iterations in practice.
    model = GradientBoostedTrees.trainClassifier(trainingData,
                                                 categoricalFeaturesInfo={}, numIterations=3)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification GBT model:')
    print(model.toDebugString())
Example 7: test_load_vectors
def test_load_vectors(self):
    import shutil
    data = [
        [1.0, 2.0, 3.0],
        [1.0, 2.0, 3.0]
    ]
    temp_dir = tempfile.mkdtemp()
    load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
    try:
        self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
        ret_rdd = MLUtils.loadVectors(self.sc, load_vectors_path)
        ret = ret_rdd.collect()
        self.assertEqual(len(ret), 2)
        self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
        self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
    except:
        self.fail()
    finally:
        shutil.rmtree(load_vectors_path)
Example 8: main
def main(input_file):
    sc = pyspark.SparkContext(appName="DecisionTree")
    data = MLUtils.loadLabeledPoints(sc, input_file)
    trainingData, testData = data.randomSplit([0.70, 0.3])
    # Cache in memory for faster training
    trainingData.cache()
    model = DecisionTree.trainClassifier(trainingData, numClasses=4, impurity='gini',
                                         categoricalFeaturesInfo={}, maxDepth=16, maxBins=10)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    # print(model.toDebugString())
    print("")
    print("")
    print("Test Error: {}".format(round(testErr, 4)))
Example 9: SparkContext
# $example on$
import math
from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel
from pyspark.mllib.util import MLUtils
# $example off$
from pyspark import SparkContext  # needed for the SparkContext created below

if __name__ == "__main__":
    sc = SparkContext(appName="PythonIsotonicRegressionExample")
    # $example on$
    # Load and parse the data
    def parsePoint(labeledData):
        return (labeledData.label, labeledData.features[0], 1.0)

    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt")
    # Create label, feature, weight tuples from input data with weight set to default value 1.0.
    parsedData = data.map(parsePoint)
    # Split data into training (60%) and test (40%) sets.
    training, test = parsedData.randomSplit([0.6, 0.4], 11)
    # Create isotonic regression model from training data.
    # Isotonic parameter defaults to true so it is only shown for demonstration
    model = IsotonicRegression.train(training)
    # Create tuples of predicted and real labels.
    predictionAndLabel = test.map(lambda p: (model.predict(p[1]), p[0]))
    # Calculate mean squared error between predicted and real labels.
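The excerpt is cut off at this comment; a minimal sketch of the step it announces, following the usual MLlib isotonic-regression pattern (assumed, not shown above):
    # Mean squared error over the (prediction, label) tuples
    meanSquaredError = predictionAndLabel.map(lambda pl: math.pow(pl[0] - pl[1], 2)).mean()
    print("Mean Squared Error = " + str(meanSquaredError))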
Example 10: enumerate
d2.take(2)
# In[21]:
from pyspark.mllib.util import MLUtils
dataOutput = "libsvm_data.txt"
import os.path
import shutil
if os.path.exists(dataOutput):
    shutil.rmtree(dataOutput)  # os.rmdir(dataOutput)
print(dataOutput)
MLUtils.saveAsLibSVMFile(d2, "libsvm_data.txt")
# In[22]:
for i, x in enumerate(features):
    print(i, x)
# In[23]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
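The excerpt stops before the model is actually trained; a hedged guess at the next step in the standard MLlib pattern (numClasses and the tree parameters are assumptions, not taken from the original notebook):
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)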
Example 11: SparkContext
# Import DecisionTree / DecisionTreeModel
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.util import MLUtils
from pyspark import SparkContext

sc = SparkContext("local", "SVM")
# Loading and parsing data into RDD of LabeledPoint
# Sample data provided by Spark 1.3.1 folder
# To run locally
#data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
# To run on hadoop server
data = MLUtils.loadLibSVMFile(sc, 'jingrong/sample_libsvm_data.txt')
# Splits data - Approximately 70% training, 30% testing
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train the decision tree model
# Empty categoricalFeaturesInfo indicates that all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=5, maxBins=32)
# Evaluate the model on test instances, compute test error
allPredictions = model.predict(testData.map(lambda x: x.features))
predictionsAndLabels = testData.map(lambda pl: pl.label).zip(allPredictions)
testMeanSquaredError = predictionsAndLabels.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
# Printing results
print("Tested Mean Squared Error:", testMeanSquaredError)
Example 12: SparkContext
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# $example off$
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="MultiClassMetricsExample")
    # Several of the methods available in scala are currently missing from pyspark
    # $example on$
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")
    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()
    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)
    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # Overall statistics
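The excerpt ends at the "Overall statistics" comment; a hedged continuation showing summary statistics that MulticlassMetrics exposes (not part of the original snippet):
    print("Accuracy = %s" % metrics.accuracy)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())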
Example 13: __init__
def __init__(self, sc, path):
    # Load and parse the data file into an RDD of LabeledPoint.
    self.data = MLUtils.loadLibSVMFile(sc, path)
Example 14: sets
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
Example 15: SparkContext
from math import log, exp
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext()
sqlContext = SQLContext(sc)
data = MLUtils.loadLibSVMFile(sc, "hdfs:///hndata/docvecs")
# exp(label) - 1.0 undoes a log1p-style transform on the labels
data = data.map(lambda lp: LabeledPoint(exp(lp.label) - 1.0, lp.features))
# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])
# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                 numTrees=5, featureSubsetStrategy="auto",
                                 impurity='variance', maxDepth=4, maxBins=32)
predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest rr:')
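The snippet is truncated after this print; by analogy with Example 1, the line that presumably follows is:
print(rr.toDebugString())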