

Python MLUtils.loadLibSVMFile Method Code Examples

This article collects typical usage examples of the Python method pyspark.mllib.util.MLUtils.loadLibSVMFile. If you are wondering what MLUtils.loadLibSVMFile does, how to call it, or how it is used in practice, the curated code examples below should help. You can also explore the other methods of pyspark.mllib.util.MLUtils for further usage examples.


Fifteen code examples of MLUtils.loadLibSVMFile are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
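Before diving into the examples, here is a minimal, hypothetical sketch of what the method does: loadLibSVMFile parses a text file in LIBSVM format ("label index1:value1 index2:value2 ...", with 1-based indices) into an RDD of LabeledPoint objects whose features are stored as sparse vectors. The file name and the local SparkContext below are illustrative assumptions only:

from pyspark import SparkContext
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="LoadLibSVMFileSketch")  # local mode assumed

# Write a tiny LIBSVM-format file: "<label> <index>:<value> ...", 1-based indices.
with open("sample.txt", "w") as f:
    f.write("1 1:0.5 3:2.0\n0 2:1.5\n")

# Returns an RDD of LabeledPoint; feature indices become 0-based SparseVector
# positions, and the number of features is inferred from the data.
points = MLUtils.loadLibSVMFile(sc, "sample.txt")
print(points.collect())
# roughly: [LabeledPoint(1.0, (3,[0,2],[0.5,2.0])), LabeledPoint(0.0, (3,[1],[1.5]))]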

Example 1: Random_Forest

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
def Random_Forest(filename, sc):

	filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"  # note: overrides the filename argument
	# Load and parse the data file into an RDD of LabeledPoint.
	data = MLUtils.loadLibSVMFile(sc, filename)
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a RandomForest model.
	#  Empty categoricalFeaturesInfo indicates all features are continuous.
	#  Note: Use larger numTrees in practice.
	#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
	model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
	                                     numTrees=3, featureSubsetStrategy="auto",
	                                     impurity='gini', maxDepth=4, maxBins=32)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())  # tuple-unpacking lambdas are Python 2 only
	print('Test Error = ' + str(testErr))
	print('Learned classification forest model:')
	print(model.toDebugString())

	# Save and load model
	#model.save(sc, "target/tmp/myRandomForestClassificationModel")
	#sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
Developer: bangjieliu | Project: SparkService | Lines: 29 | Source file: random_forest.py

Example 2: split_data

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
def split_data():
  ret_obj = {}  # initialize so the final return works even if an exception is raised
  try:
    #pat_proc = sc.textFile("hdfs://master:54310/bibudh/healthcare/data/cloudera_challenge/pat_proc_libsvm_format") 
    #sqlContext.createDataFrame(pat_proc.map(lambda x: custom_encode(x)).take(10000)).foreach(check_for_ascending)
    #map(lambda w: check_for_ascending(w), pat_proc.map(lambda x: custom_encode(x)).take(10000000))
    #pat_proc = sqlContext.read.format("libsvm").load(home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*') #This gives a DataFrame
    pat_proc = MLUtils.loadLibSVMFile(sc, home_folder + '/healthcare/data/cloudera_challenge/pat_proc_libsvm_format/part-*').toDF() #Naive Bayes expects
    #data as an RDD of LabeledPoint
    print("pat_proc.count() = " + str(pat_proc.count())) #150,127 rows, the two columns are ['label', 'features']
    
    anom = pat_proc.filter(pat_proc.label == 1) #This can be done since we have called toDF() on output of loadLibSVMFile()
    benign = pat_proc.filter(pat_proc.label == 0)
    n_benign = benign.count()
    
    #Take a random sample of 50K from benign
    frac = 50000.0 / n_benign  # force float division; 50000/n_benign is 0 under Python 2 integer division
    (into_model, for_finding_more) = benign.randomSplit([frac, 1 - frac])
    print("into_model.count() = " + str(into_model.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
    
    for_modeling = anom.unionAll(into_model)
    #for_modeling = for_modeling.rdd #LogisticRegressionWithSGD works on RDD of LabeledPoint objects
    (train, test) = for_modeling.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    ret_obj = {'train': train, 'test': test, 'for_finding_more': for_finding_more}
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return ret_obj
Developer: bibudhlahiri | Project: healthcare | Lines: 31 | Source file: analyze_anomaly_full_data.py

Example 3: LinearRegression

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
def LinearRegression(trainFile, testFile, taskid,sc):
	# filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
	# data = sc.textFile(filename)
	# parsedData = data.map(parsePoint)

	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# train the model
	model = LinearRegressionWithSGD.train(trainData)

	# Evaluate the model on training data
	# predictionAndLabels = parsedData.map(lambda p: (p.label, model.predict(p.features)))
	predictionAndLabels = testData.map(lambda p: (p.label, model.predict(p.features)))
	MSE = predictionAndLabels.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / predictionAndLabels.count()
	print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

	# Save and load model
	#model.save(sc, "myModelPath")
	#sameModel = LinearRegressionModel.load(sc, "myModelPath")
Developer: honeycombcmu | Project: SparkService | Lines: 22 | Source file: linear_regression.py

Example 4: npmat_to_rdd_wreadwrite

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
def npmat_to_rdd_wreadwrite(sc,X,Y,f_name,delete_file=False):
    """
    Takes a data prepared for scikit model X in numpy matrix format, Y one-dimensional numpy array 
    and writes to file in libsvm format with filename string f_name provided (could delete automatically), 
    then reads from file directly into spark RDD object (for given Sparkcontext sc)

    """
    sklearn.datasets.dump_svmlight_file(X, Y, f_name, zero_based=False)
    read_rdd = MLUtils.loadLibSVMFile(sc, f_name)
    if delete_file:
      os.remove(f_name)
    return read_rdd
Developer: Kapetis | Project: binary_classifiers | Lines: 14 | Source file: binaryclassifier.py
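A hypothetical usage sketch for the helper above (the random data, file name, and SparkContext are illustrative assumptions; note the function body also requires import os and import sklearn.datasets alongside the MLUtils import):

import numpy as np
from pyspark import SparkContext

sc = SparkContext(appName="NumpyToRDDSketch")

X = np.random.rand(100, 5)             # 100 samples, 5 features
Y = np.random.randint(0, 2, size=100)  # binary labels in {0, 1}

# Round-trips the numpy data through a temporary libsvm file and back into an RDD.
rdd = npmat_to_rdd_wreadwrite(sc, X, Y, "tmp_libsvm.txt", delete_file=True)
print(rdd.take(2))  # RDD of LabeledPoint with SparseVector features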

Example 5: Gradient_BoostedTrees

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
def Gradient_BoostedTrees(filename, sc):
	# Load and parse the data file.
	data = MLUtils.loadLibSVMFile(sc, "/Users/Jacob/SparkService/data/sample_libsvm_data.txt")
	# Split the data into training and test sets (30% held out for testing)
	(trainingData, testData) = data.randomSplit([0.7, 0.3])

	# Train a GradientBoostedTrees model.
	#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
	#         (b) Use more iterations in practice.
	model = GradientBoostedTrees.trainClassifier(trainingData,
	                                             categoricalFeaturesInfo={}, numIterations=3)

	# Evaluate model on test instances and compute test error
	predictions = model.predict(testData.map(lambda x: x.features))
	labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
	testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
	print('Test Error = ' + str(testErr))
	print('Learned classification GBT model:')
	print(model.toDebugString())
Developer: bangjieliu | Project: SparkService | Lines: 21 | Source file: gradient_boostedtrees.py

Example 6: SparkContext

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
# $example on$
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# $example off$

from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="MultiClassMetricsExample")

    # Several of the methods available in Scala are currently missing from PySpark
    # $example on$
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")

    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
Developer: lhfei | Project: spark-in-action | Lines: 32 | Source file: multi_class_metrics_example.py
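The snippet above is clipped at the "Overall statistics" comment. Based on the pre-Spark-3 MulticlassMetrics API that this example targets (where the no-argument precision(), recall(), and fMeasure() methods still existed), the continuation presumably looks like this:

    # Overall statistics (plausible continuation; the original snippet is cut off above)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)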

Example 7: SparkContext

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from math import log, exp
from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext()
sqlContext = SQLContext(sc)


data = MLUtils.loadLibSVMFile(sc, "hdfs:///hndata/docvecs")
data = data.map(lambda lp: LabeledPoint(exp(lp.label)-1.0, lp.features))

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=5, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=4, maxBins=32)

predictions = rr.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest rr:')
Developer: yiransheng | Project: w251-project | Lines: 33 | Source file: random_forrest2.py
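The snippet is clipped right after announcing the learned model; the next line is presumably the standard debug dump, as in the other tree examples on this page:

print(rr.toDebugString())  # plausible continuation; the original snippet ends above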

Example 8: print

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
sys.path.append("/path/to/spark/python")

try:
    from pyspark import SparkContext, SparkConf
    from pyspark.mllib.tree import RandomForest, RandomForestModel
    from pyspark.mllib.util import MLUtils
    print ("Successfully imported Spark Modules")
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

if __name__ == "__main__":
    conf = SparkConf().setAppName("RandomForest_Iris")
    sc = SparkContext(conf = conf)
    print "Loading data..."
    data = MLUtils.loadLibSVMFile(sc, '../../data/iris/iris.scale')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    model = RandomForest.trainClassifier(trainingData, numClasses=4,
                                         categoricalFeaturesInfo={},
                                         numTrees=5, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    # Save model
    model.save(sc, "model")
Developer: MarioPerezEsteso | Project: PySpark-Projects | Lines: 33 | Source file: __init__.py

Example 9: SparkContext

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from __future__ import print_function

import sys

from pyspark import SparkContext
# $example on$
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestClassificationExample")
    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'deathproject/datanew2.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    #  Empty categoricalFeaturesInfo indicates all features are continuous.
    #  Note: Use larger numTrees in practice.
    #  Setting featureSubsetStrategy="auto" lets the algorithm choose.
    print('Starting...')
    model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={0: 4, 1: 19, 2: 9, 3: 2, 4: 6, 6: 8, 7: 4, 8: 3, 9: 16, 10: 8, 11: 11},
                                         numTrees=8, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=4, maxBins=32)
    #model = RandomForest.trainClassifier(trainingData, numClasses=8, categoricalFeaturesInfo={1: 4, 1},
                                         #numTrees=10, featureSubsetStrategy="auto",
                                         #impurity='gini', maxDepth=4, maxBins=32)
Developer: adarsh-murthy | Project: AdvancedBigDataProject | Lines: 31 | Source file: dp.py

Example 10: GBTRegressor

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)


if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)

    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()
Developer: bravekjh | Project: Spark | Lines: 32 | Source file: pyspark_gradient_boosted_trees.py
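Example 10 is clipped mid-script: the rf = GBTRegressor(...) block sits inside an enclosing function, presumably testRegression(train, test) since that is what gets called near the end, and both testClassification and the imports are omitted. A sketch of the likely scaffolding, inferred only from the calls actually made (the exact original layout is an assumption):

import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.util import MLUtils

def testRegression(train, test):
    # ... the GBTRegressor block shown above: fit the model, then print
    # RMSE, R2, and MAE via RegressionMetrics ...
    pass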

Example 11: logsreg

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
def logsreg(loadTrainingFilePath, sc):
	# Load training data in LIBSVM format
	loadTrainingFilePath = '/Users/Jacob/repository/SparkService/data/sample_libsvm_data.txt'
	data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath)
	
	
	# Split data into training (60%) and test (40%)
	traindata, testdata = data.randomSplit([0.6, 0.4], seed=11)  # the 11L long-literal syntax is Python 2 only
	traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	Json.generateJson("LogisticRegression", "12345678", traindata, predictionAndLabels)

	print('Completed.')
	# Instantiate metrics object
	# metrics = MulticlassMetrics(predictionAndLabels)

	# # Overall statistics
	# precision = metrics.precision()
	# recall = metrics.recall()
	# f1Score = metrics.fMeasure()
	# #confusion_matrix = metrics.confusionMatrix().toArray()

	# print("Summary Stats")
	# print("Precision = %s" % precision)
	# print("Recall = %s" % recall)
	# print("F1 Score = %s" % f1Score)


	# # Statistics by class
	# labels = traindata.map(lambda lp: lp.label).distinct().collect()
	# for label in sorted(labels):
	#     print("Class %s precision = %s" % (label, metrics.precision(label)))
	#     print("Class %s recall = %s" % (label, metrics.recall(label)))
	#     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# # Weighted stats
	# print("Weighted recall = %s" % metrics.weightedRecall)
	# print("Weighted precision = %s" % metrics.weightedPrecision)
	# print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	# print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	# print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	# #return model parameters
	# res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
	# 	   ('3','Yes','Precision', metrics.precision(0.0)),
	# 	   ('4','Yes','Recall', metrics.recall(0.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	#        ('3','Yes','Precision', metrics.precision(1.0)),
	# 	   ('4','Yes','Recall', metrics.recall(1.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	#        ('3','Yes','Precision', metrics.precision(2.0)),
	#        ('4','Yes','Recall', metrics.recall(2.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]	

	# #save output file path as JSON and dump into dumpFilePath
	# rdd = sc.parallelize(res)
	# SQLContext.createDataFrame(rdd).collect()
	# df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value'])

	#tempDumpFilePath = dumpFilePath + "/part-00000"
	#if os.path.exists(tempDumpFilePath):
	#	os.remove(tempDumpFilePath)

	#df.toJSON().saveAsTextFile(hdfsFilePath)
	#tmpHdfsFilePath = hdfsFilePath + "/part-00000"
	#subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

	# Save and load model
	#clusters.save(sc, "myModel")
	#sameModel = KMeansModel.load(sc, "myModel")
Developer: honeycombcmu | Project: SparkService | Lines: 86 | Source file: logs_reg.py

Example 12: SparkContext

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
# $example on$
import math
from pyspark.mllib.regression import LabeledPoint, IsotonicRegression, IsotonicRegressionModel
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":

    sc = SparkContext(appName="PythonIsotonicRegressionExample")

    # $example on$
    # Load and parse the data
    def parsePoint(labeledData):
        return (labeledData.label, labeledData.features[0], 1.0)

    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_isotonic_regression_libsvm_data.txt")

    # Create label, feature, weight tuples from input data with weight set to default value 1.0.
    parsedData = data.map(parsePoint)

    # Split data into training (60%) and test (40%) sets.
    training, test = parsedData.randomSplit([0.6, 0.4], 11)

    # Create isotonic regression model from training data.
    # Isotonic parameter defaults to true so it is only shown for demonstration
    model = IsotonicRegression.train(training)

    # Create tuples of predicted and real labels.
    predictionAndLabel = test.map(lambda p: (model.predict(p[1]), p[0]))

    # Calculate mean squared error between predicted and real labels.
Developer: 11wzy001 | Project: spark | Lines: 33 | Source file: isotonic_regression_example.py
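The snippet ends just before the computation its last comment announces. A plausible completion, mirroring the official Spark isotonic regression example that this code matches (math is imported at the top of the snippet):

    # Calculate mean squared error between predicted and real labels.
    meanSquaredError = predictionAndLabel.map(lambda pl: math.pow(pl[0] - pl[1], 2)).mean()
    print("Mean Squared Error = " + str(meanSquaredError))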

Example 13:

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
# MAGIC Upon completing this lab you should understand how to read from and write to files in Spark, convert between `RDDs` and `DataFrames`, and build a model using both the ML and MLlib APIs.

# COMMAND ----------

# MAGIC %md
# MAGIC #### Loading the data
# MAGIC  
# MAGIC First, we need to load data into Spark.  We'll use a built-in utility to load a [libSVM file](www.csie.ntu.edu.tw/~cjlin/libsvm/faq.html), which is stored in an S3 bucket on AWS.  We'll use `MLUtils.loadLibSVMFile` to load our file.  Here are the [Python](http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.util.MLUtils.loadLibSVMFile) and [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.util.MLUtils$) APIs.

# COMMAND ----------

from pyspark.mllib.util import MLUtils

baseDir = '/mnt/ml-amsterdam/'
irisPath = baseDir + 'iris.scale'
irisRDD = MLUtils.loadLibSVMFile(sc, irisPath, minPartitions=20).cache()

# We get back an RDD of LabeledPoints.  Note that the libSVM format uses SparseVectors.
irisRDD.take(5)

# COMMAND ----------

# MAGIC %md
# MAGIC What if we wanted to see the first few lines of the libSVM file to see what the format looks like?

# COMMAND ----------

sc.textFile(irisPath).take(5)

# COMMAND ----------
Developer: Inscrutive | Project: spark | Lines: 32 | Source file: B.py

Example 14: len

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint  # these imports were clipped from the snippet but are needed below
from pyspark.mllib.stat import Statistics
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print("Usage: correlations (<file>)", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'

    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print()
    print('Summary of data file: ' + filepath)
    print('%d data points' % points.count())

    # Statistics (correlations)
    print()
    print('Correlation (%s) between label and each feature' % corrType)
    print('Feature\tCorrelation')
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print('%d\t%g' % (i, corr))
Developer: lanimc | Project: BIGDATAProject | Lines: 34 | Source file: CorrelationTwo.py

Example 15: sets

# Required import: from pyspark.mllib.util import MLUtils [as alias]
# Or: from pyspark.mllib.util.MLUtils import loadLibSVMFile [as alias]
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
Developer: shashankadidamu | Project: OttoGroupClassification | Lines: 27 | Source file: decision_tree.py


Note: The pyspark.mllib.util.MLUtils.loadLibSVMFile examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors. Consult each project's License before using or redistributing the code, and do not reproduce this compilation without permission.