本文整理汇总了Python中pyspark.mllib.util.MLUtils.loadLabeledPoints方法的典型用法代码示例。如果您正苦于以下问题:Python MLUtils.loadLabeledPoints方法的具体用法?Python MLUtils.loadLabeledPoints怎么用?Python MLUtils.loadLabeledPoints使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.util.MLUtils
的用法示例。
在下文中一共展示了MLUtils.loadLabeledPoints方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import loadLabeledPoints [as 别名]
def main(input_file):
    """Train a DecisionTree classifier on labeled points and print its test error.

    Args:
        input_file: Path (local or distributed FS) to data readable by
            MLUtils.loadLabeledPoints, i.e. an RDD of LabeledPoint.
    """
    sc = pyspark.SparkContext(appName="DecisionTree")
    try:
        data = MLUtils.loadLabeledPoints(sc, input_file)
        # 70/30 train/test split.
        trainingData, testData = data.randomSplit([0.7, 0.3])
        # Cache in memory for faster training (tree induction makes many passes).
        trainingData.cache()
        model = DecisionTree.trainClassifier(trainingData, numClasses=4,
                                             impurity='gini',
                                             categoricalFeaturesInfo={},
                                             maxDepth=16, maxBins=10)
        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        # NOTE: `lambda (v, p)` tuple-parameter unpacking is Python 2 only
        # (removed by PEP 3113); index the pair instead.
        testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
                   / float(testData.count()))
        # print tree_model.toDebugString()
        print("")
        print("")
        print("Test Error: {}".format(round(testErr, 4)))
    finally:
        # Always release the Spark context, even if training fails.
        sc.stop()
示例2: float
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import loadLabeledPoints [as 别名]
"""Train a RandomForest regressor on labeled points and print its test MSE."""
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/AmountVectors2/')
# Split the data into training (70%) and test (30%) sets.
trainingData, testData = data.randomSplit([0.7, 0.3])
# Cache both: training is iterated many times, and the test set is
# traversed twice below (predict + count).
trainingData.cache()
testData.cache()
model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    numTrees=20, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=3, maxBins=32)
# Evaluate model on test instances and compute test error (mean squared error).
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# NOTE: `lambda (v, p)` tuple-parameter unpacking is Python 2 only
# (removed by PEP 3113); index the pair instead.
testMSE = (labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum()
           / float(testData.count()))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression forest model:')
print(model.toDebugString())
# Release the Spark context when done.
sc.stop()
示例3: float
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import loadLabeledPoints [as 别名]
"""Train a RandomForest classifier on labeled points and print its test error."""
import pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

sc = pyspark.SparkContext(appName="RandomForest")
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLabeledPoints(sc, 'gs://cs123data/Output/PartyVectors/')
# Split the data into training (70%) and test (30%) sets.
trainingData, testData = data.randomSplit([0.7, 0.3])
trainingData.cache()
# Cache the test set too: it is traversed twice below (predict + count).
testData.cache()
# The depth of the tree proved to be a significant bottleneck.
model = RandomForest.trainClassifier(trainingData, numClasses=4,
                                     categoricalFeaturesInfo={},
                                     numTrees=700, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=8, maxBins=12)
# Evaluate model on test instances and compute the misclassification rate.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# NOTE: `lambda (v, p)` tuple-parameter unpacking is Python 2 only
# (removed by PEP 3113); index the pair instead.
testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
           / float(testData.count()))
print("")
print("")
print('Test Error: ' + str(testErr))
# Release the Spark context when done.
sc.stop()