本文整理汇总了Python中pyspark.mllib.util.MLUtils.saveAsLibSVMFile方法的典型用法代码示例。如果您正苦于以下问题:Python MLUtils.saveAsLibSVMFile方法的具体用法?Python MLUtils.saveAsLibSVMFile怎么用?Python MLUtils.saveAsLibSVMFile使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.util.MLUtils
的用法示例。
在下文中一共展示了MLUtils.saveAsLibSVMFile方法的4个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: enumerate
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import saveAsLibSVMFile [as 别名]
# Example 1: dump an RDD of LabeledPoints (d2, built earlier) to LibSVM
# format, then split it for training.
d2.take(2)  # peek at two records to sanity-check the RDD before writing

# In[21]:
from pyspark.mllib.util import MLUtils
import os.path
import shutil

dataOutput = "libsvm_data.txt"

# Spark refuses to write into an existing path, so remove any previous
# run's output first. saveAsLibSVMFile writes a *directory* of part
# files, hence rmtree rather than os.remove.
if os.path.exists(dataOutput):
    shutil.rmtree(dataOutput)
print(dataOutput)  # BUG FIX: Python-2 print statement -> print() function

# BUG FIX: reuse the dataOutput variable instead of repeating the
# literal path — the two could silently drift apart.
MLUtils.saveAsLibSVMFile(d2, dataOutput)

# In[22]:
# features is defined earlier in the notebook; list them with their indices.
for i, x in enumerate(features):
    print(i, x)

# In[23]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
示例2: LabeledPoint
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import saveAsLibSVMFile [as 别名]
# Example 2: build TF-IDF features from labeled text, save them as a
# LibSVM file, reload, and train a RandomForest classifier.
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF, IDF
# BUG FIX: RandomForest was used below but never imported.
from pyspark.mllib.tree import RandomForest
from pyspark import SparkContext

sc = SparkContext("local", "dd")

# BUG FIX: close the input file (the original leaked the handle).
# Each line is "text,label"; split once into [text, label] pairs.
with open("/home/madhura/ML_Spring16/MLProject/data/OriginalTraining.txt") as fh:
    lines = fh.read().splitlines()
train = sc.parallelize(lines).map(lambda x: x.split(","))

# Index instead of tuple-unpacking lambdas: lambda(a, b) is Python-2
# only (removed by PEP 3113).
trainlabels = train.map(lambda pair: int(pair[1]))
traintf = HashingTF().transform(train.map(lambda pair: pair[0].split()))
trainidf = IDF().fit(traintf)
traintfidf = trainidf.transform(traintf)

# Pair each label with its TF-IDF vector as a LabeledPoint.
training = trainlabels.zip(traintfidf).map(lambda x: LabeledPoint(x[0], x[1]))
# coalesce(1) so the LibSVM output is a single part-00000 file, which is
# what loadLibSVMFile is pointed at below.
MLUtils.saveAsLibSVMFile(training.coalesce(1), "/home/madhura/ML_Spring16/MLProject/data/libsvmfile")
data = MLUtils.loadLibSVMFile(sc, "/home/madhura/ML_Spring16/MLProject/data/libsvmfile/part-00000")

(trainingData, testData) = data.randomSplit([0.7, 0.3])
# BUG FIX: train on the 70% training split, not the full dataset —
# otherwise the held-out testData leaks into training and any later
# evaluation is meaningless.
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
model.save(sc, "/home/madhura/ML_Spring16/MLProject/SentimentAnalysis_NLTK_NB/src/myRandomForestClassificationModel")
示例3: print
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import saveAsLibSVMFile [as 别名]
# Example 3: parse a CSV of economic indicators into LabeledPoints and
# save them in LibSVM format for later model training.
import sys  # BUG FIX: sys.exit() is called below but sys was never imported

# Try and import the PySpark classes
try:
    from pyspark import SparkContext
    from pyspark import SparkConf
    from pyspark.mllib.classification import LogisticRegressionWithSGD
    from pyspark.mllib.classification import LabeledPoint
    from pyspark.mllib.util import MLUtils
    print("Successfully loaded Spark and MLlib classes...")
except ImportError as e:
    print("Error importing spark modules", e)
    sys.exit(1)

from numpy import array

conf = SparkConf().setAppName("RecessionPredictionModel").setMaster("local")
sc = SparkContext(conf=conf)
data = sc.textFile("/Users/agaram/development/DataScienceExperiments/econometricsPoc/EconometricsDataSlope.csv/Sheet1-Table1.csv")


def _to_labeled_point(line):
    # Columns 1..7 of the CSV are the features; the last of them
    # (index 6 within the slice) doubles as the label.
    # Parse the line once instead of splitting/converting it twice.
    fields = [float(x) for x in line.split(',')[1:8]]
    return LabeledPoint(fields[6], array(fields))


parsedData = data.map(_to_labeled_point)
MLUtils.saveAsLibSVMFile(parsedData, "/Users/agaram/development/DataScienceExperiments/econometricsPoc/svmDataSlope")
示例4: loadVecs
# 需要导入模块: from pyspark.mllib.util import MLUtils [as 别名]
# 或者: from pyspark.mllib.util.MLUtils import saveAsLibSVMFile [as 别名]
'''
# Example 4: regress log(1 + score) on pre-computed doc2vec vectors with
# a RandomForest, writing the training data out in LibSVM format first.

# Keep only scored stories that have a title, projected down to the score.
scores = df.where("score IS NOT NULL") \
    .where("type='story'") \
    .where("title IS NOT NULL") \
    .map(lambda row: row.score)


def loadVecs(score_pairs):
    """Map (score, index) pairs to (score, docvec) rows.

    Runs once per partition (mapPartitions) so the large .npy matrix is
    loaded once per task instead of once per record.
    NOTE(review): assumes the RDD's zipWithIndex order matches the row
    order of the saved doc2vec matrix — confirm upstream.
    """
    import numpy as np
    docvecs = np.load("/data/_hndata/hn.docvecs.doctag_syn0.npy")
    return [(s, docvecs[i]) for (s, i) in score_pairs]


vecs = scores.zipWithIndex().mapPartitions(loadVecs)

# Label is log1p(score): compresses the heavy-tailed score distribution.
data = vecs.map(lambda pair: LabeledPoint(log(float(pair[0]) + 1.0), pair[1]))
MLUtils.saveAsLibSVMFile(data, "hdfs:///hndata/docvecs")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Note: Use larger numTrees in practice.
# Setting featureSubsetStrategy="auto" lets the algorithm choose.
rr = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                 numTrees=5, featureSubsetStrategy="auto",
                                 impurity='variance', maxDepth=4, maxBins=32)

predictions = rr.predict(testData.map(lambda x: x.features))
# BUG FIX: lp.label is already log(score + 1) (set when `data` was built
# above); applying log() again would compare doubly-transformed labels
# against singly-transformed predictions, inflating the error metric.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index instead of the Python-2-only tuple-unpacking lambda (PEP 3113).
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())