This article collects typical usage examples of the Python method pyspark.mllib.feature.StandardScaler.transform. If you are unsure what StandardScaler.transform does, how to call it, or want concrete usage examples, the curated code samples below may help. You can also explore the enclosing class, pyspark.mllib.feature.StandardScaler, for further usage examples.
The following shows 12 code examples of StandardScaler.transform, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
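Before the examples, here is a minimal sketch of the usual fit-then-transform pattern (assuming an already-created SparkContext named sc; the data and variable names are illustrative and not taken from any example below):
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors
# a small RDD of dense feature vectors (illustrative data)
features = sc.parallelize([Vectors.dense([1.0, 2.0]), Vectors.dense([3.0, 4.0])])
# fit() computes the per-column mean and standard deviation and returns a StandardScalerModel
scaler = StandardScaler(withMean=True, withStd=True).fit(features)
# transform() returns a new RDD of standardized vectors (it also accepts a single Vector)
scaled = scaler.transform(features)
print(scaled.collect())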
Example 1: test_model_transform
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def test_model_transform(self):
    data = [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0]
    ]
    model = StandardScaler().fit(self.sc.parallelize(data))
    self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example 2: test_model_setters
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def test_model_setters(self):
    data = [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0]
    ]
    model = StandardScaler().fit(self.sc.parallelize(data))
    self.assertIsNotNone(model.setWithMean(True))
    self.assertIsNotNone(model.setWithStd(True))
    self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
Example 3: SparkConf
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
# step 1 - create spark context
conf = SparkConf().setAppName("KMeans-Content")\
.set("spark.executor.memory","1g")
sc = SparkContext(conf=conf)
# step 2 - load in input file
data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
labels = data.map(lambda x:x.label)
features = data.map(lambda x:x.features)
# step 3 - standardize the features to unit standard deviation (withMean=False, so the mean is not removed and sparse vectors stay sparse)
scaler = StandardScaler(withMean=False,withStd=True).fit(features)
data2 = labels.zip(scaler.transform(features))
numFeatures = len(data2.values().take(10)[0])
print "Type of data2: ",type(data2) #RDD
print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
print "Sample: ",data2.values().take(1)[0]
# splitting up the data to training, validation and testing models.
train,val,test = data2.randomSplit([.80,.10,.10])
print "Training Dataset Size:",train.count()
print "Validation Dataset size:",val.count()
print "Test Dataset Size:",test.count()
Example 4: StandardScaler
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
housingData = housingVals.map(toLabeledPoint)
#Section 7.4.5
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]
#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)
#Section 7.5.1
validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
Example 5: LabeledPoint
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
# 27 = tempo
# 28 = time_signature
allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
allData.take(3)
# label data
# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))
labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)
labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))
# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))
labeledData.take(3)
# make sample sizes equal
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()
labeledNotRock = labeledData.filter(lambda p: p.label != 1.0)
Example 6: norm
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def norm(features):
    # note: with both withMean and withStd set to False, this scaler is effectively an identity transform
    scaler = StandardScaler(withMean=False, withStd=False).fit(features)
    return scaler.transform(features)
Example 7: main
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def main():
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
    #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
    #     .repartition(10)
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    #standardizer for train and test data
    model = StandardScaler(True, True) \
        .fit( AllDataRawrdd \
            .map( lambda _: Vectors.dense(_['feature']) )
        )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
        .zip(featureTransformed) \
        .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)
    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)
    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)
    print lrmLBFGS.weights
    print lrmSGD.weights
    sc.stop()
Example 8: main
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def main(argv):
    verbose = False
    dbpath = '/root/data/AdditionalFiles/'
    tagstring = 'rock'
    usealldata = False
    holdout = 0.1
    model_iterations = 100
    model_step = 1.0
    model_intercept = True
    # possible types logistic and svm
    model_type = 'logistic'
    try:
        opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
    except getopt.GetoptError:
        print 'rockTag.py -d <data path> -t <tag string>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('rockTag.py -d <data path> -t <tag string>')
            sys.exit()
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-d", "--datapath"):
            dbpath = arg
        elif opt in ("-t", "--tagstring"):
            tagstring = str(arg).lower()
        elif opt in ("-a", "--alldata"):
            usealldata = True
        elif opt in ("-m", "--model"):
            if str(arg).lower() in ['logistic', 'svm']:
                model_type = str(arg).lower()
            else:
                print('valid models are logistic and svm')
                sys.exit()
        elif opt in ("-s", "--step"):
            model_step = float(arg)
        elif opt in ("-i", "--iterations"):
            model_iterations = int(arg)
        elif opt in ("-o", "--holdout"):
            holdout = float(arg)
            if holdout <= 0 or holdout >= 1:
                print('holdout must be greater than 0 and less than 1')
        elif opt in ("-c", "--intercept"):
            model_intercept = True
    if verbose:
        print('data path: ' + dbpath)
        print('tag string: ' + tagstring)
    labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)
    # scale features
    std = StandardScaler(True, True).fit(features)
    features = std.transform(features)
    # make labeled data
    labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
    if verbose: labeledData.take(3)
    # rebalance samples
    equalSampleData = rebalanceSample(labeledData, verbose=verbose)
    # split data
    trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
    if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)
    # train model
    if model_type == 'logistic':
        model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
    elif model_type == 'svm':
        model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
    evalString = evaluateModel(model, testData)
    print(evalString)
Example 9: StandardScaler
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
path = sys.argv[1]
#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)
#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features) #becomes dense if using withMean. may run out of memory locally
#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features)) #use this line if having memory issues
#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])
#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect())/num_folds) #parameterize this value as num_folds (in loop as well)
#train/validate 10 times on each k
i = 0
j = partitionSize
data = data.collect()
cv_error_storage = []
#10 fold is better, but I use 5 here in the interest of time
Example 10: StandardScaler
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
low = 0
high = 15 * 60
modelList = []
while low < max_time: # Temp should run once
    timeseries = df.filter(lambda x: low < x.timestamp < high)
    #if timeseries.count() > 0:
    features = timeseries.map(lambda row: row[1:])
    #print "Possible points"
    #print features.collect()
    model = StandardScaler().fit(features)
    features_t = model.transform(features)
    label = timeseries.map(lambda row: row[0])
    labeled_data = label.zip(features_t)
    final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
    model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
    #model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
    #model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
    modelList.append(model)
    #print ""
    #print "Model1 weights " + str(model.weights)
    #print ""
Example 11: SparkContext
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext
    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)
    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))
    # data2 will be unit variance and zero mean.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$
    print("data1:")
    for each in data1.collect():
        print(each)
    print("data2:")
    for each in data2.collect():
        print(each)
    sc.stop()
Example 12: setLevel
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)
def parsePoint(data):
    #return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
    return LabeledPoint(data[0], data[1:])
# store the data from cassandra into a data frame and remove the NA values
data = sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()
data = data.filter("year>0").na.drop()
print data.count()
# Scale the features with StandardScaler
data2 = data.map(lambda x: [x.song_hotttnesss, x.loudness, x.year, x.sentiment, x.tempo, x.unique_words]) # convert each sql.Row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2) # fit a scaler on every column
scaledData = scaler.transform(data2) # transform our data
# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)
# Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000, regParam=1.0, regType="l2", intercept=True)
# Evaluate the model on training data
print ("intercept", model.intercept)
print zip(["loudness","year","sentiment","tempo","unique_words"], model.weights)
sc.stop()