

Python StandardScaler.transform Method Code Examples

This article collects typical usage examples of the Python method pyspark.mllib.feature.StandardScaler.transform, gathered from open-source projects. If you are unsure what StandardScaler.transform does, how to call it, or what it looks like in real code, the hand-picked examples below should help. You can also explore further usage examples of the enclosing class, pyspark.mllib.feature.StandardScaler.


The 12 code examples of StandardScaler.transform shown below are ordered by popularity by default.
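Before the individual examples, here is a minimal, self-contained sketch of the usual pattern: fit a StandardScaler on an RDD of feature vectors, then call transform. The data and application name below are illustrative only.

from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="StandardScalerSketch")  # illustrative app name
features = sc.parallelize([
    Vectors.dense([1.0, 10.0, 100.0]),
    Vectors.dense([2.0, 20.0, 200.0]),
    Vectors.dense([3.0, 30.0, 300.0]),
])

# fit() computes per-column mean/std; transform() applies (x - mean) / std
scaler = StandardScaler(withMean=True, withStd=True).fit(features)
scaled = scaler.transform(features)
print(scaled.collect())
sc.stop()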

Example 1: test_model_transform

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Author: HodaAlemi, Project: spark, Lines: 10, Source: tests.py

Example 2: test_model_setters

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
 def test_model_setters(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
Author: HodaAlemi, Project: spark, Lines: 12, Source: tests.py
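Why the two assertions above hold: each column of the 3x3 test data has a sample standard deviation of exactly 1.0, so with the defaults (withStd=True, withMean=False) the vector comes back unchanged in Example 1, while Example 2 (withMean=True) also subtracts the column means [2.0, 3.0, 4.0]. A quick NumPy check, not part of the test suite:

import numpy as np

data = np.array([[1.0, 2.0, 3.0],
                 [2.0, 3.0, 4.0],
                 [3.0, 4.0, 5.0]])
mean = data.mean(axis=0)        # [2. 3. 4.]
std = data.std(axis=0, ddof=1)  # [1. 1. 1.]  (MLlib uses the sample std)
print((np.array([1.0, 2.0, 3.0]) - mean) / std)  # [-1. -1. -1.]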

Example 3: SparkConf

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
    # step 1 - create spark context
    conf = SparkConf().setAppName("KMeans-Content")\
       .set("spark.executor.memory","1g")
    sc = SparkContext(conf=conf)


    # step 2 - load in input file
    data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
    labels = data.map(lambda x:x.label)
    features = data.map(lambda x:x.features)

  
    # step 3 - standardize the features to unit standard deviation
    # (withMean=False leaves the mean untouched and keeps sparse vectors sparse)
    scaler = StandardScaler(withMean=False, withStd=True).fit(features)

    data2 = labels.zip(scaler.transform(features))

    numFeatures = len(data2.values().take(10)[0])
    print "Type of data2: ",type(data2) #RDD
    print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
    print "Sample: ",data2.values().take(1)[0]

    # splitting up the data to training, validation and testing models.
    train,val,test = data2.randomSplit([.80,.10,.10])


    print "Training Dataset Size:",train.count()
    print "Validation Dataset size:",val.count()
    print "Test Dataset Size:",test.count()

Author: ellenkimsy, Project: Big-Data-Homework, Lines: 31, Source: Content_KMeans.py

Example 4: StandardScaler

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
housingData = housingVals.map(toLabeledPoint)

#Section 7.4.5
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]

#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label))
validPredicts.collect()
import math
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
Author: AkiraKane, Project: first-edition, Lines: 31, Source: ch07-listings.py
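The RMSE above is computed by hand; the same (prediction, label) pairs can also be handed to MLlib's built-in regression evaluator. A short sketch, assuming the validPredicts RDD from the listing above:

# Equivalent evaluation with pyspark.mllib.evaluation (sketch)
from pyspark.mllib.evaluation import RegressionMetrics

metrics = RegressionMetrics(validPredicts)
print(metrics.rootMeanSquaredError)  # should match the hand-computed RMSE
print(metrics.meanAbsoluteError)
print(metrics.r2)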

Example 5: LabeledPoint

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
# 27 = tempo
# 28 = time_signature

allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
allData.take(3)

# label data

# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))

labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)

labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))

# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))

labeledData.take(3)

# make sample sizes equal
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()

labeledNotRock = labeledData.filter(lambda p: p.label != 1.0)
Author: ScalingUpMusic, Project: SUMsandbox, Lines: 33, Source: rock_ml.py
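The excerpt ends right after counting the two classes. One minimal way to equalize the sample sizes, sketched with the variables from the listing above (the original project may balance differently):

# Sketch: downsample the non-rock class to roughly the size of the rock class.
# Uses labeledRock, labeledNotRock and nrock from the listing above (assumption).
fraction = float(nrock) / labeledNotRock.count()
balancedData = labeledRock.union(labeledNotRock.sample(False, fraction, seed=42))
balancedData.count()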

Example 6: norm

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def norm(features):
    # Note: with withMean=False and withStd=False this scaler is effectively a
    # no-op; withStd=True is presumably what was intended for normalization.
    scaler = StandardScaler(withMean=False, withStd=False).fit(features)
    return scaler.transform(features)
Author: aymen82, Project: kaggler-competitions-scripts, Lines: 5, Source: script.py

Example 7: main

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
Author: retanoj, Project: ss_homework, Lines: 71, Source: BadOrGood.py
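The helper functions test(), fone() and accuracy() are not shown in this excerpt; the sketch below is one plausible shape for them, assuming the result RDD holds (prediction, label) pairs. The real project may differ.

def test(model, testDatardd):
    # Predict on an RDD of feature vectors (works for LR/SVM and tree models)
    # and pair each prediction with its true label.
    predictions = model.predict(testDatardd.map(lambda _: _['feature']))
    labels = testDatardd.map(lambda _: float(_['label']))
    return predictions.map(float).zip(labels).cache()

def accuracy(resultrdd):
    # Fraction of predictions that match the label.
    return resultrdd.filter(lambda pl: pl[0] == pl[1]).count() / float(resultrdd.count())

def fone(resultrdd):
    # F1 score for the positive class (label 1.0).
    tp = float(resultrdd.filter(lambda pl: pl[0] == 1.0 and pl[1] == 1.0).count())
    fp = float(resultrdd.filter(lambda pl: pl[0] == 1.0 and pl[1] != 1.0).count())
    fn = float(resultrdd.filter(lambda pl: pl[0] != 1.0 and pl[1] == 1.0).count())
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0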

Example 8: main

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
def main(argv):

	verbose = False

	dbpath = '/root/data/AdditionalFiles/'
	tagstring = 'rock'
	usealldata = False

	holdout = 0.1
	model_iterations = 100
	model_step = 1.0
	model_intercept = True

	# possible types logistic and svm
	model_type = 'logistic'

	try:
		opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
	except getopt.GetoptError:
		print 'rockTag.py -d <data path> -t <tag string>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print('rockTag.py -d <data path> -t <tag string>')
			sys.exit()
		elif opt in ("-v", "--verbose"):
			verbose = True
		elif opt in ("-d", "--datapath"):
			dbpath = arg
		elif opt in ("-t", "--tagstring"):
			tagstring = str(arg).lower()
		elif opt in ("-a", "--alldata"):
			usealldata = True
		elif opt in ("-m", "--model"):
			if str(arg).lower() in ['logistic','svm']:
				model_type = str(arg).lower()
			else:
				print('valid models are logistic and svm')
				sys.exit()
		elif opt in ("-s", "--step"):
			model_step = float(arg)
		elif opt in ("-i", "--iterations"):
			model_iterations = int(arg)
		elif opt in ("-o", "--holdout"):
			holdout = float(arg)
			if holdout <= 0 or holdout >= 1:
				print('holdout must be greater than 0 and less than 1')
		elif opt in ("-c", "--intercept"):
			model_intercept = True

	if verbose:
		print('data path: ' + dbpath)
		print('tag string: ' + tagstring)

	labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)

	# scale features
	std = StandardScaler(True, True).fit(features)
	features = std.transform(features)

	# make labeled data
	labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
	if verbose: labeledData.take(3)

	# rebalance samples
	equalSampleData = rebalanceSample(labeledData, verbose=verbose)

	# split data
	trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
	if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)

	# train model
	if model_type == 'logistic':
		model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
	elif model_type == 'svm':
		model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)

	evalString = evaluateModel(model, testData)
	print(evalString)
Author: ScalingUpMusic, Project: SUMapplication, Lines: 81, Source: oneTag_1_4.py
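The excerpt above stops before the script's entry point. A hypothetical way the module might be wired up and invoked, with the script name taken from its own usage string and all paths and options as placeholders:

# Hypothetical entry point for the script above; the real file may differ.
# Example command line (placeholders):
#   spark-submit rockTag.py -v -d /root/data/AdditionalFiles/ -t rock -m svm -i 200 -s 0.5 -o 0.2
if __name__ == "__main__":
    main(sys.argv[1:])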

Example 9: StandardScaler

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
path = sys.argv[1]

#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)


#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features))   #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect())/num_folds)   #parameterize this value as num_folds (in loop as well)

#train/validate 10 times on each k
i = 0
j = partitionSize
data = data.collect()
cv_error_storage = []

#10 fold is better, but I use 5 here in the interest of time
Author: Aniketsaoji, Project: NetflixRecommender, Lines: 33, Source: KMeans_content.py
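The listing above pulls the whole RDD to the driver with collect() and slices folds by hand. A hedged alternative that keeps the folds distributed, reusing data2, sc and num_folds from the listing (model fitting and evaluation are left out):

# Sketch of distributed k-fold splitting; fold evaluation is illustrative only.
num_folds = 10
folds = data2.randomSplit([1.0 / num_folds] * num_folds, seed=42)
for i in range(num_folds):
    validation = folds[i]
    training = sc.union([fold for j, fold in enumerate(folds) if j != i])
    # ... fit on `training`, evaluate on `validation`, store the error ...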

Example 10: StandardScaler

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
low = 0
high = 15 * 60
modelList = []

while low < max_time: # Temp should run once
	# (low and high are presumably advanced later in the loop body; this excerpt is truncated)
	timeseries = df.filter(lambda x: low < x.timestamp < high)	

	#if timeseries.count() > 0:
	features = timeseries.map(lambda row: row[1:])
		#print "Possible points"
		#print features.collect()

	model = StandardScaler().fit(features)
	features_t = model.transform(features)
	
	label = timeseries.map(lambda row: row[0])
	labeled_data = label.zip(features_t)

	final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
	
	model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
		#model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
		#model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
	modelList.append(model)
		

		#print ""
		#print "Model1 weights " + str(model.weights)
		#print ""
Author: benCoomes, Project: projectSol, Lines: 33, Source: spark_linear_regression.py

Example 11: SparkContext

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)

    sc.stop()
Author: 11wzy001, Project: spark, Lines: 32, Source: standard_scaler_example.py

Example 12: setLevel

# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import transform [as alias]
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.ERROR)
logger.LogManager.getLogger("akka").setLevel(logger.Level.ERROR)

def parsePoint(data):
	#return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
	return LabeledPoint(data[0],data[1:])

# load the data from Cassandra into a DataFrame and drop rows with NA values
data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()

data=data.filter("year>0").na.drop()
print data.count()


# Scale the features with StandardScaler
data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words]) # convert each sql.Row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2) # fit a scaler on every column
scaledData = scaler.transform(data2) # transform the data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)

# # Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True)

# Evaluate the model on training data
print ("intercept",model.intercept)
print zip(["loudness","year","sentiment","tempo","unique_words"],model.weights)

sc.stop()
Author: StephTruong, Project: W251-MillionSong, Lines: 33, Source: songHotnessRegression.py


Note: The pyspark.mllib.feature.StandardScaler.transform examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective developers, and the copyright of the source code remains with the original authors. Please consult the license of the corresponding project before redistributing or using the code; do not reproduce this page without permission.