This article collects typical usage examples of the Python class pyspark.mllib.feature.StandardScaler. If you are unsure what the StandardScaler class does, how to use it, or what real code that uses it looks like, the curated examples below should help.
The following 15 code examples of the StandardScaler class are ordered by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
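Before the individual examples, here is a minimal sketch of the fit/transform pattern they all share (the SparkContext setup, the app name, and the toy data are assumptions added for illustration, not taken from any example below):

from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="StandardScalerSketch")  # hypothetical app name
# a tiny in-memory RDD of dense feature vectors
features = sc.parallelize([Vectors.dense([1.0, 10.0]),
                           Vectors.dense([3.0, 30.0]),
                           Vectors.dense([5.0, 50.0])])
# fit() computes per-column mean and standard deviation on the training RDD;
# transform() applies them to a single vector or to an RDD of vectors
scaler = StandardScaler(withMean=True, withStd=True).fit(features)
scaled = scaler.transform(features)
print(scaled.collect())
sc.stop()

The same three steps (construct, fit on an RDD of vectors, transform) appear in every example below; the examples differ mainly in how the feature RDD is built and which model is trained afterwards.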
Example 1: test_model_transform
def test_model_transform(self):
    data = [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0]
    ]
    model = StandardScaler().fit(self.sc.parallelize(data))
    # every column has unit sample standard deviation, so the default
    # (withStd=True, withMean=False) scaling leaves the vector unchanged
    self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example 2: test_model_setters
def test_model_setters(self):
    data = [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0]
    ]
    model = StandardScaler().fit(self.sc.parallelize(data))
    self.assertIsNotNone(model.setWithMean(True))
    self.assertIsNotNone(model.setWithStd(True))
    # with the column means subtracted as well, the first row maps to all -1.0
    self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
Example 3: extract_features
def extract_features(self, feat='tfidf', **kwargs):
    """
    Converts each subtitle into its TF/TF-IDF representation.
    Normalizes if necessary.

    Parameters
    ----------
    feat: 'tf' or 'tfidf'.
    kwargs: num_features, minDocFreq, or other arguments to be passed
        to the MLlib objects.

    Returns
    -------
    RDD of features with key.
    """
    # transform BOW into TF vectors
    num_features = kwargs.get('num_features', 10000)
    htf = HashingTF(num_features)
    feat_rdd = self.RDD.mapValues(htf.transform).cache()
    # transform TF vectors into TF-IDF vectors
    if feat == 'tfidf':
        keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
        minDocFreq = kwargs.get('minDocFreq', 2)
        idf = IDF(minDocFreq=minDocFreq)
        idf_model = idf.fit(tf_vecs)
        idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(idf_rdd)
    # logistic regression benefits from zero-mean, unit-variance features
    if self.model_type == 'log_reg':
        normalizer = StandardScaler(withMean=True, withStd=True)
        keys, vecs = feat_rdd.keys(), feat_rdd.values()
        norm_model = normalizer.fit(vecs)
        norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(norm_rdd)
    return feat_rdd
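The class that owns extract_features is not shown in this excerpt. As a rough standalone sketch of the same TF-IDF-plus-standardization steps, here is the pipeline run on a toy RDD of (key, tokens) pairs; the SparkContext, app name, hash size, and toy documents are assumptions for illustration, not part of the original code:

from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF, StandardScaler

sc = SparkContext(appName="TfidfScalingSketch")  # hypothetical app name
docs = sc.parallelize([("s1", ["the", "cat", "sat"]),
                       ("s2", ["the", "dog", "ran"])])
htf = HashingTF(1 << 10)                        # small hash space for the toy data
tf_rdd = docs.mapValues(htf.transform).cache()  # (key, term-frequency vector)
keys, tf_vecs = tf_rdd.keys(), tf_rdd.values()
idf_model = IDF(minDocFreq=1).fit(tf_vecs)
tfidf = idf_model.transform(tf_vecs.map(lambda v: v.toArray()))
# standardize to zero mean / unit variance, as the 'log_reg' branch above does
scaled = StandardScaler(withMean=True, withStd=True).fit(tfidf).transform(tfidf)
print(keys.zip(scaled).take(1))
sc.stop()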
Example 4: LabeledPoint
# 24 = mode
# 27 = tempo
# 28 = time_signature
allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
allData.take(3)
# label data
# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))
labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)
labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))
# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))
labeledData.take(3)
# make sample sizes equal
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()
Example 5: StandardScaler
df = sqlContext.createDataFrame(dictList)
df.show()
pdf = df.toPandas()  # toPandas() is a method call, not an attribute
table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print table.values
# For Testing
#df.show()
#df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
#df.show()
temp = df.map(lambda line:LabeledPoint(line[0], [line[1:]]))
# Scale the data
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print features_transform.take(5)
lab = df.map(lambda row: row[0])
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))
trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)  # DataFrame-API estimator; not used in this excerpt
linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
Example 6: main
def main():
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    # fetch data
    # filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    # fetchDataToFile(hc, filepath)
    # load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
    #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
    #     .repartition(10)
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    # standardizer for train and test data
    model = StandardScaler(True, True) \
        .fit(AllDataRawrdd
             .map(lambda _: Vectors.dense(_['feature'])))
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform(AllDataRawrdd.map(lambda _: _['feature']))
    AllDataRawrdd = labels \
        .zip(featureTransformed) \
        .map(lambda _: {'label': _[0], 'feature': _[1]})
    # sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map(lambda _: LabeledPoint(_['label'], _['feature'])).persist()
    testDatardd = testDataRawrdd.map(lambda _: {'label': _['label'], 'feature': list(_['feature'])}).persist()
    # prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)
    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)
    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)
    print lrmLBFGS.weights
    print lrmSGD.weights
    sc.stop()
Example 7: norm
def norm(features):
    # note: with both withMean and withStd set to False, the fitted model
    # returns the feature vectors unchanged
    scaler = StandardScaler(withMean=False, withStd=False).fit(features)
    return scaler.transform(features)
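Since both flags are False above, this norm() is effectively a no-op. If standardization was the intent (an assumption; nothing in the original code confirms it), a variant would simply enable the flags:

# hedged variant, not the original author's function: actually standardizes
def norm_standardized(features):
    # withMean=True centers each column at zero (requires dense vectors);
    # withStd=True scales each column to unit standard deviation
    scaler = StandardScaler(withMean=True, withStd=True).fit(features)
    return scaler.transform(features)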
Example 8: return
# tail of a rating-parsing helper; the enclosing def is not shown in this excerpt
    parts = line.strip().split("::")
    return (int(parts[0])-1, int(parts[1])-1, float(parts[2]))
#load in input file
path = sys.argv[1]
#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)
#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features) #becomes dense if using withMean. may run out of memory locally
#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features)) #use this line if having memory issues
#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])
#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect())/num_folds) #parameterize this value as num_folds (in loop as well)
#train/validate 10 times on each k
i = 0
j = partitionSize
Example 9: OrderedDict
label_counts = labels.countByValue()
sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in sorted_labels.items():
    print label, count
# Prepare data for clustering input
# the data contains non-numeric features, we want to exclude them since
# k-means works with numeric features. These are the first three and the last
# column in each data row
print "Parsing dataset..."
parsed_data = raw_data.map(parse_interaction)
parsed_data_values = parsed_data.values().cache()
# Standardize data
print "Standardizing data..."
standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(parsed_data_values)
standardized_data_values = standardizer_model.transform(parsed_data_values)
# Evaluate values of k from 10 to max_k in steps of 10
print "Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k}
scores = map(lambda k: clustering_score(standardized_data_values, k), range(10,max_k+1,10))
# Obtain min score k
min_k = min(scores, key=lambda x: x[2])[0]
print "Best k value is %(best_k)d" % {"best_k": min_k}
# Use the best model to assign a cluster to each datum
# We use here standardized data - it is more appropriate for exploratory purposes
print "Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k}
best_model = min(scores, key=lambda x: x[2])[1]
Example 10: StandardScaler
# This should be the maximum possible time
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
low = 0
high = 15 * 60
modelList = []
while low < max_time:  # Temp should run once
    timeseries = df.filter(lambda x: low < x.timestamp < high)
    #if timeseries.count() > 0:
    features = timeseries.map(lambda row: row[1:])
    #print "Possible points"
    #print features.collect()
    model = StandardScaler().fit(features)
    features_t = model.transform(features)
    label = timeseries.map(lambda row: row[0])
    labeled_data = label.zip(features_t)
    final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
    model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
    #model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
    #model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
    modelList.append(model)
    #print ""
    #print "Model1 weights " + str(model.weights)
Example 11: SparkContext
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")
    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)
Example 12: SparkContext
#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler
sc = SparkContext()
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
dataset = sc.parallelize(vs)
#all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect(): print r
print("\n")
#deducts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect(): print r
print("\n")
#divides the length of vector
Example 13: toLabeledPoint
#Section 7.4.4
from pyspark.mllib.regression import LabeledPoint
def toLabeledPoint(x):
    a = x.toArray()
    return LabeledPoint(a[-1], Vectors.dense(a[0:-1]))
housingData = housingVals.map(toLabeledPoint)
#Section 7.4.5
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]
#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)
#Section 7.5.1
Example 14: main
def main(argv):
    verbose = False
    dbpath = '/root/data/AdditionalFiles/'
    tagstring = 'rock'
    usealldata = False
    holdout = 0.1
    model_iterations = 100
    model_step = 1.0
    model_intercept = True
    # possible types logistic and svm
    model_type = 'logistic'
    try:
        opts, args = getopt.getopt(argv, "hvd:t:am:s:i:o:c", ["help", "verbose", "datapath=", "tagstring=", "alldata", "model=", "step=", "iterations=", "holdout=", "intercept"])
    except getopt.GetoptError:
        print 'rockTag.py -d <data path> -t <tag string>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('rockTag.py -d <data path> -t <tag string>')
            sys.exit()
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-d", "--datapath"):
            dbpath = arg
        elif opt in ("-t", "--tagstring"):
            tagstring = str(arg).lower()
        elif opt in ("-a", "--alldata"):
            usealldata = True
        elif opt in ("-m", "--model"):
            if str(arg).lower() in ['logistic', 'svm']:
                model_type = str(arg).lower()
            else:
                print('valid models are logistic and svm')
                sys.exit()
        elif opt in ("-s", "--step"):
            model_step = float(arg)
        elif opt in ("-i", "--iterations"):
            model_iterations = int(arg)
        elif opt in ("-o", "--holdout"):
            holdout = float(arg)
            if holdout <= 0 or holdout >= 1:
                print('holdout must be greater than 0 and less than 1')
        elif opt in ("-c", "--intercept"):
            model_intercept = True
    if verbose:
        print('data path: ' + dbpath)
        print('tag string: ' + tagstring)
    labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)
    # scale features
    std = StandardScaler(True, True).fit(features)
    features = std.transform(features)
    # make labeled data
    labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
    if verbose: labeledData.take(3)
    # rebalance samples
    equalSampleData = rebalanceSample(labeledData, verbose=verbose)
    # split data
    trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
    if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)
    # train model
    if model_type == 'logistic':
        model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
    elif model_type == 'svm':
        model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
    evalString = evaluateModel(model, testData)
    print(evalString)
Example 15: SparkConf
# step 1 - create spark context
conf = SparkConf().setAppName("KMeans-Content") \
    .set("spark.executor.memory", "1g")
sc = SparkContext(conf=conf)  # pass the conf built above so the settings take effect
# step 2 - load in input file
data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
labels = data.map(lambda x:x.label)
features = data.map(lambda x:x.features)
# step 3 - standardize the data to unit variance (withMean=False keeps the vectors sparse)
scaler = StandardScaler(withMean=False,withStd=True).fit(features)
data2 = labels.zip(scaler.transform(features))
numFeatures = len(data2.values().take(10)[0])
print "Type of data2: ",type(data2) #RDD
print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
print "Sample: ",data2.values().take(1)[0]
# splitting up the data to training, validation and testing models.
train,val,test = data2.randomSplit([.80,.10,.10])
print "Training Dataset Size:",train.count()
print "Validation Dataset size:",val.count()
print "Test Dataset Size:",test.count()