當前位置: 首頁>>代碼示例>>Python>>正文


Python feature.StandardScaler類代碼示例

本文整理匯總了Python中pyspark.mllib.feature.StandardScaler的典型用法代碼示例。如果您正苦於以下問題：Python StandardScaler類的具體用法？Python StandardScaler怎麼用？Python StandardScaler使用的例子？那麼，這裏精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了StandardScaler類的15個代碼示例，這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚，您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: test_model_transform

 def test_model_transform(self):
     """Fit a default StandardScaler and check that the first row maps to itself."""
     rows = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     fitted = StandardScaler().fit(self.sc.parallelize(rows))
     scaled = fitted.transform([1.0, 2.0, 3.0])
     expected = DenseVector([1.0, 2.0, 3.0])
     self.assertEqual(scaled, expected)
開發者ID:HodaAlemi,項目名稱:spark,代碼行數:8,代碼來源:tests.py

示例2: test_model_setters

 def test_model_setters(self):
     """Switch on withMean/withStd through the model setters and check the transform."""
     rows = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     fitted = StandardScaler().fit(self.sc.parallelize(rows))
     # Both setters must hand back a non-None value.
     mean_result = fitted.setWithMean(True)
     self.assertIsNotNone(mean_result)
     std_result = fitted.setWithStd(True)
     self.assertIsNotNone(std_result)
     centered = fitted.transform([1.0, 2.0, 3.0])
     expected = DenseVector([-1.0, -1.0, -1.0])
     self.assertEqual(centered, expected)
開發者ID:HodaAlemi,項目名稱:spark,代碼行數:10,代碼來源:tests.py

示例3: extract_features

    def extract_features(self, feat='tfidf', **kwargs):
        """
        Build a keyed RDD of hashed term-frequency (optionally TF-IDF) vectors.

        The values of ``self.RDD`` are hashed with ``HashingTF``.  When
        ``feat == 'tfidf'`` the vectors are re-weighted with an ``IDF`` model,
        and when ``self.model_type == 'log_reg'`` they are additionally
        standardized (mean and std) with a ``StandardScaler``.

        Parameters
        --------
        feat: 'tfidf' applies the IDF step; any other value skips it.
        kwargs: ``num_features`` (default 10000) for HashingTF and
        ``minDocFreq`` (default 2) for IDF.

        Returns
        --------
        RDD of features with key.
        """

        def densify(vec):
            # Both model transforms below are fed dense arrays.
            return vec.toArray()

        # hash each value of the keyed RDD into a TF vector
        hashing_tf = HashingTF(kwargs.get('num_features', 10000))
        keyed = self.RDD.mapValues(hashing_tf.transform).cache()

        # optionally re-weight the TF vectors with IDF
        if feat == 'tfidf':
            doc_keys = keyed.keys()
            term_freqs = keyed.values()
            idf_model = IDF(minDocFreq=kwargs.get('minDocFreq', 2)).fit(term_freqs)
            weighted = idf_model.transform(term_freqs.map(densify))
            keyed = doc_keys.zip(weighted)

        # standardize only for the 'log_reg' model type
        if self.model_type == 'log_reg':
            doc_keys = keyed.keys()
            vectors = keyed.values()
            scaler_model = StandardScaler(withMean=True, withStd=True).fit(vectors)
            scaled = scaler_model.transform(vectors.map(densify))
            keyed = doc_keys.zip(scaled)

        return keyed
開發者ID:Nathx,項目名稱:parental_advisory_ml,代碼行數:38,代碼來源:spark_model.py

示例4: LabeledPoint

# 24 = mode
# 27 = tempo
# 28 = time_signature

# Pair each track id with its "rocks" label and its song data; a label of None
# is replaced by 0.0.
# NOTE(review): join() looks like an inner join, so `rocks` is presumably never
# None here and the 0.0 fallback may be dead — confirm whether an outer join
# was intended.
allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
# Result is discarded; this only matters when run interactively.
allData.take(3)

# label data

# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))

# Split labels and feature rows apart so that the features can be scaled, then
# zip them back together below.
labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
# Positional arguments; presumably (withMean, withStd) — confirm against the
# StandardScaler signature.
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)

# Re-attach each label to its scaled feature vector.
labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))

# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))

# Result is discarded; this only matters when run interactively.
labeledData.take(3)

# make sample sizes equal
# Keep only the points labelled 1.0.
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
# The next two results are discarded; only the final count is stored in nrock.
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()
開發者ID:ScalingUpMusic,項目名稱:SUMsandbox,代碼行數:30,代碼來源:rock_ml.py

示例5: StandardScaler

# Build a Spark DataFrame from the collected rows and show it.
df = sqlContext.createDataFrame(dictList)
df.show()
# toPandas is a method and has to be called; without the parentheses `pdf` is
# a bound method and pivot_table() below cannot use it.
pdf = df.toPandas()

# Pivot the pandas copy: one row per datetime, one column per 'data:temp' value.
table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print(table.values)
# For Testing
#df.show()
#df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
# Column 0 ('data:temp') is used as the label below, the rest as features.
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
#df.show()
# NOTE(review): `temp` is not used again in the visible code.
temp = df.map(lambda line:LabeledPoint(line[0], [line[1:]]))

# Scale the data
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print(features_transform.take(5))

# Labels are the first selected column.
lab = df.map(lambda row: row[0])

# Re-attach each label to its scaled feature vector.
transformedData = lab.zip(features_transform)

# NOTE(review): row[1] is already a vector; wrapping it in a list nests the
# features one level deeper — confirm this is what LabeledPoint should receive.
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))

# 80/20 train/test split with a fixed seed.
trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)

# NOTE(review): `lr` is not used in the visible code; the model that is
# actually trained is the LinearRegressionWithSGD one below.
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
開發者ID:stevekludt,項目名稱:sparkModels,代碼行數:31,代碼來源:HBaseRead.py

示例6: main

def main():
    """Standardize the pickled data set, train four classifiers and print their scores.

    Records are loaded from '/pickleData'; each record is accessed as a mapping
    with a 'label' and a 'feature' entry.  The feature vectors are standardized
    with a StandardScaler, the data is split 70/30 (seed 100) into train and
    test sets, and logistic regression (LBFGS and SGD), a decision tree and a
    random forest are trained.  Every model is scored through the ``test``,
    ``fone`` and ``accuracy`` helpers, and the scores plus the two logistic
    regression weight vectors are printed.
    """
    appName = "BadOrGood;zl"

    # "spark.executor.instances" (plural) is the Spark setting name; the
    # singular spelling used previously is not a recognised setting.
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instances", "3")
            )
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)

    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)

    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit(AllDataRawrdd.map(lambda rec: Vectors.dense(rec['feature'])))
    labels = AllDataRawrdd.map(lambda rec: rec['label'])
    featureTransformed = model.transform(AllDataRawrdd.map(lambda rec: rec['feature']))
    # Re-attach each label to its standardized feature vector.
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map(lambda pair: {'label': pair[0], 'feature': pair[1]})

    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map(lambda rec: LabeledPoint(rec['label'], rec['feature'])).persist()
    testDatardd = testDataRawrdd.map(lambda rec: {'label': rec['label'], 'feature': list(rec['feature'])}).persist()

    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)

    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)

    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    # Single-argument print(...) behaves the same as the print statement on
    # Python 2 and also runs on Python 3.
    print("LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac))
    print("LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac))
    print("Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac))
    print("Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac))

    print(lrmLBFGS.weights)
    print(lrmSGD.weights)

    sc.stop()
開發者ID:retanoj,項目名稱:ss_homework,代碼行數:69,代碼來源:BadOrGood.py

示例7: norm

def norm(features):
    """Fit a StandardScaler with mean and std both disabled on *features* and apply it.

    NOTE(review): with withMean=False and withStd=False the scaler presumably
    leaves the vectors unchanged — confirm that a no-op is intended.
    """
    fitted = StandardScaler(withMean=False, withStd=False).fit(features)
    scaled = fitted.transform(features)
    return scaled
開發者ID:aymen82,項目名稱:kaggler-competitions-scripts,代碼行數:3,代碼來源:script.py

示例8: return

    parts = line.strip().split("::")
    return (int(parts[0])-1, int(parts[1])-1, float(parts[2]))

#load in input file
path = sys.argv[1]

#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

# Split labels and features apart so that only the features get scaled.
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)


#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features))   #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
# count() sizes the RDD without shipping every row to the driver the way
# len(data.collect()) did; // keeps the fold size an integer on Python 2 and 3.
partitionSize = data.count() // num_folds

#train/validate 10 times on each k
i = 0
j = partitionSize
開發者ID:Aniketsaoji,項目名稱:NetflixRecommender,代碼行數:31,代碼來源:KMeans_content.py

示例9: OrderedDict

    # Count the occurrences of each label and print them, most frequent first.
    label_counts = labels.countByValue()
    sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
    for label, count in sorted_labels.items():
        print label, count

    # Prepare data for clustering input
    # the data contains non-numeric features, we want to exclude them since
    # k-means works with numeric features. These are the first three and the last
    # column in each data row
    print "Parsing dataset..."
    parsed_data = raw_data.map(parse_interaction)
    # Cached because it is used both to fit the scaler and to be transformed.
    parsed_data_values = parsed_data.values().cache()

    # Standardize data
    print "Standardizing data..."
    standardizer = StandardScaler(True, True)
    standardizer_model = standardizer.fit(parsed_data_values)
    standardized_data_values = standardizer_model.transform(parsed_data_values)

    # Evaluate k = 10, 20, ... up to max_k (steps of 10)
    print "Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k}
    scores = map(lambda k: clustering_score(standardized_data_values, k), range(10,max_k+1,10))

    # Obtain min score k
    # Each score entry is indexed as [0] = k, [1] = model, [2] = score to minimise.
    min_k = min(scores, key=lambda x: x[2])[0]
    print "Best k value is %(best_k)d" % {"best_k": min_k}

    # Use the best model to assign a cluster to each datum
    # We use here standardized data - it is more appropriate for exploratory purposes
    print "Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k}
    # Same minimum as above, this time taking the model element.
    best_model = min(scores, key=lambda x: x[2])[1]
開發者ID:4sp1r3,項目名稱:kdd-cup-99-spark,代碼行數:31,代碼來源:KDDCup99.py

示例10: StandardScaler

# This should be the maximum possible time
# (23:59:59 expressed in seconds)
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
# First time window: timestamps strictly between 0 and 15 minutes (in seconds).
low = 0
high = 15 * 60
# One trained regression model is appended per window.
modelList = []

# NOTE(review): `low` and `high` are not advanced anywhere in the visible part
# of the loop body — confirm that the window is moved further down.
while low < max_time: # Temp should run once
	# Rows whose timestamp lies strictly inside the current window.
	timeseries = df.filter(lambda x: low < x.timestamp < high)	

	#if timeseries.count() > 0:
	# Column 0 is used as the label below; the remaining columns are the features.
	features = timeseries.map(lambda row: row[1:])
		#print "Possible points"
		#print features.collect()

	# `model` holds the fitted scaler here and is rebound to the regression
	# model further down.
	model = StandardScaler().fit(features)
	features_t = model.transform(features)
	
	# Re-attach each label to its scaled feature vector.
	label = timeseries.map(lambda row: row[0])
	labeled_data = label.zip(features_t)

	final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
	
	model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
		#model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
		#model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
	modelList.append(model)
		

		#print ""
		#print "Model1 weights " + str(model.weights)
開發者ID:benCoomes,項目名稱:projectSol,代碼行數:31,代碼來源:spark_linear_regression.py

示例11: SparkContext

from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$

# Script entry point: fit two StandardScaler variants on the sample LIBSVM data
# and print the scaled results.
if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    # Labels and features are separated so that only the features are scaled.
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    # scaler1 uses the default flags, scaler2 enables both mean and std scaling.
    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean.
    # The features are converted to dense vectors before this transform;
    # presumably mean-centering requires dense input — confirm.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
開發者ID:11wzy001,項目名稱:spark,代碼行數:31,代碼來源:standard_scaler_example.py

示例12: SparkContext


#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

# Two small dense vectors used as the whole data set.
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

#all false, do nothing.
standardizer = StandardScaler(withMean=False, withStd=False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
# print(r) with a single argument behaves like the former `print r` statement
# on Python 2 and matches the print(...) calls used elsewhere in this script.
for r in result.collect():
    print(r)

print("\n")

#deducts the mean
standardizer = StandardScaler(withMean=True, withStd=False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)

print("\n")

#divides the length of vector
開發者ID:aviyashchin,項目名稱:CollabFiltering-Netflix-PySpark,代碼行數:29,代碼來源:classification.py

示例13: toLabeledPoint

#Section 7.4.4
from pyspark.mllib.regression import LabeledPoint
def toLabeledPoint(x):
  """Turn a vector into a LabeledPoint: last element is the label, the rest are features."""
  values = x.toArray()
  label = values[-1]
  features = values[:-1]
  return LabeledPoint(label, Vectors.dense(features))

# Convert every row vector into a LabeledPoint.
housingData = housingVals.map(toLabeledPoint)

#Section 7.4.5
# 80/20 split into training and validation sets (no seed is passed).
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]

#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
# The scaler is fitted on the training features only and then applied to both
# the training and the validation features.
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
# Re-attach each label to its scaled feature vector.
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
# Both scaled sets are cached before training starts.
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
開發者ID:AkiraKane,項目名稱:first-edition,代碼行數:31,代碼來源:ch07-listings.py

示例14: main

def main(argv):
	"""Parse command-line options, then train and evaluate a tag classifier.

	Options (short / long):
	  -h / --help        print usage and exit
	  -v / --verbose     print extra information
	  -d / --datapath    data directory (default '/root/data/AdditionalFiles/')
	  -t / --tagstring   tag to learn, lower-cased (default 'rock')
	  -a / --alldata     passed on as usealldata=True
	  -m / --model       'logistic' or 'svm' (default 'logistic')
	  -s / --step        SGD step size (default 1.0)
	  -i / --iterations  SGD iterations (default 100)
	  -o / --holdout     test fraction, strictly between 0 and 1 (default 0.1)
	  -c / --intercept   sets model_intercept to True

	The process exits on unparseable options, on -h/--help, on an unknown
	model name and on a holdout value outside (0, 1).
	"""

	verbose = False

	dbpath = '/root/data/AdditionalFiles/'
	tagstring = 'rock'
	usealldata = False

	holdout = 0.1
	model_iterations = 100
	model_step = 1.0
	# NOTE(review): the default is already True, so -c/--intercept currently
	# changes nothing — confirm which default was intended.
	model_intercept = True

	# possible types logistic and svm
	model_type = 'logistic'

	try:
		opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
	except getopt.GetoptError:
		print('rockTag.py -d <data path> -t <tag string>')
		sys.exit(2)
	for opt, arg in opts:
		# "--help" is declared as a long option above, so it is handled here too.
		if opt in ("-h", "--help"):
			print('rockTag.py -d <data path> -t <tag string>')
			sys.exit()
		elif opt in ("-v", "--verbose"):
			verbose = True
		elif opt in ("-d", "--datapath"):
			dbpath = arg
		elif opt in ("-t", "--tagstring"):
			tagstring = str(arg).lower()
		elif opt in ("-a", "--alldata"):
			usealldata = True
		elif opt in ("-m", "--model"):
			# lower() must be called: storing the bound method made model_type
			# match neither branch below, leaving `model` undefined.
			requested_model = str(arg).lower()
			if requested_model in ['logistic','svm']:
				model_type = requested_model
			else:
				print('valid models are logistic and svm')
				sys.exit()
		elif opt in ("-s", "--step"):
			model_step = float(arg)
		elif opt in ("-i", "--iterations"):
			model_iterations = int(arg)
		elif opt in ("-o", "--holdout"):
			holdout = float(arg)
			# Logical `or`, not bitwise `|`: `0 | holdout` binds tighter than the
			# comparisons and raises TypeError for a float.
			if holdout <= 0 or holdout >= 1:
				print('holdout must be greater than 0 and less than 1')
				# An out-of-range fraction cannot produce a usable split.
				sys.exit(2)
		elif opt in ("-c", "--intercept"):
			model_intercept = True

	if verbose:
		print('data path: ' + dbpath)
		print('tag string: ' + tagstring)

	labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)

	# scale features
	std = StandardScaler(True, True).fit(features)
	features = std.transform(features)

	# make labeled data
	# Index into the (label, features) pair instead of using a tuple-parameter
	# lambda, which is a syntax error on Python 3.
	labeledData = labels.zip(features).map(lambda pair: LabeledPoint(pair[0], pair[1]))
	if verbose: labeledData.take(3)

	# rebalance samples
	equalSampleData = rebalanceSample(labeledData, verbose=verbose)

	# split data
	trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
	if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)

	# train model
	if model_type == 'logistic':
		model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
	elif model_type == 'svm':
		model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)

	evalString = evaluateModel(model, testData)
	print(evalString)
開發者ID:ScalingUpMusic,項目名稱:SUMapplication,代碼行數:79,代碼來源:oneTag_1_4.py

示例15: SparkConf


    # step 1 - create spark context
    conf = SparkConf().setAppName("KMeans-Content")\
       .set("spark.executor.memory","1g")
    sc = SparkContext()


    # step 2 - load in input file
    data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
    labels = data.map(lambda x:x.label)
    features = data.map(lambda x:x.features)

  
    # step 3 - standarize the data with unit values and 0 mean
    scaler = StandardScaler(withMean=False,withStd=True).fit(features)

    data2 = labels.zip(scaler.transform(features))

    numFeatures = len(data2.values().take(10)[0])
    print "Type of data2: ",type(data2) #RDD
    print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
    print "Sample: ",data2.values().take(1)[0]

    # splitting up the data to training, validation and testing models.
    train,val,test = data2.randomSplit([.80,.10,.10])


    print "Training Dataset Size:",train.count()
    print "Validation Dataset size:",val.count()
    print "Test Dataset Size:",test.count()
開發者ID:ellenkimsy,項目名稱:Big-Data-Homework,代碼行數:29,代碼來源:Content_KMeans.py


注:本文中的pyspark.mllib.feature.StandardScaler類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。