This article collects typical code examples of the Python method pyspark.mllib.feature.StandardScaler.fit. If you are unsure what StandardScaler.fit does or how to use it, the curated examples below may help; you can also explore the containing class, pyspark.mllib.feature.StandardScaler, for more context.
The following presents 5 code examples of StandardScaler.fit, sorted by popularity by default.
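Before the collected examples, here is a minimal end-to-end sketch of the fit/transform pattern they all share; the application name and the toy vectors are illustrative assumptions, not taken from any example below.

from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="scaler-demo")  # hypothetical app name
# A toy RDD of dense feature vectors
data = sc.parallelize([Vectors.dense([1.0, 10.0]), Vectors.dense([3.0, 30.0])])

# fit() computes column means and standard deviations over the RDD...
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(data)

# ...and the fitted model applies (x - mean) / std column-wise
for v in model.transform(data).collect():
    print(v)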
Example 1: fit
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
def fit(self, dataset):
    """
    Computes the mean and standard deviation of a dataset, which are then used to standardize data.
    :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`
    """
    if isinstance(dataset, LabeledDataSet):
        dataset = dataset.features
    if isinstance(dataset, pyspark.rdd.RDD):
        # Distributed path: delegate to MLlib's StandardScaler (imported as StdSc)
        standarizer = StdSc(self.flag_mean, self.flag_std)
        self.model = standarizer.fit(dataset)
    else:
        # Local path: compute the statistics with NumPy
        if type(dataset) is not np.ndarray:
            dataset = np.array(dataset)
        if self.flag_mean is True:
            self.mean = dataset.mean(axis=0)
        if self.flag_std is True:
            # ddof=1 gives the sample standard deviation, matching MLlib
            self.std = dataset.std(axis=0, ddof=1)
    return
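The NumPy branch passes ddof=1 because MLlib's scaler divides by the sample standard deviation, so the two paths should produce identical scalings. A quick cross-check, sketched here assuming an existing SparkContext named sc:

import numpy as np
from pyspark.mllib.feature import StandardScaler as StdSc

rows = np.array([[1.0, 4.0], [2.0, 6.0], [3.0, 8.0]])

# Local statistics, as in the NumPy branch above
scaled_local = (rows - rows.mean(axis=0)) / rows.std(axis=0, ddof=1)

# Distributed scaling via MLlib, as in the RDD branch (sc is assumed to exist)
model = StdSc(True, True).fit(sc.parallelize(rows))
scaled_dist = model.transform(sc.parallelize(rows)).collect()

print(scaled_local)   # rows here should match the vectors below
print(scaled_dist)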
Example 2: extract_features
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
def extract_features(self, feat='tfidf', **kwargs):
    """
    Converts each subtitle into its TF or TF-IDF representation and
    standardizes it (zero mean, unit variance) when the model requires it.

    Parameters
    ----------
    feat : 'tf' or 'tfidf'
    kwargs : num_features, minDocFreq, or other arguments to be
        passed to the MLlib objects.

    Returns
    -------
    RDD of (key, feature vector) pairs.
    """
    # Transform bag-of-words into hashed term-frequency vectors
    num_features = kwargs.get('num_features', 10000)
    htf = HashingTF(num_features)
    feat_rdd = self.RDD.mapValues(htf.transform).cache()
    # Reweight TF vectors into TF-IDF vectors
    if feat == 'tfidf':
        keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
        minDocFreq = kwargs.get('minDocFreq', 2)
        idf = IDF(minDocFreq=minDocFreq)
        idf_model = idf.fit(tf_vecs)
        idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        # zip relies on keys and idf_rdd sharing partitioning and order
        feat_rdd = keys.zip(idf_rdd)
    # Logistic regression expects zero-mean, unit-variance features
    if self.model_type == 'log_reg':
        normalizer = StandardScaler(withMean=True, withStd=True)
        keys, vecs = feat_rdd.keys(), feat_rdd.values()
        norm_model = normalizer.fit(vecs)
        norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(norm_rdd)
    return feat_rdd
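The same pipeline condensed into a standalone sketch; the subtitle corpus and the SparkContext sc are assumptions for illustration:

from pyspark.mllib.feature import HashingTF, IDF, StandardScaler

docs = sc.parallelize([
    ("sub1", ["the", "cat", "sat"]),
    ("sub2", ["the", "dog", "ran"]),
])

htf = HashingTF(1000)                      # hash tokens into 1000 buckets
tf = docs.mapValues(htf.transform).cache()

idf_model = IDF(minDocFreq=1).fit(tf.values())
tfidf = tf.keys().zip(idf_model.transform(tf.values()))

scaler_model = StandardScaler(withMean=True, withStd=True).fit(tfidf.values())
scaled = tfidf.keys().zip(scaler_model.transform(
    tfidf.values().map(lambda v: v.toArray())))
print(scaled.take(1))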
Example 3: StandardScaler
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
df.show()
pdf = df.toPandas()  # toPandas is a method; the original was missing the call parentheses
table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print(table.values)
# For testing:
# df.show()
# df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
# df.show()
# DataFrame.map works directly on Spark 1.x; on 2.x+ use df.rdd.map instead
temp = df.map(lambda line: LabeledPoint(line[0], [line[1:]]))  # unused below
# Scale the features (every column except the first, which is the label)
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print(features_transform.take(5))
# Re-attach labels to the scaled features
lab = df.map(lambda row: row[0])
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))
trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)
# Note: pyspark.ml's LinearRegression works on DataFrames, not on this RDD;
# the mllib LinearRegressionWithSGD call below is the one actually used.
# lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
print(linearModel.weights)
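The snippet stops at printing the weights; a common next step, sketched here as an assumption (reusing testingData and linearModel from above), is to score the held-out split with mean squared error:

# Predict on the held-out split and compute mean squared error
preds_and_labels = testingData.map(
    lambda p: (float(linearModel.predict(p.features)), p.label))
mse = preds_and_labels.map(lambda pl: (pl[0] - pl[1]) ** 2).mean()
print("test MSE: %f" % mse)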
Example 4: OrderedDict
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in sorted_labels.items():
    print(label, count)
# Prepare data for clustering input.
# The data contains non-numeric features; k-means needs numeric input,
# so we exclude them (the first three and the last column of each row).
print("Parsing dataset...")
parsed_data = raw_data.map(parse_interaction)
parsed_data_values = parsed_data.values().cache()
# Standardize data
print("Standardizing data...")
standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(parsed_data_values)
standardized_data_values = standardizer_model.transform(parsed_data_values)
# Evaluate values of k from 10 to max_k in steps of 10
print("Calculating total within-cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k})
scores = [clustering_score(standardized_data_values, k) for k in range(10, max_k + 1, 10)]
# Each score is a (k, model, total_distance) tuple; pick the k with the lowest distance
min_k = min(scores, key=lambda x: x[2])[0]
print("Best k value is %(best_k)d" % {"best_k": min_k})
# Use the best model to assign a cluster to each datum.
# We use the standardized data here - it is more appropriate for exploratory purposes.
print("Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k})
best_model = min(scores, key=lambda x: x[2])[1]
cluster_assignments_sample = standardized_data_values.map(
    lambda datum: str(best_model.predict(datum)) + "," + ",".join(map(str, datum))).sample(False, 0.05)
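parse_interaction and clustering_score are helpers defined elsewhere in the original script. A plausible sketch of clustering_score, written here purely as an assumption to make the snippet self-contained, returning the (k, model, total_distance) tuples indexed above:

from pyspark.mllib.clustering import KMeans

def clustering_score(data, k):
    # Train a k-means model and return (k, model, total within-cluster distance),
    # matching the x[0]/x[1]/x[2] indexing used in the example
    model = KMeans.train(data, k, maxIterations=10)
    score = model.computeCost(data)
    return (k, model, score)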
Example 5: SparkContext
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
# Standardizes features by removing the mean and scaling to unit variance,
# using column summary statistics of the samples in the training set.
from pyspark.mllib.feature import Normalizer  # imported in the original script but unused here
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
dataset = sc.parallelize(vs)

# Both flags False: the transform is a no-op
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)
print("\n")

# withMean=True: subtract the column mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)
print("\n")

# withStd=True: divide by the column (sample) standard deviation,
# not by the vector length
standardizer = StandardScaler(False, True)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)