This page compiles typical usage examples of the Python method pyspark.mllib.feature.StandardScaler.fit. If you are wondering what StandardScaler.fit does, how to use it, or want to see it in context, the curated code examples below may help. You can also explore further usage examples of the containing class, pyspark.mllib.feature.StandardScaler.
Below are 5 code examples of StandardScaler.fit, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: fit
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
def fit(self, dataset):
    """
    Computes the mean and standard deviation of a dataset; these are later used to standardize data.
    :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`
    """
    if isinstance(dataset, LabeledDataSet):
        dataset = dataset.features
    if isinstance(dataset, pyspark.rdd.RDD):
        standardizer = StdSc(self.flag_mean, self.flag_std)
        self.model = standardizer.fit(dataset)
    else:
        if type(dataset) is not np.ndarray:
            dataset = np.array(dataset)
        if self.flag_mean is True:
            self.mean = dataset.mean(axis=0)
        if self.flag_std is True:
            self.std = dataset.std(axis=0, ddof=1)
    return
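For reference, here is a minimal standalone sketch of what the RDD branch above delegates to, i.e. fitting pyspark.mllib's StandardScaler (imported as StdSc in the example) on an RDD of vectors; the SparkContext setup and sample data are assumptions for illustration:

from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

sc = SparkContext.getOrCreate()
data = sc.parallelize([Vectors.dense([1.0, 10.0]), Vectors.dense([3.0, 30.0])])
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(data)                # computes per-column mean and std
print(model.transform(data).collect())  # standardized vectors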
Example 2: extract_features
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
def extract_features(self, feat='tfidf', **kwargs):
    """
    Converts each subtitle into its TF/TF-IDF representation,
    normalizing if necessary.

    Parameters
    ----------
    feat : 'tf' or 'tfidf'.
    kwargs : num_features, minDocFreq, or other arguments to be passed
        to the MLlib objects.

    Returns
    -------
    RDD of features with key.
    """
    # transform bag-of-words into TF vectors
    num_features = kwargs.get('num_features', 10000)
    htf = HashingTF(num_features)
    feat_rdd = self.RDD.mapValues(htf.transform).cache()
    # transform TF vectors into TF-IDF vectors
    if feat == 'tfidf':
        keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
        minDocFreq = kwargs.get('minDocFreq', 2)
        idf = IDF(minDocFreq=minDocFreq)
        idf_model = idf.fit(tf_vecs)
        idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(idf_rdd)
    if self.model_type == 'log_reg':
        normalizer = StandardScaler(withMean=True, withStd=True)
        keys, vecs = feat_rdd.keys(), feat_rdd.values()
        norm_model = normalizer.fit(vecs)
        norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(norm_rdd)
    return feat_rdd
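A note on the toArray() calls above: StandardScaler with withMean=True cannot shift sparse vectors, so the TF/TF-IDF vectors are densified first. Below is a stripped-down sketch of the same pipeline, assuming an existing SparkContext sc; the document keys and tokens are made up:

from pyspark.mllib.feature import HashingTF, IDF, StandardScaler

docs = sc.parallelize([("doc1", ["spark", "scaler", "spark"]),
                       ("doc2", ["spark", "tfidf"])])
htf = HashingTF(1000)                       # hash tokens into a 1000-dim space
tf = docs.mapValues(htf.transform).cache()
idf_model = IDF(minDocFreq=1).fit(tf.values())
tfidf = idf_model.transform(tf.values().map(lambda v: v.toArray()))
scaler = StandardScaler(withMean=True, withStd=True)
scaled = tf.keys().zip(scaler.fit(tfidf).transform(tfidf))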
Example 3: StandardScaler
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
df.show()
pdf = df.toPandas()
table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print(table.values)
# For testing:
#df.show()
#df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
#df.show()
temp = df.rdd.map(lambda line: LabeledPoint(line[0], line[1:]))  # unused; superseded by the zip-based assembly below
# Scale the data
features = df.rdd.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print(features_transform.take(5))
lab = df.rdd.map(lambda row: row[0])
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], row[1]))
trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)  # unused pyspark.ml estimator; the training below uses mllib's LinearRegressionWithSGD
linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
print(linearModel.weights)
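The core pattern here is worth isolating: split the label column from the feature columns, standardize only the features, then zip label and scaled features back into LabeledPoints. A condensed sketch, where rows is an assumed RDD of numeric tuples with the label first:

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.regression import LabeledPoint

labels = rows.map(lambda r: r[0])    # `rows`: assumed RDD, label in column 0
features = rows.map(lambda r: r[1:])
model = StandardScaler(withMean=True, withStd=True).fit(features)
labeled = labels.zip(model.transform(features)) \
                .map(lambda lf: LabeledPoint(lf[0], lf[1]))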
Example 4: OrderedDict
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in sorted_labels.items():
    print(label, count)
# Prepare data for clustering input.
# The data contains non-numeric features, which we exclude since k-means
# works only with numeric features; these are the first three and the last
# column of each data row.
print("Parsing dataset...")
parsed_data = raw_data.map(parse_interaction)
parsed_data_values = parsed_data.values().cache()
# Standardize data
print("Standardizing data...")
standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(parsed_data_values)
standardized_data_values = standardizer_model.transform(parsed_data_values)
# Evaluate values of k from 10 to max_k in steps of 10
print("Calculating total within-cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k})
scores = [clustering_score(standardized_data_values, k) for k in range(10, max_k + 1, 10)]
# Obtain the k with the minimum score
min_k = min(scores, key=lambda x: x[2])[0]
print("Best k value is %(best_k)d" % {"best_k": min_k})
# Use the best model to assign a cluster to each datum.
# We use the standardized data here - it is more appropriate for exploratory purposes.
print("Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k})
best_model = min(scores, key=lambda x: x[2])[1]
cluster_assignments_sample = standardized_data_values.map(
    lambda datum: str(best_model.predict(datum)) + "," + ",".join(map(str, datum))
).sample(False, 0.05)
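parse_interaction and clustering_score are defined elsewhere in the original script. Judging by how the score tuples are indexed above (k at [0], model at [1], distance at [2]), clustering_score plausibly looks like this hypothetical sketch:

from pyspark.mllib.clustering import KMeans

def clustering_score(data, k):
    # hypothetical reconstruction: train k-means and report its cost for this k
    model = KMeans.train(data, k, maxIterations=10)
    cost = model.computeCost(data)  # total within-cluster sum of squared distances
    return (k, model, cost)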
Example 5: SparkContext
# Required import: from pyspark.mllib.feature import StandardScaler [as alias]
# Or: from pyspark.mllib.feature.StandardScaler import fit [as alias]
# Standardizes features by removing the mean and scaling to unit variance,
# using column summary statistics on the samples in the training set.
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
dataset = sc.parallelize(vs)
# Both flags False: the transform is a no-op.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)
print("\n")
# withMean=True: subtracts the column means.
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)
print("\n")
# withStd=True: divides each column by its (sample) standard deviation.
standardizer = StandardScaler(False, True)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)