本文整理汇总了Python中pyspark.mllib.linalg.Vectors.dense方法的典型用法代码示例。如果您正苦于以下问题:Python Vectors.dense方法的具体用法?Python Vectors.dense怎么用?Python Vectors.dense使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyspark.mllib.linalg.Vectors的用法示例。
在下文中一共展示了Vectors.dense方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_save_load
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_save_load(self):
    """Round-trip a TrainValidationSplit and its fitted model through save/load.

    Verifies that the estimator, evaluator and param maps survive persistence
    of the unfitted validator, and that the fitted model keeps its best model.
    """
    import shutil

    temp_path = tempfile.mkdtemp()
    try:
        sqlContext = SQLContext(self.sc)
        # Tiny binary-classification set, replicated 10x so the split is stable.
        dataset = sqlContext.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                                   evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        # Unfitted validator round-trip: key params must be preserved.
        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(),
                         tvs.getEstimatorParamMaps())

        # Fitted model round-trip: the selected best model must match.
        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
    finally:
        # Fix: the original leaked the directory created by mkdtemp().
        shutil.rmtree(temp_path, ignore_errors=True)
示例2: test_nnclassifier_in_pipeline
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_nnclassifier_in_pipeline(self):
    """Fit a Pipeline of MinMaxScaler + NNClassifier (runs on Spark 1.x only)."""
    if self.sc.version.startswith("1"):
        from pyspark.mllib.linalg import Vectors
        frame = self.sqlContext.createDataFrame(
            [(Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             (Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             ], ["features", "label"])
        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        net = Sequential().add(Linear(2, 2))
        loss = ClassNLLCriterion()
        # Classifier consumes the scaler's output column.
        classifier = (NNClassifier(net, loss, MLlibVectorToTensor([2]))
                      .setBatchSize(4)
                      .setLearningRate(0.01)
                      .setMaxEpoch(1)
                      .setFeaturesCol("scaled"))
        fitted = Pipeline(stages=[scaler, classifier]).fit(frame)
        transformed = fitted.transform(frame)
        assert type(transformed).__name__ == 'DataFrame'
示例3: test_model_transform
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_model_transform(self):
    """ElementwiseProduct scales dense and sparse vectors component-wise."""
    scaling = Vectors.dense([3, 2, 1])
    eprod = ElementwiseProduct(scaling)
    dense_in = Vectors.dense([4, 5, 6])
    sparse_in = Vectors.sparse(3, [0], [1])
    self.assertEqual(eprod.transform(dense_in), DenseVector([12, 10, 6]))
    self.assertEqual(eprod.transform(sparse_in), SparseVector(3, [0], [3]))
示例4: _get_train_data
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def _get_train_data(self):
    """Build a tiny labeled DataFrame (id, features, label) for training tests."""
    sql_context = SQLContext(self.sc)
    # Four rows with identical features and alternating labels.
    rows = [(row_id, Vectors.dense([1, 2, 3]), label)
            for row_id, label in enumerate((1.0, 0.0, 1.0, 0.0), start=1)]
    return sql_context.createDataFrame(rows, ['id', 'features', 'label'])
示例5: test_output_columns
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_output_columns(self):
    """OneVsRest.transform appends exactly one 'prediction' column."""
    frame = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    base = LogisticRegression(maxIter=5, regParam=0.01)
    fitted = OneVsRest(classifier=base).fit(frame)
    transformed = fitted.transform(frame)
    self.assertEqual(transformed.columns, ["label", "features", "prediction"])
示例6: test_idf_model
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_idf_model(self):
    """IDF fit over four 11-term frequency vectors yields 11 weights."""
    tf_vectors = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]),
    ]
    # Two partitions to exercise the distributed path.
    fitted = IDF().fit(self.sc.parallelize(tf_vectors, 2))
    self.assertEqual(len(fitted.idf()), 11)
示例7: load_data_rdd
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def load_data_rdd(csv_file, shuffle=True, train=True):
    """Load a CSV into an RDD of (DenseVector, label-string) pairs.

    Train rows use columns 1..n-2 as features and strip 'Class_' from the
    last column as the label; test rows use columns 1..n-1 and a dummy "1".
    """
    if shuffle:
        shuffle_csv(csv_file)
    lines = sc.textFile(data_path + csv_file)
    # Drop the header (first field 'id') and split each row on commas.
    lines = lines.filter(lambda row: row.split(',')[0] != 'id') \
                 .map(lambda row: row.split(','))
    if train:
        return lines.map(
            lambda cols: (Vectors.dense(np.asarray(cols[1:-1]).astype(np.float32)),
                          str(cols[-1]).replace('Class_', '')))
    return lines.map(
        lambda cols: (Vectors.dense(np.asarray(cols[1:]).astype(np.float32)), "1"))
示例8: remove_time_dependent_effects
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def remove_time_dependent_effects(self, ts):
    """Invert this model on a time series to recover the underlying errors.

    Parameters
    ----------
    ts : Numpy array
        Time series of observations with this model's characteristics.

    Returns the series with time-dependent effects removed, as a Numpy array.
    """
    # Zero-filled destination vector the JVM side writes into.
    dest = Vectors.dense(np.array([0] * len(ts)))
    java_result = self._jmodel.removeTimeDependentEffects(
        _py2java(self._ctx, Vectors.dense(ts)),
        _py2java(self._ctx, dest))
    return _java2py(self._ctx, java_result.toArray())
示例9: load_data_frame
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def load_data_frame(csv_file, shuffle=True, train=True):
    """Load a CSV into a DataFrame with columns features/category/label."""
    if shuffle:
        shuffle_csv(csv_file)
    # RDD of split CSV rows, dropping the header line (first field 'label').
    rows = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    rows = rows.filter(lambda r: r.split(',')[0] != 'label') \
               .map(lambda r: r.split(','))
    # NOTE(review): both branches build identical records even though the
    # test branch claims "dummy labels" — confirm against the original source.
    if train:
        rows = rows.map(
            lambda cols: (Vectors.dense(np.asarray(cols[1:]).astype(np.float32)),
                          'class_' + str(cols[0]), int(cols[0])))
    else:
        # Test data gets dummy labels; the structure matches the train data.
        rows = rows.map(
            lambda cols: (Vectors.dense(np.asarray(cols[1:]).astype(np.float32)),
                          'class_' + str(cols[0]), int(cols[0])))
    return sqlcontext.createDataFrame(rows, ['features', 'category', 'label'])
示例10: create_rows_for_rdd
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def create_rows_for_rdd(x):
    """Turn a (meta_data, values) pair into a Row(label, features, meta_data).

    :param x: tuple of (meta_data sequence, feature sequence whose last
        element is the label)
    :return: pyspark Row with float label and dense vectors
    """
    values = list(x[1])
    label = float(values.pop())  # pop() removes the trailing label element
    return Row(label=label,
               features=Vectors.dense(values),
               meta_data=Vectors.dense(x[0]))
示例11: test_copy
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_copy(self):
    """copy() with extra params overrides them on the copy, not the original."""
    frame = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    base = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=base)
    copied = ovr.copy({base.maxIter: 10})
    # Original keeps its setting; copy picks up the override.
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(copied.getClassifier().getMaxIter(), 10)
    fitted = ovr.fit(frame)
    fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
    self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
示例12: add_time_dependent_effects
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def add_time_dependent_effects(self, ts):
    """Apply this model to a series of i.i.d. observations.

    Parameters
    ----------
    ts : Numpy array
        Time series of i.i.d. observations.

    Returns the series with time-dependent effects added, as a Numpy array.
    """
    # Zero-filled destination vector the JVM side writes into.
    dest = Vectors.dense([0] * len(ts))
    java_result = self._jmodel.addTimeDependentEffects(
        _py2java(self._ctx, Vectors.dense(ts)),
        _py2java(self._ctx, dest))
    return _java2py(self._ctx, java_result.toArray())
示例13: to_vector
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def to_vector(np_array):
    """Convert a one-dimensional numpy array to an MLlib dense Vector.

    Raises an Exception for arrays of any other rank.
    """
    if np_array.ndim == 1:
        return Vectors.dense(np_array)
    raise Exception("""An MLLib Vector can only be created
from a one-dimensional numpy array""")
示例14: test_persistence
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def test_persistence(self):
    """Save/load round-trips for LDA, DistributedLDAModel and LocalLDAModel."""
    sql_ctx = SQLContext(self.sc)
    frame = sql_ctx.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # The EM optimizer produces a DistributedLDAModel; also derive a local one.
    estimator = LDA(k=2, seed=1, optimizer="em")
    dist_model = estimator.fit(frame)
    self.assertTrue(dist_model.isDistributed())
    local_model = dist_model.toLocal()
    self.assertFalse(local_model.isDistributed())

    base = tempfile.mkdtemp()
    # Estimator round-trip.
    estimator.save(base + "/lda")
    self._compare(estimator, LDA.load(base + "/lda"))
    # Distributed model round-trip.
    dist_model.save(base + "/distLDAModel")
    self._compare(dist_model, DistributedLDAModel.load(base + "/distLDAModel"))
    # Local model round-trip.
    local_model.save(base + "/localLDAModel")
    self._compare(local_model, LocalLDAModel.load(base + "/localLDAModel"))
    # Best-effort cleanup of the temp directory.
    try:
        rmtree(base)
    except OSError:
        pass
示例15: buildLabeledPoint
# 需要导入模块: from pyspark.mllib.linalg import Vectors [as 别名]
# 或者: from pyspark.mllib.linalg.Vectors import dense [as 别名]
def buildLabeledPoint(s, classification):
    """Build a LabeledPoint from an object's paired attributes.

    Features are the values of every '<attr>_1' attribute of ``s`` (in
    ``attributes`` order) followed by every '<attr>_2' attribute.
    """
    feature_values = [getattr(s, attr + suffix)
                      for suffix in ('_1', '_2')
                      for attr in attributes]
    return LabeledPoint(classification, Vectors.dense(feature_values))