This article collects typical usage examples of the LabeledPoint method from the Python module pyspark.mllib.regression: what regression.LabeledPoint does, how to call it, and how it is used in practice. The curated examples below should help; you can also explore the enclosing module, pyspark.mllib.regression, for more detail.
The following 15 code examples of regression.LabeledPoint are shown, sorted by popularity by default.
Example 1: to_labeled_point
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def to_labeled_point(sc, features, labels, categorical=False):
    """Convert numpy arrays of features and labels into
    a LabeledPoint RDD for MLlib and ML integration.

    :param sc: Spark context
    :param features: numpy array with features
    :param labels: numpy array with labels
    :param categorical: boolean, whether labels are already one-hot encoded or not
    :return: LabeledPoint RDD with features and labels
    """
    labeled_points = []
    for x, y in zip(features, labels):
        if categorical:
            lp = LabeledPoint(np.argmax(y), to_vector(x))
        else:
            lp = LabeledPoint(y, to_vector(x))
        labeled_points.append(lp)
    return sc.parallelize(labeled_points)
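A minimal usage sketch for Example 1 (assuming an active SparkContext named sc; the arrays are made up, and to_vector is the surrounding module's numpy-to-MLlib-vector helper):
import numpy as np
features = np.random.rand(4, 3)           # 4 samples, 3 features each
labels = np.array([0.0, 1.0, 1.0, 0.0])   # plain binary labels, not one-hot
lp_rdd = to_labeled_point(sc, features, labels, categorical=False)
print(lp_rdd.first())                     # LabeledPoint(0.0, [...])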
Example 2: from_labeled_point
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def from_labeled_point(rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD back to a pair of numpy arrays

    :param rdd: LabeledPoint RDD
    :param categorical: boolean, whether labels should be one-hot encoded when returned
    :param nb_classes: optional int, indicating the number of class labels
    :return: pair of numpy arrays, features and labels
    """
    features = np.asarray(
        rdd.map(lambda lp: from_vector(lp.features)).collect())
    labels = np.asarray(rdd.map(lambda lp: lp.label).collect(), dtype='int32')
    if categorical:
        if not nb_classes:
            nb_classes = np.max(labels) + 1
        temp = np.zeros((len(labels), nb_classes))
        for i, label in enumerate(labels):
            temp[i, label] = 1.
        labels = temp
    return features, labels
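Round-tripping the RDD from Example 1 back to numpy (a sketch; with categorical=True the labels come back one-hot, and nb_classes is inferred from the data when omitted):
X, y = from_labeled_point(lp_rdd, categorical=True, nb_classes=2)
print(X.shape)  # (4, 3)
print(y.shape)  # (4, 2), one-hot rows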
Example 3: lp_to_simple_rdd
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def lp_to_simple_rdd(lp_rdd, categorical=False, nb_classes=None):
    """Convert a LabeledPoint RDD into an RDD of feature-label pairs

    :param lp_rdd: LabeledPoint RDD of features and labels
    :param categorical: boolean, whether labels should be one-hot encoded when returned
    :param nb_classes: int, number of total classes
    :return: Spark RDD with feature-label pairs
    """
    if categorical:
        if not nb_classes:
            labels = np.asarray(lp_rdd.map(
                lambda lp: lp.label).collect(), dtype='int32')
            nb_classes = np.max(labels) + 1
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features),
                                     encode_label(lp.label, nb_classes)))
    else:
        rdd = lp_rdd.map(lambda lp: (from_vector(lp.features), lp.label))
    return rdd
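This function leans on an encode_label helper from the same module; the sketch below shows what that helper plausibly looks like, plus a call (the helper body is an assumption, not the verbatim source):
def encode_label(label, nb_classes):
    # One-hot encode a scalar class label.
    one_hot = np.zeros(nb_classes)
    one_hot[int(label)] = 1.
    return one_hot

pair_rdd = lp_to_simple_rdd(lp_rdd, categorical=True, nb_classes=2)
x, y = pair_rdd.first()  # numpy feature vector, one-hot numpy label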
Example 4: parse_point
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
import re
from pyspark.mllib.linalg import SparseVector

def parse_point(line):
    # Match "(label, SparseVector(...))" and the "size, {idx: score, ...}" payload.
    ptn1 = r"\(([\d\.]*),\sSparseVector\((.*?)\)\)"
    ptn2 = r"(\d+),\s+\{(.*?)\}"
    m = re.search(ptn1, line)
    if m:
        label = float(m.group(1))
        features_str = m.group(2)
        mx = re.search(ptn2, features_str)
        num = int(mx.group(1))  # vector size
        fs = mx.group(2)
        idx_set = []
        tfidf_scores = []
        if fs != '':
            fs_split = fs.split(', ')
            for f in fs_split:
                idx_set.append(int(f.split(': ')[0]))
                tfidf_scores.append(float(f.split(': ')[1]))
        sp = SparseVector(num, idx_set, tfidf_scores)
        return LabeledPoint(label, sp)
    return None

# Find the best step_size through cross-validation, using RMSE as the error measurement
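A usage sketch for parse_point with a hand-written line in the format it expects (the string is hypothetical):
line = "(1.0, SparseVector(5, {0: 0.5, 3: 1.2}))"
lp = parse_point(line)
print(lp)  # LabeledPoint(1.0, (5,[0,3],[0.5,1.2]))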
Example 5: test_chi_sq_pearson
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def test_chi_sq_pearson(self):
    data = [
        LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
        LabeledPoint(0.0, Vectors.dense([1.5, 20.0])),
        LabeledPoint(1.0, Vectors.dense([1.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 30.0])),
        LabeledPoint(0.0, Vectors.dense([3.5, 40.0])),
        LabeledPoint(1.0, Vectors.dense([3.5, 40.0]))
    ]
    for numParts in [2, 4, 6, 8]:
        chi = Statistics.chiSqTest(self.sc.parallelize(data, numParts))
        feature1 = chi[0]
        self.assertEqual(feature1.statistic, 0.75)
        self.assertEqual(feature1.degreesOfFreedom, 2)
        self.assertAlmostEqual(feature1.pValue, 0.6873, 4)
        feature2 = chi[1]
        self.assertEqual(feature2.statistic, 1.5)
        self.assertEqual(feature2.degreesOfFreedom, 3)
        self.assertAlmostEqual(feature2.pValue, 0.6823, 4)
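The same check can be run outside a test harness; a sketch (assuming an active SparkContext sc) showing that Statistics.chiSqTest on an RDD of LabeledPoint returns one ChiSqTestResult per feature:
from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

obs = sc.parallelize([LabeledPoint(0.0, Vectors.dense([0.5, 10.0])),
                      LabeledPoint(1.0, Vectors.dense([1.5, 30.0]))])
results = Statistics.chiSqTest(obs)
print(results[0].pValue)  # independence test of feature 0 vs. the label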
Example 6: saveAsLibSVMFile
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def saveAsLibSVMFile(data, dir):
    """
    Save labeled data in LIBSVM format.

    :param data: an RDD of LabeledPoint to be saved
    :param dir: directory to save the data

    >>> from tempfile import NamedTemporaryFile
    >>> from fileinput import input
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> from glob import glob
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])),
    ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
    >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
    '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
    """
    lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
    lines.saveAsTextFile(dir)
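A round-trip sketch pairing this saver with the stock loader (the output path is hypothetical; sc is an active SparkContext):
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

data = sc.parallelize([LabeledPoint(1.0, Vectors.dense([0.1, 0.2]))])
MLUtils.saveAsLibSVMFile(data, "/tmp/libsvm_demo")
loaded = MLUtils.loadLibSVMFile(sc, "/tmp/libsvm_demo")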
Example 7: loadLabeledPoints
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def loadLabeledPoints(sc, path, minPartitions=None):
    """
    Load labeled points saved using RDD.saveAsTextFile.

    :param sc: Spark context
    :param path: file or directory path in any Hadoop-supported file
                 system URI
    :param minPartitions: min number of partitions
    :return: labeled data stored as an RDD of LabeledPoint

    >>> from tempfile import NamedTemporaryFile
    >>> from pyspark.mllib.util import MLUtils
    >>> from pyspark.mllib.regression import LabeledPoint
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, -1.23), (2, 4.56e-7)])),
    ...             LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> sc.parallelize(examples, 1).saveAsTextFile(tempFile.name)
    >>> MLUtils.loadLabeledPoints(sc, tempFile.name).collect()
    [LabeledPoint(1.1, (3,[0,2],[-1.23,4.56e-07])), LabeledPoint(0.0, [1.01,2.02,3.03])]
    """
    minPartitions = minPartitions or min(sc.defaultParallelism, 2)
    return callMLlibFunc("loadLabeledPoints", sc, path, minPartitions)
Example 8: saveAsLibSVMFile
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def saveAsLibSVMFile(data, dir):
    """
    Save labeled data in LIBSVM format.

    @param data: an RDD of LabeledPoint to be saved
    @param dir: directory to save the data

    >>> from tempfile import NamedTemporaryFile
    >>> from fileinput import input
    >>> from glob import glob
    >>> from pyspark.mllib.util import MLUtils
    >>> examples = [LabeledPoint(1.1, Vectors.sparse(3, [(0, 1.23), (2, 4.56)])), \
                    LabeledPoint(0.0, Vectors.dense([1.01, 2.02, 3.03]))]
    >>> tempFile = NamedTemporaryFile(delete=True)
    >>> tempFile.close()
    >>> MLUtils.saveAsLibSVMFile(sc.parallelize(examples), tempFile.name)
    >>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
    '0.0 1:1.01 2:2.02 3:3.03\\n1.1 1:1.23 3:4.56\\n'
    """
    lines = data.map(lambda p: MLUtils._convert_labeled_point_to_libsvm(p))
    lines.saveAsTextFile(dir)
Example 9: parsePoint
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
import csv
from io import StringIO

def parsePoint(line):
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]  # Cast all fields to float
    return LabeledPoint(values[-1], values[:-1])  # y = quality, X = row[:-1]
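Wiring the parser into an RDD pipeline (a sketch; the file name is hypothetical, and the header row of a semicolon-delimited CSV such as the wine-quality dataset is skipped):
raw = sc.textFile("winequality-red.csv")
header = raw.first()
points = raw.filter(lambda row: row != header).map(parsePoint)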
Example 10: from_data_frame
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def from_data_frame(df, categorical=False, nb_classes=None):
    """Convert DataFrame back to pair of numpy arrays
    """
    lp_rdd = df.rdd.map(lambda row: LabeledPoint(row.label, row.features))
    features, labels = from_labeled_point(lp_rdd, categorical, nb_classes)
    return features, labels
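A usage sketch with a hand-built DataFrame (assuming an active SparkSession named spark; this helper expects the columns to be named label and features):
from pyspark.mllib.linalg import Vectors
df = spark.createDataFrame(
    [(1.0, Vectors.dense([0.1, 0.2])), (0.0, Vectors.dense([0.3, 0.4]))],
    ["label", "features"])
X, y = from_data_frame(df, categorical=True, nb_classes=2)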
Example 11: df_to_simple_rdd
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def df_to_simple_rdd(df, categorical=False, nb_classes=None, features_col='features', label_col='label'):
    """Convert DataFrame into RDD of pairs
    """
    sql_context = df.sql_ctx
    sql_context.registerDataFrameAsTable(df, "temp_table")
    selected_df = sql_context.sql(
        "SELECT {0} AS features, {1} as label from temp_table".format(features_col, label_col))
    if isinstance(selected_df.first().features, MLLibVector):
        lp_rdd = selected_df.rdd.map(
            lambda row: LabeledPoint(row.label, row.features))
    else:
        lp_rdd = selected_df.rdd.map(lambda row: LabeledPoint(
            row.label, MLLibVectors.fromML(row.features)))
    rdd = lp_to_simple_rdd(lp_rdd, categorical, nb_classes)
    return rdd
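Unlike Example 10, this helper also accepts the newer pyspark.ml vector type, converting it with Vectors.fromML; a sketch (assuming an active SparkSession named spark, and a pyspark version where registerDataFrameAsTable is still available):
from pyspark.ml.linalg import Vectors as MLVectors
df = spark.createDataFrame(
    [(0.0, MLVectors.dense([1.0, 2.0])), (1.0, MLVectors.dense([3.0, 4.0]))],
    ["label", "features"])
rdd = df_to_simple_rdd(df, categorical=True, nb_classes=2)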
Example 12: get_lp
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def get_lp(t):
    rating = t[1][0]
    avg_features = t[1][1]
    return LabeledPoint(rating, avg_features)
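The tuple shape here matches the output of an RDD join, i.e. (key, (rating, avg_features)); a sketch with made-up data (assuming an active SparkContext sc):
ratings = sc.parallelize([("item1", 4.0)])
features = sc.parallelize([("item1", [0.1, 0.2])])
lp_rdd = ratings.join(features).map(get_lp)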
Example 13: get_lp
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def get_lp(t):
    rating = t[0]
    sp = t[1]
    return LabeledPoint(rating, sp)
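This variant expects plain (rating, sparse_vector) pairs rather than join output; a sketch with made-up data:
from pyspark.mllib.linalg import SparseVector
pairs = sc.parallelize([(4.0, SparseVector(3, [0], [1.0]))])
lp_rdd = pairs.map(get_lp)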
Example 14: test_infer_schema
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def test_infer_schema(self):
    rdd = self.sc.parallelize([LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)])
    df = rdd.toDF()
    schema = df.schema
    field = [f for f in schema.fields if f.name == "features"][0]
    self.assertEqual(field.dataType, self.udt)
    vectors = df.rdd.map(lambda p: p.features).collect()
    self.assertEqual(len(vectors), 2)
    for v in vectors:
        if isinstance(v, SparseVector):
            self.assertEqual(v, self.sv1)
        elif isinstance(v, DenseVector):
            self.assertEqual(v, self.dv1)
        else:
            raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
Example 15: test_classification
# Required import: from pyspark.mllib import regression [as alias]
# Or: from pyspark.mllib.regression import LabeledPoint [as alias]
def test_classification(self):
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LogisticRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    svm_model = SVMWithSGD.train(rdd)
    self.assertTrue(svm_model.predict(features[0]) <= 0)
    self.assertTrue(svm_model.predict(features[1]) > 0)
    self.assertTrue(svm_model.predict(features[2]) <= 0)
    self.assertTrue(svm_model.predict(features[3]) > 0)

    nb_model = NaiveBayes.train(rdd)
    self.assertTrue(nb_model.predict(features[0]) <= 0)
    self.assertTrue(nb_model.predict(features[1]) > 0)
    self.assertTrue(nb_model.predict(features[2]) <= 0)
    self.assertTrue(nb_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                            categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)