This article collects typical usage examples of the Python method pyspark.ml.linalg.SparseVector. If you are unsure how linalg.SparseVector is used in practice, the curated examples below may help; you can also explore the module it belongs to, pyspark.ml.linalg, in more depth.
The following presents 15 code examples of linalg.SparseVector, ordered by popularity by default.
Example 1: sparse
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def sparse(size, *args):
"""
Create a sparse vector, using either a dictionary, a list of
(index, value) pairs, or two separate arrays of indices and
values (sorted by index).
:param size: Size of the vector.
:param args: Non-zero entries, as a dictionary, list of tuples,
or two sorted lists containing indices and values.
>>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
SparseVector(4, {1: 1.0, 3: 5.5})
>>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
SparseVector(4, {1: 1.0, 3: 5.5})
>>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
SparseVector(4, {1: 1.0, 3: 5.5})
"""
return SparseVector(size, *args)
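As a quick usage sketch (assuming a local Python session with pyspark installed), all three input forms accepted by Vectors.sparse build the same vector:

from pyspark.ml.linalg import Vectors

v1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})        # dictionary of index -> value
v2 = Vectors.sparse(4, [(1, 1.0), (3, 5.5)])    # list of (index, value) pairs
v3 = Vectors.sparse(4, [1, 3], [1.0, 5.5])      # parallel index/value lists
assert v1 == v2 == v3                           # all three are the same SparseVector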
Example 2: fromML
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def fromML(vec):
"""
Convert a vector from the new mllib-local representation.
This does NOT copy the data; it copies references.
:param vec: a :py:class:`pyspark.ml.linalg.Vector`
:return: a :py:class:`pyspark.mllib.linalg.Vector`
.. versionadded:: 2.0.0
"""
if isinstance(vec, newlinalg.DenseVector):
return DenseVector(vec.array)
elif isinstance(vec, newlinalg.SparseVector):
return SparseVector(vec.size, vec.indices, vec.values)
else:
raise TypeError("Unsupported vector type %s" % type(vec))
Example 3: transform
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def transform(self, X_rdd, y_rdd=None):
    '''
    given an X RDD (and optionally a y RDD), output a dataframe with term-frequency feature vectors and labels
    '''
    # check input types
    if not isinstance(X_rdd, RDD):
        raise TypeError("Arguments must be pySpark RDDs")
    if y_rdd is not None and not isinstance(y_rdd, RDD):
        raise TypeError("Arguments must be pySpark RDDs")
    # get term frequencies
    X = X_rdd.map(self._term_frequency).cache()
    # convert to sparse vectors (Python 3 lambdas cannot unpack tuples, so index explicitly)
    X = X.map(lambda hf: (hf[0], SparseVector(self.num_features, np.nonzero(hf[1])[0], hf[1][hf[1] > 0])))
    # check if labels exist
    if y_rdd is not None:
        # combine X and y into a single dataframe, keyed by position
        X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
        y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
        # joined rows look like (idx, ((hash, features), label))
        data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True),
                             StructField('label', StringType(), True)])
        data = data.toDF(schema)
        data = data.withColumn('label', data.label.cast(DoubleType()))
    else:
        schema = StructType([StructField('hash', StringType(), True),
                             StructField('features', VectorUDT(), True)])
        data = X.toDF(schema)
    return data
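A hypothetical call site might look like the sketch below; the transformer variable, its num_features attribute, and the sample data are assumptions, since the excerpt does not show the enclosing class:

# sc is an active SparkContext; transformer is an instance of the enclosing
# class, with num_features, dictionary, and _term_frequency defined.
X_rdd = sc.parallelize([('doc1', ['spark', 'vector']), ('doc2', ['spark'])])
y_rdd = sc.parallelize(['1.0', '0.0'])
df = transformer.transform(X_rdd, y_rdd)  # columns: hash, features, label
df.show()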
Example 4: _term_frequency
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def _term_frequency(self, row):
    '''
    convert a row of word tokens into a sparse vector of term frequencies
    '''
    sparse_dic = {}
    for word in row[1]:
        if word in self.dictionary:
            if self.dictionary[word] in sparse_dic:
                sparse_dic[self.dictionary[word]] += 1.
            else:
                sparse_dic[self.dictionary[word]] = 1.
    tf = SparseVector(len(self.dictionary), sparse_dic)
    return (row[0], tf)
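The counting logic is easy to sketch standalone; the vocabulary below is an assumed word-to-index map, not part of the excerpt:

from pyspark.ml.linalg import SparseVector

dictionary = {'spark': 0, 'vector': 1, 'sparse': 2}  # assumed vocabulary
tokens = ['spark', 'sparse', 'spark']
counts = {}
for word in tokens:
    if word in dictionary:
        counts[dictionary[word]] = counts.get(dictionary[word], 0.) + 1.
print(SparseVector(len(dictionary), counts))  # SparseVector(3, {0: 2.0, 2: 1.0})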
Example 5: _convert_to_vector
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def _convert_to_vector(l):
if isinstance(l, Vector):
return l
    elif type(l) in (array.array, np.ndarray, list, tuple, range):
return DenseVector(l)
elif _have_scipy and scipy.sparse.issparse(l):
assert l.shape[1] == 1, "Expected column vector"
# Make sure the converted csc_matrix has sorted indices.
csc = l.tocsc()
if not csc.has_sorted_indices:
csc.sort_indices()
return SparseVector(l.shape[0], csc.indices, csc.data)
else:
raise TypeError("Cannot convert type %s into Vector" % type(l))
Example 6: serialize
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def serialize(self, obj):
if isinstance(obj, SparseVector):
indices = [int(i) for i in obj.indices]
values = [float(v) for v in obj.values]
return (0, obj.size, indices, values)
elif isinstance(obj, DenseVector):
values = [float(v) for v in obj]
return (1, None, None, values)
else:
raise TypeError("cannot serialize %r of type %r" % (obj, type(obj)))
Example 7: deserialize
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def deserialize(self, datum):
assert len(datum) == 4, \
"VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
tpe = datum[0]
if tpe == 0:
return SparseVector(datum[1], datum[2], datum[3])
elif tpe == 1:
return DenseVector(datum[3])
else:
raise ValueError("do not recognize type %r" % tpe)
Example 8: squared_distance
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def squared_distance(self, other):
"""
Squared distance of two Vectors.
>>> dense1 = DenseVector(array.array('d', [1., 2.]))
>>> dense1.squared_distance(dense1)
0.0
>>> dense2 = np.array([2., 1.])
>>> dense1.squared_distance(dense2)
2.0
>>> dense3 = [2., 1.]
>>> dense1.squared_distance(dense3)
2.0
>>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
>>> dense1.squared_distance(sparse1)
2.0
>>> dense1.squared_distance([1.,])
Traceback (most recent call last):
...
AssertionError: dimension mismatch
>>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
Traceback (most recent call last):
...
AssertionError: dimension mismatch
"""
assert len(self) == _vector_size(other), "dimension mismatch"
if isinstance(other, SparseVector):
return other.squared_distance(self)
elif _have_scipy and scipy.sparse.issparse(other):
return _convert_to_vector(other).squared_distance(self)
if isinstance(other, Vector):
other = other.toArray()
elif not isinstance(other, np.ndarray):
other = np.array(other)
diff = self.toArray() - other
return np.dot(diff, diff)
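The dense fallback at the end is plain NumPy arithmetic; for illustration, a quick cross-check of the sparse dispatch against a manual computation:

import numpy as np
from pyspark.ml.linalg import DenseVector, SparseVector

d = DenseVector([1.0, 2.0])
s = SparseVector(2, [0], [3.0])          # dense form: [3.0, 0.0]
diff = d.toArray() - s.toArray()
assert np.isclose(d.squared_distance(s), np.dot(diff, diff))  # both 8.0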
Example 9: norm
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def norm(self, p):
"""
Calculates the norm of a SparseVector.
>>> a = SparseVector(4, [0, 1], [3., -4.])
>>> a.norm(1)
7.0
>>> a.norm(2)
5.0
"""
return np.linalg.norm(self.values, p)
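Since absent entries are zero, computing the norm over the stored values alone is correct for any p >= 1; for example:

from pyspark.ml.linalg import SparseVector

a = SparseVector(4, [0, 1], [3., -4.])
print(a.norm(1))             # 7.0 -- the zero entries contribute nothing
print(a.norm(float('inf')))  # 4.0 -- the largest absolute stored value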
Example 10: __reduce__
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def __reduce__(self):
    # tobytes() replaces the deprecated tostring(); the constructor accepts raw bytes
    return (
        SparseVector,
        (self.size, self.indices.tobytes(), self.values.tobytes()))
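This is what makes the vector picklable; a minimal round-trip check:

import pickle
from pyspark.ml.linalg import SparseVector

v = SparseVector(4, [1, 3], [1.0, 5.5])
assert pickle.loads(pickle.dumps(v)) == v  # rebuilt from size plus raw byte buffers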
Example 11: asML
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def asML(self):
"""
Convert this vector to the new mllib-local representation.
This does NOT copy the data; it copies references.
:return: :py:class:`pyspark.ml.linalg.SparseVector`
.. versionadded:: 2.0.0
"""
return newlinalg.SparseVector(self.size, self.indices, self.values)
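This is the inverse of Example 2; a short sketch starting from the old representation:

from pyspark.mllib.linalg import Vectors as MLlibVectors

old_vec = MLlibVectors.sparse(4, [1, 3], [1.0, 5.5])  # pyspark.mllib vector
new_vec = old_vec.asML()                              # pyspark.ml.linalg.SparseVector
print(type(new_vec))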
Example 12: __repr__
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def __repr__(self):
inds = self.indices
vals = self.values
entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
for i in xrange(len(inds))])
return "SparseVector({0}, {{{1}}})".format(self.size, entries)
Example 13: __eq__
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def __eq__(self, other):
if isinstance(other, SparseVector):
return other.size == self.size and np.array_equal(other.indices, self.indices) \
and np.array_equal(other.values, self.values)
elif isinstance(other, DenseVector):
if self.size != len(other):
return False
        return Vectors._equals(self.indices, self.values, list(range(len(other))), other.array)
return False
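Equality is structural, and it also holds across sparse/dense representations with the same contents; for example:

from pyspark.ml.linalg import DenseVector, SparseVector

s = SparseVector(3, [0, 2], [1.0, 3.0])
assert s == SparseVector(3, [0, 2], [1.0, 3.0])  # same size, indices, values
assert s == DenseVector([1.0, 0.0, 3.0])         # content-wise sparse/dense match
assert s != SparseVector(4, [0, 2], [1.0, 3.0])  # size mismatch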
Example 14: parse
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def parse(s):
"""Parse a string representation back into the Vector.
>>> Vectors.parse('[2,1,2 ]')
DenseVector([2.0, 1.0, 2.0])
>>> Vectors.parse(' ( 100, [0], [2])')
SparseVector(100, {0: 2.0})
"""
if s.find('(') == -1 and s.find('[') != -1:
return DenseVector.parse(s)
elif s.find('(') != -1:
return SparseVector.parse(s)
else:
raise ValueError(
"Cannot find tokens '[' or '(' from the input string.")
Example 15: test_udt
# Required module import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def test_udt(self):
sparse_values = {0: 0.1, 1: 1.1}
sparse_vector = SparseVector(len(sparse_values), sparse_values)
pdf = pd.DataFrame({"a": [sparse_vector], "b": [10]})
if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
kdf = ks.from_pandas(pdf)
self.assert_eq(kdf, pdf)
else:
kdf = ks.from_pandas(pdf)
self.assert_eq(kdf, pdf)