Python linalg.SparseVector Code Examples

This article collects typical usage examples of Python's pyspark.ml.linalg.SparseVector. If you are wondering what linalg.SparseVector does, how to call it, or what working code looks like, the curated examples below should help. You can also explore further usage examples from its parent module, pyspark.ml.linalg.


The sections below present 15 code examples of linalg.SparseVector, sorted by popularity by default.

Example 1: sparse

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def sparse(size, *args):
        """
        Create a sparse vector, using either a dictionary, a list of
        (index, value) pairs, or two separate arrays of indices and
        values (sorted by index).

        :param size: Size of the vector.
        :param args: Non-zero entries, as a dictionary, list of tuples,
                     or two sorted lists containing indices and values.

        >>> Vectors.sparse(4, {1: 1.0, 3: 5.5})
        SparseVector(4, {1: 1.0, 3: 5.5})
        >>> Vectors.sparse(4, [(1, 1.0), (3, 5.5)])
        SparseVector(4, {1: 1.0, 3: 5.5})
        >>> Vectors.sparse(4, [1, 3], [1.0, 5.5])
        SparseVector(4, {1: 1.0, 3: 5.5})
        """
        return SparseVector(size, *args) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 20, Source: __init__.py
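
For orientation, a minimal sketch of the three equivalent argument forms (assuming PySpark >= 2.0; the sizes and values are illustrative):

from pyspark.ml.linalg import Vectors

# All three calls build the same 4-dimensional vector [0.0, 1.0, 0.0, 5.5].
v1 = Vectors.sparse(4, {1: 1.0, 3: 5.5})        # dict of index -> value
v2 = Vectors.sparse(4, [(1, 1.0), (3, 5.5)])    # list of (index, value) pairs
v3 = Vectors.sparse(4, [1, 3], [1.0, 5.5])      # parallel index/value lists
assert v1 == v2 == v3
print(v1.toArray())                             # dense copy: [0.  1.  0.  5.5]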

Example 2: fromML

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def fromML(vec):
        """
        Convert a vector from the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :param vec: a :py:class:`pyspark.ml.linalg.Vector`
        :return: a :py:class:`pyspark.mllib.linalg.Vector`

        .. versionadded:: 2.0.0
        """
        if isinstance(vec, newlinalg.DenseVector):
            return DenseVector(vec.array)
        elif isinstance(vec, newlinalg.SparseVector):
            return SparseVector(vec.size, vec.indices, vec.values)
        else:
            raise TypeError("Unsupported vector type %s" % type(vec)) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 18, Source: __init__.py
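
A minimal round-trip sketch of this converter (Vectors.fromML on the mllib side is the public entry point; PySpark >= 2.0):

from pyspark.ml import linalg as newlinalg
from pyspark.mllib.linalg import Vectors as MLlibVectors

ml_vec = newlinalg.Vectors.sparse(3, [0, 2], [1.0, 3.0])
mllib_vec = MLlibVectors.fromML(ml_vec)   # pyspark.mllib.linalg.SparseVector
print(type(mllib_vec).__module__)         # pyspark.mllib.linalg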

Example 3: transform

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def transform(self, X_rdd, y_rdd=None):
        '''
        given X RDD (and optionally y RDD), output a dataframe with a term-frequency feature vector and labels
        '''
        # check input type
        if not isinstance(X_rdd, RDD):
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and not isinstance(y_rdd, RDD):
            raise TypeError("Arguments must be pySpark RDDs")

        # get term frequencies
        X = X_rdd.map(self._term_frequency).cache()

        # convert each (hash, dense counts) pair to (hash, SparseVector);
        # Python 3 lambdas cannot unpack tuples, so index into the pair instead
        X = X.map(lambda r: (r[0], SparseVector(self.num_features, np.nonzero(r[1])[0], r[1][r[1] > 0])))

        # check if labels exist
        if y_rdd:
            # key both RDDs by position, join, then flatten to (hash, features, label)
            X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
            data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))
            schema = StructType([StructField('hash', StringType(), True),
                                 StructField('features', VectorUDT(), True),
                                 StructField('label', StringType(), True)])
            data = data.toDF(schema)
            data = data.withColumn('label', data.label.cast(DoubleType()))
        else:
            schema = StructType([StructField('hash', StringType(), True),
                                 StructField('features', VectorUDT(), True)])
            data = X.toDF(schema)

        return data
Author: iamshang1, Project: Projects, Lines: 33, Source: preprocessing_bytes.py
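
The key step above is the dense-to-sparse conversion. A standalone sketch of just that line (num_features and the counts are made up for illustration):

import numpy as np
from pyspark.ml.linalg import SparseVector

num_features = 6
counts = np.array([0., 2., 0., 0., 5., 1.])   # dense term-frequency counts
sv = SparseVector(num_features, np.nonzero(counts)[0], counts[counts > 0])
print(sv)                                     # SparseVector(6, {1: 2.0, 4: 5.0, 5: 1.0})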

Example 4: _term_frequency

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def _term_frequency(self,row):
        '''
        convert row of word tokens into sparse vector of terms frequencies
        '''
        sparse_dic = {}
        df_dic = {}
        for word in row[1]:
            if word in self.dictionary:
                if self.dictionary[word] in sparse_dic:
                    sparse_dic[self.dictionary[word]] += 1.
                else:
                    sparse_dic[self.dictionary[word]] = 1.         
        tf = SparseVector(len(self.dictionary),sparse_dic)
        return (row[0],tf) 
Author: iamshang1, Project: Projects, Lines: 16, Source: preprocessing_asm.py
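
A self-contained sketch of the same counting logic outside Spark (the dictionary and token row are hypothetical):

from pyspark.ml.linalg import SparseVector

dictionary = {"push": 0, "mov": 1, "call": 2}        # word -> feature index
row = ("doc-hash-1", ["mov", "push", "mov", "jmp"])  # (hash, tokens)

sparse_dic = {}
for word in row[1]:
    if word in dictionary:
        idx = dictionary[word]
        sparse_dic[idx] = sparse_dic.get(idx, 0.) + 1.
tf = SparseVector(len(dictionary), sparse_dic)
print(tf)                                            # SparseVector(3, {0: 1.0, 1: 2.0})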

Example 5: _convert_to_vector

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def _convert_to_vector(l):
    if isinstance(l, Vector):
        return l
    elif type(l) in (array.array, np.array, np.ndarray, list, tuple, xrange):
        return DenseVector(l)
    elif _have_scipy and scipy.sparse.issparse(l):
        assert l.shape[1] == 1, "Expected column vector"
        # Make sure the converted csc_matrix has sorted indices.
        csc = l.tocsc()
        if not csc.has_sorted_indices:
            csc.sort_indices()
        return SparseVector(l.shape[0], csc.indices, csc.data)
    else:
        raise TypeError("Cannot convert type %s into Vector" % type(l)) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 16, Source: __init__.py
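
The scipy branch is the interesting one: a csc_matrix column's indices and data map directly onto a SparseVector. A sketch of doing that conversion by hand (requires scipy):

import numpy as np
import scipy.sparse
from pyspark.ml.linalg import SparseVector

col = scipy.sparse.csc_matrix(np.array([[0.], [3.0], [0.], [4.5]]))  # shape (4, 1)
sv = SparseVector(col.shape[0], col.indices, col.data)
print(sv)                                                            # SparseVector(4, {1: 3.0, 3: 4.5})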

Example 6: serialize

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def serialize(self, obj):
        if isinstance(obj, SparseVector):
            indices = [int(i) for i in obj.indices]
            values = [float(v) for v in obj.values]
            return (0, obj.size, indices, values)
        elif isinstance(obj, DenseVector):
            values = [float(v) for v in obj]
            return (1, None, None, values)
        else:
            raise TypeError("cannot serialize %r of type %r" % (obj, type(obj))) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 12, Source: __init__.py

Example 7: deserialize

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def deserialize(self, datum):
        assert len(datum) == 4, \
            "VectorUDT.deserialize given row with length %d but requires 4" % len(datum)
        tpe = datum[0]
        if tpe == 0:
            return SparseVector(datum[1], datum[2], datum[3])
        elif tpe == 1:
            return DenseVector(datum[3])
        else:
            raise ValueError("do not recognize type %r" % tpe) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 12, Source: __init__.py
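
Examples 6 and 7 are the two halves of VectorUDT. A hedged round-trip sketch (VectorUDT is an internal SQL type, so this is illustrative rather than application code):

from pyspark.ml.linalg import SparseVector, VectorUDT

udt = VectorUDT()
sv = SparseVector(4, [1, 3], [1.0, 5.5])
datum = udt.serialize(sv)           # (0, 4, [1, 3], [1.0, 5.5])
assert udt.deserialize(datum) == sv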

Example 8: squared_distance

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def squared_distance(self, other):
        """
        Squared distance of two Vectors.

        >>> dense1 = DenseVector(array.array('d', [1., 2.]))
        >>> dense1.squared_distance(dense1)
        0.0
        >>> dense2 = np.array([2., 1.])
        >>> dense1.squared_distance(dense2)
        2.0
        >>> dense3 = [2., 1.]
        >>> dense1.squared_distance(dense3)
        2.0
        >>> sparse1 = SparseVector(2, [0, 1], [2., 1.])
        >>> dense1.squared_distance(sparse1)
        2.0
        >>> dense1.squared_distance([1.,])
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        >>> dense1.squared_distance(SparseVector(1, [0,], [1.,]))
        Traceback (most recent call last):
            ...
        AssertionError: dimension mismatch
        """
        assert len(self) == _vector_size(other), "dimension mismatch"
        if isinstance(other, SparseVector):
            return other.squared_distance(self)
        elif _have_scipy and scipy.sparse.issparse(other):
            return _convert_to_vector(other).squared_distance(self)

        if isinstance(other, Vector):
            other = other.toArray()
        elif not isinstance(other, np.ndarray):
            other = np.array(other)
        diff = self.toArray() - other
        return np.dot(diff, diff) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 39, Source: __init__.py
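
A compact sanity check mixing dense and sparse operands (values chosen to match the doctests above):

from pyspark.ml.linalg import DenseVector, SparseVector

dense = DenseVector([1., 2.])
sparse = SparseVector(2, [0, 1], [2., 1.])
# (1-2)^2 + (2-1)^2 = 2.0, regardless of argument order
print(dense.squared_distance(sparse))   # 2.0
print(sparse.squared_distance(dense))   # 2.0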

Example 9: norm

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def norm(self, p):
        """
        Calculates the norm of a SparseVector.

        >>> a = SparseVector(4, [0, 1], [3., -4.])
        >>> a.norm(1)
        7.0
        >>> a.norm(2)
        5.0
        """
        return np.linalg.norm(self.values, p) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 13, Source: __init__.py

Example 10: __reduce__

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def __reduce__(self):
        return (
            SparseVector,
            (self.size, self.indices.tostring(), self.values.tostring())) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 6, Source: __init__.py
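
__reduce__ is what makes SparseVector picklable, which Spark relies on when shipping vectors between driver and executors. A quick sketch:

import pickle
from pyspark.ml.linalg import SparseVector

sv = SparseVector(4, [1, 3], [1.0, 5.5])
assert pickle.loads(pickle.dumps(sv)) == sv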

Example 11: asML

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def asML(self):
        """
        Convert this vector to the new mllib-local representation.
        This does NOT copy the data; it copies references.

        :return: :py:class:`pyspark.ml.linalg.SparseVector`

        .. versionadded:: 2.0.0
        """
        return newlinalg.SparseVector(self.size, self.indices, self.values) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 12, Source: __init__.py
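
asML is the inverse of Example 2's fromML. A round-trip sketch (PySpark >= 2.0):

from pyspark.mllib.linalg import SparseVector as MLlibSparseVector

old_vec = MLlibSparseVector(3, [0, 2], [1.0, 3.0])
new_vec = old_vec.asML()            # pyspark.ml.linalg.SparseVector
print(type(new_vec).__module__)     # pyspark.ml.linalg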

Example 12: __repr__

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def __repr__(self):
        inds = self.indices
        vals = self.values
        entries = ", ".join(["{0}: {1}".format(inds[i], _format_float(vals[i]))
                             for i in xrange(len(inds))])
        return "SparseVector({0}, {{{1}}})".format(self.size, entries) 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 8, Source: __init__.py

Example 13: __eq__

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def __eq__(self, other):
        if isinstance(other, SparseVector):
            return other.size == self.size and np.array_equal(other.indices, self.indices) \
                and np.array_equal(other.values, self.values)
        elif isinstance(other, DenseVector):
            if self.size != len(other):
                return False
            return Vectors._equals(self.indices, self.values, list(xrange(len(other))), other.array)
        return False 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 11, Source: __init__.py
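
Equality is value-based and works across sparse/dense representations; a short sketch:

from pyspark.ml.linalg import DenseVector, SparseVector

a = SparseVector(3, [0, 2], [1.0, 3.0])
b = DenseVector([1.0, 0.0, 3.0])
print(a == b)                             # True: same logical vector
print(a == SparseVector(3, [0], [1.0]))   # False: values differ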

Example 14: parse

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def parse(s):
        """Parse a string representation back into the Vector.

        >>> Vectors.parse('[2,1,2 ]')
        DenseVector([2.0, 1.0, 2.0])
        >>> Vectors.parse(' ( 100,  [0],  [2])')
        SparseVector(100, {0: 2.0})
        """
        if s.find('(') == -1 and s.find('[') != -1:
            return DenseVector.parse(s)
        elif s.find('(') != -1:
            return SparseVector.parse(s)
        else:
            raise ValueError(
                "Cannot find tokens '[' or '(' from the input string.") 
Author: runawayhorse001, Project: LearningApacheSpark, Lines: 17, Source: __init__.py
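
parse lives on the mllib-side Vectors (the snippet above comes from a copy of that module), so a usage sketch imports from pyspark.mllib.linalg:

from pyspark.mllib.linalg import Vectors

print(Vectors.parse('[2, 1, 2]'))        # DenseVector([2.0, 1.0, 2.0])
print(Vectors.parse('(100, [0], [2])'))  # SparseVector(100, {0: 2.0})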

Example 15: test_udt

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import SparseVector [as alias]
def test_udt(self):
        sparse_values = {0: 0.1, 1: 1.1}
        sparse_vector = SparseVector(len(sparse_values), sparse_values)
        pdf = pd.DataFrame({"a": [sparse_vector], "b": [10]})

        if LooseVersion(pyspark.__version__) < LooseVersion("2.4"):
            with self.sql_conf({"spark.sql.execution.arrow.enabled": False}):
                kdf = ks.from_pandas(pdf)
                self.assert_eq(kdf, pdf)
        else:
            kdf = ks.from_pandas(pdf)
            self.assert_eq(kdf, pdf) 
Author: databricks, Project: koalas, Lines: 14, Source: test_dataframe.py
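
Outside the Koalas test harness, the same UDT round trip can be sketched with plain PySpark (assuming an active SparkSession named spark; Arrow-based conversion falls back automatically for UDT columns):

import pandas as pd
from pyspark.ml.linalg import SparseVector
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
pdf = pd.DataFrame({"a": [SparseVector(2, {0: 0.1, 1: 1.1})], "b": [10]})
sdf = spark.createDataFrame(pdf)    # column "a" is inferred as VectorUDT
sdf.printSchema()
print(sdf.toPandas())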


Note: the pyspark.ml.linalg.SparseVector examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors, and any use or redistribution should follow the corresponding project's license. Do not reproduce this compilation without permission.