

Python linalg.VectorUDT Code Examples

This article collects typical usage examples of pyspark.ml.linalg.VectorUDT in Python. If you are wondering what linalg.VectorUDT does, how to use it, or what working examples look like, the curated code samples below should help. You can also explore further usage examples from the enclosing pyspark.ml.linalg module.


The following presents 15 code examples of linalg.VectorUDT, ordered by popularity by default.

Example 1: append_features

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def append_features(df, *cols):
    """Append features from columns to the features vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    cols : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    def add_features(feat, *other):
        raw = feat.toArray()
        return Vectors.dense(np.append(raw, list(map(float, other))))
    add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, add_features_udf('features', *cols), {'features': new_feat_list})) 
Source: wikimedia/search-MjoLniR, feature_engineering.py (21 lines)
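A minimal usage sketch (column names and data invented; assumes a running SparkSession and that the mjolnir package from search-MjoLniR is importable). The features column must carry a list of feature names in its metadata, which can be attached via Column.alias:

import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]), 0.5)], ['features', 'extra'])
# append_features reads the feature-name list from the column metadata.
df = df.withColumn('features', F.col('features').alias(
    'features', metadata={'features': ['f0', 'f1']}))
df = append_features(df, 'extra')  # features becomes [1.0, 2.0, 0.5]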

Example 2: zero_features

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)
    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn('features', mjolnir.spark.add_meta(
        df._sc, zero_features_udf('features'), {'features': features})) 
Source: wikimedia/search-MjoLniR, feature_engineering.py (25 lines)
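A matching sketch for zero_features under the same assumptions (metadata attached as in the previous sketch, mjolnir importable):

import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),)], ['features'])
df = df.withColumn('features', F.col('features').alias(
    'features', metadata={'features': ['f0', 'f1']}))
df = zero_features(df, 'f1')  # features becomes [1.0, 0.0]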

Example 3: transform

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def transform(self,X_rdd,y_rdd=None):
        '''
        Given an X RDD (and optionally a y RDD), output a dataframe with term-frequency feature vectors and labels.
        '''    
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        
        #get term frequencies
        X = X_rdd.map(self._term_frequency).cache()
        
        #convert to sparse (rewritten for Python 3, which removed tuple-unpacking lambdas)
        X = X.map(lambda hf: (hf[0], SparseVector(self.num_features, np.nonzero(hf[1])[0], hf[1][hf[1] > 0])))

        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1],r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1],r[0]))
            data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))  # (hash, features, label)
            schema = StructType([StructField('hash',StringType(),True),StructField('features',VectorUDT(),True),StructField('label',StringType(),True)])
            data = data.toDF(schema)
            data = data.withColumn('label',data.label.cast(DoubleType()))
        
        else:
            schema = StructType([StructField('hash',StringType(),True),StructField("features", VectorUDT(), True)])
            data = X.toDF(schema)
            
        return data 
Source: iamshang1/Projects, preprocessing_bytes.py (33 lines)
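The dense-to-sparse conversion above is the core trick; a standalone sketch with toy data:

import numpy as np
from pyspark.ml.linalg import SparseVector

features = np.array([0.0, 3.0, 0.0, 1.0])
# Keep only nonzero entries; np.nonzero returns indices in ascending order,
# as SparseVector requires.
sv = SparseVector(len(features), np.nonzero(features)[0], features[features > 0])
# sv == SparseVector(4, {1: 3.0, 3: 1.0})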

Example 4: transform

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def transform(self,X_rdd,y_rdd=None,train=True):
        '''
        Given an X RDD (and optionally a y RDD), output a dataframe with term-frequency feature vectors and labels.
        '''    
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        
        #word tokenization
        X = X_rdd.map(self._tokenize).cache()
        
        #create dictionary of words
        if train:
            self.dictionary = X.map(lambda row: row[1]).flatMap(lambda word: word).map(lambda word: (word,1)).reduceByKey(lambda acc, w: acc + w).filter(lambda x: x[1]>=self.min_df).collectAsMap()
            self.dictionary = dict(zip(self.dictionary, range(len(self.dictionary))))

        #create word vectors
        X = X.map(self._term_frequency)
        
        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1],r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1],r[0]))
            data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))  # (hash, features, label)
            schema = StructType([StructField('hash',StringType(),True),StructField('features',VectorUDT(),True),StructField('label',StringType(),True)])
            data = data.toDF(schema)
            data = data.withColumn('label',data.label.cast(DoubleType()))
        
        else:
            schema = StructType([StructField('hash',StringType(),True),StructField("features", VectorUDT(), True)])
            data = X.toDF(schema)
            
        return data 
Source: iamshang1/Projects, preprocessing_asm.py (38 lines)
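The zipWithIndex/join trick above pairs two RDDs row by row; a standalone sketch (assumes a SparkContext sc; result order is not guaranteed):

X = sc.parallelize(['a', 'b']).zipWithIndex().map(lambda r: (r[1], r[0]))
y = sc.parallelize([0.0, 1.0]).zipWithIndex().map(lambda r: (r[1], r[0]))
# join keys on the shared index, yielding (idx, (x, y)) pairs.
pairs = X.join(y).map(lambda r: (r[1][0], r[1][1]))
# pairs.collect() contains ('a', 0.0) and ('b', 1.0)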

Example 5: read_partition

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def read_partition(
    spark: SparkSession,
    table: str,
    partition_spec: Mapping[str, str],
    schema: Optional[T.StructType] = None,
    direct_parquet_read: bool = False
) -> DataFrame:
    """Read a single partition from a hive table.

    Verifies the partition specification describes a complete partition,
    that the partition exists, and optionally that the table is compatible
    with an expected schema. The partition could still be empty.
    """
    # We don't need to do anything with the result, our goal is to
    # trigger AnalysisException when the arguments are invalid.
    spark.sql(_describe_partition_ql(table, partition_spec)).collect()

    partition_cond = F.lit(True)
    for k, v in partition_spec.items():
        partition_cond &= F.col(k) == v
    df = spark.read.table(table).where(partition_cond)
    # The df we have now has types defined by the hive table, but this downgrades
    # non-standard types like VectorUDT() to their SQL equivalents. Use the first
    # df to find the files, then read them directly.
    if direct_parquet_read:
        input_files = list(df._jdf.inputFiles())  # type: ignore
        input_dirs = set(os.path.dirname(path) for path in input_files)
        if len(input_dirs) != 1:
            raise Exception('Expected single directory containing partition data: [{}]'.format(
                '],['.join(input_files)))
        df = spark.read.parquet(list(input_dirs)[0])
    if schema is not None:
        # TODO: This only allows extra top level columns, anything
        # nested must be exactly the same. Fine for now.
        _verify_schema_compatability(schema, df.schema)
        df = df.select(*(field.name for field in schema))
    # Drop partitioning columns. These are not part of the mjolnir transformations, and
    # are only an implementation detail of putting them on disk and tracking history.
    return df.drop(*partition_spec.keys()) 
Source: wikimedia/search-MjoLniR, transform.py (41 lines)
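A hypothetical invocation (table and partition names invented for illustration; the table must exist in the hive metastore):

df = read_partition(
    spark,
    table='discovery.query_clicks',
    partition_spec={'date': '20200101'},
    direct_parquet_read=True)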

Example 6: _transform

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _transform(self, dataset):
        inp = self.getOrDefault(self.inputCol)
        out = self.getOrDefault(self.predictionCol)
        mod_str = self.getOrDefault(self.modStr)
        use_vector_out = self.getOrDefault(self.useVectorOut)

        model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
        model_broadcast = dataset._sc.broadcast(model)

        def predict_vec(data):
            features = data.toArray().reshape((1, len(data)))
            x_data = torch.from_numpy(features).float()
            model = model_broadcast.value
            model.eval()
            return Vectors.dense(model(x_data).detach().numpy().flatten())

        def predict_float(data):
            features = data.toArray().reshape((1, len(data)))
            x_data = torch.from_numpy(features).float()
            model = model_broadcast.value
            model.eval()
            raw_prediction = model(x_data).detach().numpy().flatten()
            if len(raw_prediction) > 1:
                return float(np.argmax(raw_prediction))
            return float(raw_prediction[0])

        if use_vector_out:
            udfGenerateCode = F.udf(predict_vec, VectorUDT())
        else:
            udfGenerateCode = F.udf(predict_float, DoubleType())

        return dataset.withColumn(out, udfGenerateCode(inp)) 
Source: dmmiller612/sparktorch, torch_distributed.py (34 lines)
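The mod_str consumed above is produced by the inverse dill/base64 round trip; a sketch of the encoding side (assumes torch and dill are installed):

import codecs
import dill
import torch.nn as nn

model = nn.Linear(4, 2)
# Serialize with dill, then base64-encode so the bytes survive as a plain string parameter.
mod_str = codecs.encode(dill.dumps(model), 'base64').decode()
restored = dill.loads(codecs.decode(mod_str.encode(), 'base64'))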

Example 7: _convert_vector

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _convert_vector(df, dtype):
    from pyspark.ml.linalg import VectorUDT
    from pyspark.mllib.linalg import VectorUDT as OldVectorUDT

    for field in df.schema:
        col_name = field.name
        if isinstance(field.dataType, VectorUDT) or \
                isinstance(field.dataType, OldVectorUDT):
            df = df.withColumn(col_name,
                               vector_to_array(df[col_name], dtype))
    return df 
Source: uber/petastorm, spark_dataset_converter.py (13 lines)
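A usage sketch of the conversion itself (requires Spark >= 3.0, where pyspark.ml.functions.vector_to_array is available):

from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),)], ['v'])
df = df.withColumn('v', vector_to_array(df['v'], 'float32'))
df.printSchema()  # v: array<float>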

Example 8: _average_feature_vectors

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _average_feature_vectors(self, data, outputCol):
        '''Average the feature vectors

        Parameters
        ----------
        data : DataFrame
           input dataframe
        outputCol : str
           name of the output column
        '''

        session = SparkSession.builder.getOrCreate()

        def _averager(v1, v2, v3):
            f1 = v1.toArray()
            f2 = v2.toArray()
            f3 = v3.toArray()

            length = min(len(f1), len(f2), len(f3))
            average = []

            for i in range(length):
                average.append((f1[i] + f2[i] + f3[i])/3.0)

            return Vectors.dense(average)

        session.udf.register("averager", _averager, VectorUDT())

        data.createOrReplaceTempView("table")

        sql = f"SELECT *, averager(feature0, feature1, feature2) AS {self.outputCol} from table"

        data = session.sql(sql)

        return data 
Source: sbl-sdsc/mmtf-pyspark, proteinSequenceEncoder.py (37 lines)
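The same averaging can be expressed without the SQL detour by using F.udf directly; a minimal sketch with toy vectors:

import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([1.0]), Vectors.dense([3.0]), Vectors.dense([5.0]))],
    ['feature0', 'feature1', 'feature2'])
averager = F.udf(
    lambda a, b, c: Vectors.dense((a.toArray() + b.toArray() + c.toArray()) / 3.0),
    VectorUDT())
df.withColumn('avg', averager('feature0', 'feature1', 'feature2')).show()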

Example 9: toPandas

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def toPandas(self, df):
        """
        This is similar to the Spark DataFrame built-in toPandas() method, but it handles
        MLlib Vector columns differently.  It converts MLlib Vectors into rows of
        scipy.sparse.csr_matrix, which is generally friendlier for PyData tools like scikit-learn.

        .. note:: Experimental: This will likely be replaced in later releases with improved APIs.

        :param df: Spark DataFrame
        :return:  Pandas dataframe
        """
        cols = df.columns
        # Convert any MLlib Vector columns to scipy.sparse.csr_matrix
        matrixCols = []

        def toscipy(v):
            if isinstance(v, DenseVector):
                return csr_matrix((v.values, np.array(range(v.size)), np.array([0, v.size])),
                                  shape=(1, v.size))
            elif isinstance(v, SparseVector):
                return csr_matrix((v.values, v.indices, np.array([0, len(v.indices)])),
                                  shape=(1, v.size))
            else:
                raise TypeError("Converter.toPandas found unknown Vector type: %s" % type(v))
        tosparse = udf(lambda v: toscipy(v), CSRVectorUDT())
        for i in range(len(cols)):
            c = cols[i]
            if isinstance(df.schema.fields[i].dataType, VectorUDT):
                cols[i] = tosparse(df[c]).alias(c)
                matrixCols.append(c)
            else:
                cols[i] = df[c]
        return df.select(*cols).toPandas() 
Source: databricks/spark-sklearn, converter.py (35 lines)
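The SparseVector-to-csr_matrix conversion at the heart of this method, runnable standalone:

import numpy as np
from scipy.sparse import csr_matrix
from pyspark.ml.linalg import SparseVector

v = SparseVector(4, [1, 3], [5.0, 7.0])
# One-row CSR matrix built from data, column indices, and the row pointer [0, nnz].
m = csr_matrix((v.values, v.indices, np.array([0, len(v.indices)])), shape=(1, v.size))
print(m.toarray())  # [[0. 5. 0. 7.]]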

Example 10: dataframe_to_nparray

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def dataframe_to_nparray(df):
    import numpy
    from pyspark.ml.linalg import VectorUDT
    schema = df.schema
    npcols = []
    for i in range(0, len(df.columns)):
        if isinstance(schema.fields[i].dataType, VectorUDT):
            # to_numpy() replaces the long-removed pandas as_matrix()
            npcols.append(df.select(df.columns[i]).toPandas().apply(
                lambda x: numpy.array(x[0].toArray())).to_numpy().reshape(-1, 1))
        else:
            npcols.append(df.select(df.columns[i]).collect())
    return numpy.array(npcols) 
Source: onnx/onnxmltools, sparkml_test_utils.py (13 lines)
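A simpler way to pull a single VectorUDT column into a numpy matrix (hypothetical data; collects everything to the driver):

import numpy
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]),), (Vectors.dense([3.0, 4.0]),)], ['v'])
mat = numpy.array([row.v.toArray() for row in df.collect()])
print(mat.shape)  # (2, 2)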

Example 11: _getNumpyFeaturesAndLabels

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def _getNumpyFeaturesAndLabels(self, dataset):
        """
        We assume the training data fits in memory on a single server.
        The input dataframe is converted to numerical image features and
        broadcast to all the worker nodes.
        """
        image_uri_col = self.getInputCol()
        label_col = None
        if self.isDefined(self.labelCol) and self.getLabelCol() != "":
            label_col = self.getLabelCol()
        tmp_image_col = self._loadedImageCol()
        image_df = self.loadImagesInternal(dataset, image_uri_col).dropna(subset=[tmp_image_col])

        # Extract features
        localFeatures = []
        rows = image_df.collect()
        for row in rows:
            spimg = row[tmp_image_col]
            features = imageStructToArray(spimg)
            localFeatures.append(features)

        if not localFeatures:  # NOTE(phi-dbq): pep-8 recommended against testing 0 == len(array)
            raise ValueError("Cannot extract any feature from dataset!")
        X = np.stack(localFeatures, axis=0)

        # Extract labels
        y = None
        if label_col is not None:
            label_schema = image_df.schema[label_col]
            label_dtype = label_schema.dataType
            assert isinstance(label_dtype, spla.VectorUDT), \
                "must encode labels in one-hot vector format, but got {}".format(label_dtype)

            localLabels = []
            for row in rows:
                try:
                    _keras_label = row[label_col].toArray()
                except ValueError:
                    raise ValueError("Cannot extract encoded label array")
                localLabels.append(_keras_label)

            if not localLabels:
                raise ValueError("Failed to load any labels from dataset, but labels are required")

            y = np.stack(localLabels, axis=0)
            assert y.shape[0] == X.shape[0], \
                "number of features {} != number of labels {}".format(X.shape[0], y.shape[0])

        return X, y 
Source: databricks/spark-deep-learning, keras_image_file_estimator.py (51 lines)
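The label assertion above expects one-hot MLlib vectors; a sketch of building such a label (hypothetical 10-class setup):

import numpy as np
from pyspark.ml.linalg import Vectors

num_classes = 10
label = Vectors.dense(np.eye(num_classes)[3])  # one-hot vector for class 3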

Example 12: collect_from_ltr_plugin_and_kafka

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def collect_from_ltr_plugin_and_kafka(df, brokers, model, feature_names_accu, indices=None):
    """Collect feature vectors from elasticsearch via kafka

    Pushes queries into a kafka topic and retrieves results from a second kafka topic.
    A daemon must be running on relforge to collect the queries and produce results.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Source dataframe containing wikiid, query and hit_page_id fields
        to collect feature vectors for.
    brokers : list of str
        List of kafka brokers used to bootstrap access into the kafka cluster.
    model : string
        definition of the model/featureset: "featureset:name", "model:name" or "featureset:name@storeName"
    feature_names_accu : Accumulator
        used to collect feature names
    indices : dict, optional
        map from wikiid to elasticsearch index to query. If wikiid is
        not present the wikiid will be used as index name. (Default: None)
    """
    mjolnir.spark.assert_columns(df, ['wikiid', 'query', 'hit_page_id'])
    if indices is None:
        indices = {}
    eltType, name, store = mjolnir.utils.explode_ltr_model_definition(model)
    log_query = LtrLoggingQuery(eltType, name, store)

    def kafka_handle_response(record):
        assert record['status_code'] == 200
        parsed = json.loads(record['text'])
        response = parsed['responses'][0]
        meta = record['meta']

        for hit_page_id, features in extract_ltr_log_feature_values(response, feature_names_accu):
            yield [meta['wikiid'], meta['query'], hit_page_id, features]

    rdd = mjolnir.kafka.client.msearch(
        df.groupBy('wikiid', 'query').agg(F.collect_set('hit_page_id').alias('hit_page_ids')),
        client_config=brokers,
        meta_keys=['wikiid', 'query'],
        create_es_query=lambda row: log_query.make_msearch(row, indices),
        handle_response=kafka_handle_response)

    return df.sql_ctx.createDataFrame(rdd, T.StructType([
        df.schema['wikiid'], df.schema['query'], df.schema['hit_page_id'],
        T.StructField('features', VectorUDT(), nullable=False)
        # We could have gotten duplicate data from kafka. Clean them up.
    ])).drop_duplicates(['wikiid', 'query', 'hit_page_id']) 
Source: wikimedia/search-MjoLniR, features.py (50 lines)
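Reusing fields from an existing schema, as done when building the result StructType above, works standalone:

import pyspark.sql.types as T
from pyspark.ml.linalg import VectorUDT

base = T.StructType([T.StructField('wikiid', T.StringType()),
                     T.StructField('query', T.StringType())])
# StructType supports lookup by field name, so existing fields can be copied verbatim.
schema = T.StructType([base['wikiid'], base['query'],
                       T.StructField('features', VectorUDT(), nullable=False)])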

Example 13: one_hot_encode

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def one_hot_encode(self, data = None, inputCol = None, outputCol = None):
        '''
        One-hot encodes a protein sequence. The one-hot encoding
        encodes the 20 natural amino acids, plus X for any other
        residue for a total of 21 elements per residue.

        Parameters
        ----------
        data : DataFrame
           input data to be encoded [None]
        inputCol : str
           name of the input column [None]
        outputCol : str
           name of the output column [None]
        '''

        # Setting class variables
        if data is not None:
            self.data = data

        if inputCol is not None:
            self.inputCol = inputCol

        if outputCol is not None:
            self.outputCol = outputCol

        if self.data is None:
            raise ValueError("Class variable data is not defined, please pass\
                             in a dataframe into the data parameter")

        session = SparkSession.builder.getOrCreate()
        AMINO_ACIDS21 = self.AMINO_ACIDS21

        # Encoder function to be passed as User Defined Function (UDF)
        def _encoder(s):

            values = [0] * len(AMINO_ACIDS21) * len(s)

            for i in range(len(s)):

                if s[i] in AMINO_ACIDS21:
                    index = AMINO_ACIDS21.index(s[i])

                else:
                    index = AMINO_ACIDS21.index('X')

                values[i*len(AMINO_ACIDS21) + index] = 1

            return Vectors.dense(values)

        session.udf.register("encoder", _encoder, VectorUDT())

        self.data.createOrReplaceTempView("table")
        sql = f"SELECT *, encoder({self.inputCol}) AS {self.outputCol} from table"

        data = session.sql(sql)

        return data 
Source: sbl-sdsc/mmtf-pyspark, proteinSequenceEncoder.py (60 lines)
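The register-then-SQL UDF pattern used by this encoder (and the two below), reduced to a self-contained sketch with a dummy encoder function:

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Dummy encoder: the sequence length as a one-element vector.
spark.udf.register('encoder', lambda s: Vectors.dense([float(len(s))]), VectorUDT())
spark.createDataFrame([('ACDE',)], ['seq']).createOrReplaceTempView('table')
spark.sql("SELECT *, encoder(seq) AS features FROM table").show()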

Example 14: property_encode

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def property_encode(self, data = None, inputCol = None, outputCol = None):
        '''Encodes a protein sequence using 7 physicochemical properties

        References
        ----------
        Meiler, J., Müller, M., Zeidler, A. et al. J Mol Model (2001)
        https://link.springer.com/article/10.1007/s008940100038

        Parameters
        ----------
        data : DataFrame
           input data to be encoded [None]
        inputCol : str
           name of the input column [None]
        outputCol : str
           name of the output column [None]

        Returns
        -------
        dataset
           dataset with feature vector appended
        '''

        # Setting class variables
        if data is not None:
            self.data = data

        if inputCol is not None:
            self.inputCol = inputCol

        if outputCol is not None:
            self.outputCol = outputCol

        if self.data is None:
            raise ValueError("Class variable data is not defined, please pass\
                             in a dataframe into the data parameter")

        session = SparkSession.builder.getOrCreate()
        properties = self.properties

        #Encoder function to be passed as User Defined Function (UDF)
        def _encoder(s):
            values = []

            for i in range(len(s)):

                if s[i] in properties:
                    values += properties[s[i]]

            return Vectors.dense(values)

        session.udf.register("encoder", _encoder, VectorUDT())

        self.data.createOrReplaceTempView("table")
        sql = f"SELECT *, encoder({self.inputCol}) AS {self.outputCol} from table"

        data = session.sql(sql)

        return data 
Source: sbl-sdsc/mmtf-pyspark, proteinSequenceEncoder.py (61 lines)
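A hypothetical end-to-end usage, assuming mmtfPyspark is installed and exposes this class as ProteinSequenceEncoder (the import path below is an assumption; check the project):

from mmtfPyspark.ml import ProteinSequenceEncoder  # assumed import path

encoder = ProteinSequenceEncoder(data)  # data: DataFrame with a 'sequence' column
encoded = encoder.property_encode(inputCol='sequence', outputCol='features')
encoded.select('features').show(truncate=False)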

Example 15: blosum62_encode

# Required import: from pyspark.ml import linalg [as alias]
# Or: from pyspark.ml.linalg import VectorUDT [as alias]
def blosum62_encode(self, data = None, inputCol = None, outputCol = None):
        '''Encodes a protein sequence using the BLOSUM62 substitution matrix

        References
        ----------
        Blosum Matrix
        https://ftp.ncbi.nih.gov/repository/blocks/unix/blosum/BLOSUM/blosum62.blast.new

        Parameters
        ----------
        data : DataFrame
           input data to be encoded [None]
        inputCol : str
           name of the input column [None]
        outputCol : str
           name of the output column [None]

        Returns
        -------
        dataset
           dataset with feature vector appended
        '''

        if data is not None:
            self.data = data

        if inputCol is not None:
            self.inputCol = inputCol

        if outputCol is not None:
            self.outputCol = outputCol

        if self.data is None:
            raise ValueError("Class variable data is not defined, please pass\
                             in a dataframe into the data parameter")

        session = SparkSession.builder.getOrCreate()
        blosum62 = self.blosum62

        #Encoder function to be passed as User Defined Function (UDF)
        def _encoder(s):
            values = []

            for i in range(len(s)):

                if s[i] in blosum62:
                    values += blosum62[s[i]]

            return Vectors.dense(values)

        session.udf.register("encoder", _encoder, VectorUDT())

        self.data.createOrReplaceTempView("table")
        sql = f"SELECT *, encoder({self.inputCol}) AS {self.outputCol} from table"

        data = session.sql(sql)

        return data 
Source: sbl-sdsc/mmtf-pyspark, proteinSequenceEncoder.py (60 lines)


Note: The pyspark.ml.linalg.VectorUDT examples in this article were collected from open-source projects hosted on platforms such as GitHub. Copyright in each snippet remains with its original authors; consult the corresponding project's license before reusing or redistributing the code.