

Python functions.col Method Code Examples

This article compiles typical usage examples of the pyspark.sql.functions.col method in Python. If you are wondering what functions.col does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples for the containing module, pyspark.sql.functions.


The following presents 15 code examples of the functions.col method, sorted by popularity by default.
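Before diving into the examples, here is a minimal sketch of how pyspark.sql.functions.col is typically used on its own; the DataFrame and the column names ("count", "price", "amt") are purely illustrative:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(3, 10.0), (5, 2.0)], ["count", "price"])

# F.col builds a Column expression by name; Column expressions compose with
# arithmetic, alias() and where() like any other Column.
df.select((F.col("price") * F.col("count")).alias("amt")) \
    .where(F.col("amt") > 10) \
    .show()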

Example 1: smvPlusYears

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvPlusYears(self, delta):
        """Add N years to `Timestamp` or `Date` column

            Args:
                delta (int or Column): the number of years to add

            Example:
                >>> df.select(col("dob").smvPlusYears(3))

            Returns:
                (Column): TimestampType. The incremented Timestamp, or null if input is null.
                    **Note** even if the input is DateType, the output is TimestampType
        """
        if (isinstance(delta, int)):
            jdelta = delta
        elif (isinstance(delta, Column)):
            jdelta = delta._jc
        else:
            raise RuntimeError("delta parameter must be either an int or a Column")
        jc = self._jColumnHelper.smvPlusYears(jdelta)
        return Column(jc) 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 23, Source file: helpers.py

Example 2: test_readImages

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_readImages(self):
        # Test that reading images with a custom decode function works
        imageDF = imageIO._readImagesWithCustomFn(
            "file/path", decode_f=imageIO.PIL_decode, numPartition=2, sc=self.binaryFilesMock)
        self.assertTrue("image" in imageDF.schema.names)

        # The DF should have 2 images and 1 null.
        self.assertEqual(imageDF.count(), 3)
        validImages = imageDF.filter(col("image").isNotNull())
        self.assertEqual(validImages.count(), 2)

        img = validImages.first().image
        self.assertEqual(img.height, array.shape[0])
        self.assertEqual(img.width, array.shape[1])
        self.assertEqual(imageIO.imageTypeByOrdinal(img.mode).nChannels, array.shape[2])
        # array comes out of PIL and is in RGB order
        self.assertEqual(img.data, array.tobytes()) 
Developer ID: databricks, Project: spark-deep-learning, Lines of code: 19, Source file: test_imageIO.py

Example 3: for_each_item

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def for_each_item(
    col_name: str,
    items: List[_LT],
    transformer_factory: Callable[[_LT], Transformer],
    mapper=map
) -> Transformer:
    """Run a transformation for each value in a list of values"""
    # A lambda inside the list comprehension would capture `item`
    # by name, use a proper function to ensure item is captured
    # from a unique context.
    def restrict_to_item(item: _LT) -> Transformer:
        return lambda df: df.where(F.col(col_name) == item)

    transformers = [seq_transform([
        restrict_to_item(item),
        transformer_factory(item)
    ]) for item in items]

    return par_transform(transformers, mapper)


# Shared transformations 
Developer ID: wikimedia, Project: search-MjoLniR, Lines of code: 24, Source file: transform.py
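A hedged usage sketch for for_each_item: the seq_transform and par_transform helpers come from the same transform module excerpted above, while the wiki identifiers and the no-op transformer factory below are hypothetical.

# Hypothetical usage: build a pipeline that restricts the frame to each
# wikiid value in turn and applies a per-wiki transformer to that slice.
def make_per_wiki_transformer(wiki):
    def transform(df):
        # real per-wiki work would go here; identity for illustration
        return df
    return transform

pipeline = for_each_item('wikiid', ['enwiki', 'frwiki'], make_per_wiki_transformer)
result_df = pipeline(df)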

Example 4: cache_to_disk

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def cache_to_disk(temp_dir: str, partition_by: str) -> Transformer:
    """Write a dataframe to disk partitioned by a column.

    Writes out the source dataframe partitioned by the provided
    column. The intention is for downstream tasks to construct
    a dataframe per partitioned value. When doing so this allows
    the downstream data frames to read individual columns for specific
    wikis from disk directly.

    Cleaning up the temp_dir is the caller's responsibility and must
    be done after all transformations have executed to completion,
    likely after closing the SparkContext.

    TODO: This emits the same number of partitions for each partition col,
    while some may need 1 partition and others 1000. We would need count
    estimates to do that partitioning though.
    """
    def transform(df: DataFrame) -> DataFrame:
        df.write.partitionBy(partition_by).parquet(temp_dir)
        return df.sql_ctx.read.parquet(temp_dir)
    return transform 
Developer ID: wikimedia, Project: search-MjoLniR, Lines of code: 23, Source file: transform.py
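A short sketch of how cache_to_disk might be invoked, assuming the caller owns a temporary directory (as the docstring requires) and an illustrative 'wikiid' partition column:

import tempfile

# The caller owns temp_dir and must remove it only after all downstream
# work (and ideally the SparkContext) has finished.
temp_dir = tempfile.mkdtemp()
df_cached = cache_to_disk(temp_dir, partition_by='wikiid')(df)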

Example 5: group_k_fold

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def group_k_fold(df, num_folds, output_column='fold'):
    """
    Generates group k-fold splits. The fold a row belongs to is
    assigned to the column identified by the output_column parameter.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    num_folds : int
    output_column : str, optional

    Returns
    -------
    pyspark.sql.DataFrame
        Input data frame with a 'fold' column indicating fold membership.
        Normalized queries are equally distributed to each fold.
    """
    return (
        split(df, [1. / num_folds] * num_folds, output_column)
        .withColumn(output_column, mjolnir.spark.add_meta(df._sc, F.col(output_column), {
            'num_folds': num_folds,
        }))) 
Developer ID: wikimedia, Project: search-MjoLniR, Lines of code: 24, Source file: tuning.py
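A brief usage sketch, assuming an input DataFrame df of the shape used elsewhere in mjolnir; the fold arithmetic below is illustrative:

# Assign every row to one of 5 folds, then carve out a train/test pair
# for fold 0 using the generated 'fold' column.
df_folded = group_k_fold(df, num_folds=5)
train_0 = df_folded.where(F.col('fold') != 0)
test_0 = df_folded.where(F.col('fold') == 0)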

Example 6: test_split

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_split(spark):
    df = (
        spark
        .range(1, 100 * 100)
        # convert into 100 "queries" with 100 values each. We need a
        # sufficiently large number of queries, or the split won't have
        # enough data for partitions to even out.
        .select(F.lit('foowiki').alias('wikiid'),
                (F.col('id')/100).cast('int').alias('norm_query_id')))

    with_folds = mjolnir.training.tuning.split(df, (0.8, 0.2)).collect()

    fold_0 = [row for row in with_folds if row.fold == 0]
    fold_1 = [row for row in with_folds if row.fold == 1]

    # Check the folds are pretty close to requested
    total_len = float(len(with_folds))
    assert 0.8 == pytest.approx(len(fold_0) / total_len, abs=0.015)
    assert 0.2 == pytest.approx(len(fold_1) / total_len, abs=0.015)

    # Check each norm query is only found on one side of the split
    queries_in_0 = set([row.norm_query_id for row in fold_0])
    queries_in_1 = set([row.norm_query_id for row in fold_1])
    assert len(queries_in_0.intersection(queries_in_1)) == 0 
Developer ID: wikimedia, Project: search-MjoLniR, Lines of code: 26, Source file: test_tuning.py

Example 7: select_features

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform 
Developer ID: wikimedia, Project: search-MjoLniR, Lines of code: 22, Source file: feature_selection.py
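A hedged sketch of the metadata dictionary this transformer reads and writes; the keys are inferred from the excerpt above, and the feature names and the exact shape of the 'features' entry are assumptions:

# Structure inferred from the transform body: 'input_feature_meta' carries
# the candidate features, 'default_cols' are always kept, and the selected
# subset is recorded under metadata['wiki_features'][wiki].
metadata = {
    'input_feature_meta': {'features': ['popularity', 'title_match', 'incoming_links']},
    'wiki_features': {},
    'default_cols': ['wikiid', 'label'],
}
df_out = select_features('enwiki', num_features=2, metadata=metadata)(df)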

Example 8: wrap_function_cols

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def wrap_function_cols(self, name, package_name=None, object_name=None, java_class_instance=None, doc=""):
        """Utility method for wrapping a scala/java function that returns a spark sql Column.

        This assumes that the function that you are wrapping takes a list of spark sql Column objects as its arguments.
        """
        def _(*cols):
            jcontainer = self.get_java_container(package_name=package_name, object_name=object_name, java_class_instance=java_class_instance)
            # Ensure that your argument is a column
            col_args = [col._jc if isinstance(col, Column) else _make_col(col)._jc for col in cols]
            function = getattr(jcontainer, name)
            args = col_args
            jc = function(*args)
            return Column(jc)
        _.__name__ = name
        _.__doc__ = doc
        return _ 
Developer ID: Valassis-Digital-Media, Project: spylon, Lines of code: 18, Source file: utils.py
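A hypothetical call pattern for wrap_function_cols; the helper instance, the Scala package/object, and the wrapped function name below are all made up for illustration:

# Wrap a (hypothetical) Scala function com.example.udfs.normalize so it can
# be called from Python with Column objects or plain column-name strings.
normalize = helpers.wrap_function_cols(
    "normalize",
    package_name="com.example",
    object_name="udfs",
    doc="Normalize a numeric column.")

df.select(normalize(F.col("score")).alias("score_norm"))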

Example 9: smvTopNRecs

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvTopNRecs(self, maxElems, *cols):
        """For each group, return the top N records according to a given ordering

            Example:

                >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

                This will keep the 3 largest amt records for each id

            Args:
                maxElems (int): maximum number of records per group
                cols (\*Column): column expressions defining the ordering

            Returns:
                (DataFrame): result of taking top records from groups

        """
        return DataFrame(self.sgd.smvTopNRecs(maxElems, smv_copy_array(self.df._sc, *cols)), self.df.sql_ctx) 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 20, Source file: helpers.py

Example 10: topNValsByFreq

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def topNValsByFreq(self, n, col):
        """Get top N most frequent values in Column col

            Args:
                n (int): maximum number of values
                col (Column): which column to get values from

            Example:

                >>> df.topNValsByFreq(1, col("cid"))

                will return the single most frequent value in the cid column

            Returns:
                (list(object)): most frequent values (type depends on schema)
        """
        topNdf = DataFrame(self._jDfHelper._topNValsByFreq(n, col._jc), self._sql_ctx)
        return [list(r)[0] for r in topNdf.collect()] 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 20, Source file: helpers.py

Example 11: smvSelectPlus

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvSelectPlus(self, *cols):
        """Selects all the current columns in current DataFrame plus the supplied expressions

            The new columns are added to the end of the current column list.

            Args:
                cols (\*Column): expressions to add to the DataFrame

            Example:
                >>> df.smvSelectPlus((col("price") * col("count")).alias("amt"))

            Returns:
                (DataFrame): the resulting DataFrame with the additional columns appended
        """
        jdf = self._jDfHelper.smvSelectPlus(_to_seq(cols, _jcol))
        return DataFrame(jdf, self._sql_ctx) 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 18, Source file: helpers.py

Example 12: smvDupeCheck

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvDupeCheck(self, keys, n=10000):
        """For a given list of potential keys, check for duplicated records with the number of duplications and all the columns.

            Null values are allowed in the potential keys, so duplication on Null valued keys will also be reported.

            Args:
                keys (list(string)): the list of key columns to which the duplicate check is applied
                n (integer): number of rows from input data for checking duplications, defaults to 10000

            Returns:
                (DataFrame): the key columns, a "_N" column, and the remaining columns, for records whose key values occur more than once,
                    where "_N" holds the number of records sharing that record's key values
        """
        dfTopN = self.df.limit(n).cache()

        res = dfTopN.groupBy(*keys)\
            .agg(F.count(F.lit(1)).alias('_N'))\
            .where(F.col('_N') > 1)\
            .smvJoinByKey(dfTopN, keys, 'inner', True)\
            .orderBy(*keys)

        dfTopN.unpersist()
        return res 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 25, Source file: helpers.py
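A brief usage sketch; the key column names are illustrative:

# Report records whose ("id", "date") combination occurs more than once
# among the first 10000 rows, together with a '_N' duplication count.
dupes = df.smvDupeCheck(["id", "date"])
dupes.show()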

Example 13: smvPlusWeeks

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvPlusWeeks(self, delta):
        """Add N weeks to `Timestamp` or `Date` column

            Args:
                delta (int or Column): the number of weeks to add

            Example:
                >>> df.select(col("dob").smvPlusWeeks(3))

            Returns:
                (Column): TimestampType. The incremented Timestamp, or null if input is null.
                    **Note** even if the input is DateType, the output is TimestampType
        """
        if (isinstance(delta, int)):
            jdelta = delta
        elif (isinstance(delta, Column)):
            jdelta = delta._jc
        else:
            raise RuntimeError("delta parameter must be either an int or a Column")
        jc = self._jColumnHelper.smvPlusWeeks(jdelta)
        return Column(jc) 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 23, Source file: helpers.py

Example 14: smvTimestampToStr

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvTimestampToStr(self, timezone, fmt):
        """Build a string from a timestamp and timezone

            Args:
                timezone (string or Column): the timezone, following the rules in
                    https://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html#forID-java.lang.String-
                    It can be a string like "America/Los_Angeles" or "+1000". If it is null, the current system time zone is used.
                fmt (string): the format is the same as the Java `Date` format

            Example:
                >>> df.select(col("ts").smvTimestampToStr("America/Los_Angeles","yyyy-MM-dd HH:mm:ss"))

            Returns:
                (Column): StringType. The converted String with given format
        """
        if is_string(timezone):
            jtimezone = timezone
        elif isinstance(timezone, Column):
            jtimezone = timezone._jc
        else:
            raise RuntimeError("timezone parameter must be either an string or a Column")
        jc = self._jColumnHelper.smvTimestampToStr(jtimezone, fmt)
        return Column(jc) 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 25, Source file: helpers.py

Example 15: test_smvDedupByKey_with_column

# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_smvDedupByKey_with_column(self):
        schema = "a:Integer; b:Double; c:String"
        df = self.createDF(
            schema,
            """1,2.0,hello;
            1,3.0,hello;
            2,10.0,hello2;
            2,11.0,hello3"""
        )
        r1 = df.smvDedupByKey(col("a"))
        expect = self.createDF(
            schema,
            """1,2.0,hello;
            2,10.0,hello2"""
        )
        self.should_be_same(expect, r1) 
Developer ID: TresAmigosSD, Project: SMV, Lines of code: 18, Source file: testDataFrameHelper.py


Note: The pyspark.sql.functions.col examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and copyright in the source code remains with the original authors. Please consult the corresponding project's license before distributing or using the code; do not reproduce this article without permission.