This article collects typical usage examples of the Python method pyspark.sql.functions.col. If you have been wondering how exactly functions.col is used, or what real-world examples of functions.col look like, the curated code examples below may help. You can also explore other usages of the module it belongs to, pyspark.sql.functions.
The following presents 15 code examples of functions.col, sorted by popularity by default.
Example 1: smvPlusYears
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvPlusYears(self, delta):
    """Add N years to a `Timestamp` or `Date` column

    Args:
        delta (int or Column): the number of years to add

    Example:
        >>> df.select(col("dob").smvPlusYears(3))

    Returns:
        (Column): TimestampType. The incremented Timestamp, or null if the input is null.

    **Note**: even if the input is DateType, the output is TimestampType.
    """
    if isinstance(delta, int):
        jdelta = delta
    elif isinstance(delta, Column):
        jdelta = delta._jc
    else:
        raise RuntimeError("delta parameter must be either an int or a Column")
    jc = self._jColumnHelper.smvPlusYears(jdelta)
    return Column(jc)
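
For comparison, here is a minimal plain-PySpark sketch that adds years to a date column without the SMV helper; the DataFrame and its dob column below are made up for illustration.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2020-01-15",)], ["dob"]).select(F.to_date("dob").alias("dob"))
# add_months shifts a date by whole months, so 12 * N adds N years
df.select(F.add_months(F.col("dob"), 12 * 3).alias("dob_plus_3y")).show()

Unlike smvPlusYears, add_months returns a DateType rather than a TimestampType.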
Example 2: test_readImages
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_readImages(self):
    # Test that reading images with a custom decode function produces the expected DataFrame.
    imageDF = imageIO._readImagesWithCustomFn(
        "file/path", decode_f=imageIO.PIL_decode, numPartition=2, sc=self.binaryFilesMock)
    self.assertTrue("image" in imageDF.schema.names)
    # The DF should have 2 images and 1 null.
    self.assertEqual(imageDF.count(), 3)
    validImages = imageDF.filter(col("image").isNotNull())
    self.assertEqual(validImages.count(), 2)
    img = validImages.first().image
    # `array` is a reference numpy array defined elsewhere in the original test module
    self.assertEqual(img.height, array.shape[0])
    self.assertEqual(img.width, array.shape[1])
    self.assertEqual(imageIO.imageTypeByOrdinal(img.mode).nChannels, array.shape[2])
    # array comes out of PIL and is in RGB order
    self.assertEqual(img.data, array.tobytes())
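
The null filter above is plain PySpark; a minimal self-contained sketch of the same col("...").isNotNull() pattern (data and column names invented for illustration):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, None), (3, "c")], ["id", "image"])
# Keep only rows whose image column is non-null
valid = df.filter(F.col("image").isNotNull())
assert valid.count() == 2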
Example 3: for_each_item
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def for_each_item(
    col_name: str,
    items: List[_LT],
    transformer_factory: Callable[[_LT], Transformer],
    mapper=map
) -> Transformer:
    """Run a transformation for each value in a list of values"""
    # A lambda inside the list comprehension would capture `item`
    # by name; use a proper function to ensure item is captured
    # from a unique context.
    def restrict_to_item(item: _LT) -> Transformer:
        return lambda df: df.where(F.col(col_name) == item)

    transformers = [seq_transform([
        restrict_to_item(item),
        transformer_factory(item)
    ]) for item in items]
    return par_transform(transformers, mapper)
# Shared transformations
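
A hedged usage sketch of for_each_item, assuming Transformer is simply a callable from DataFrame to DataFrame and that seq_transform/par_transform compose such callables; the wiki column and factory below are hypothetical:

from pyspark.sql import functions as F

def make_per_wiki_transform(wiki):
    # A per-item transformation; here it just tags rows with their wiki
    return lambda df: df.withColumn("source_wiki", F.lit(wiki))

transform = for_each_item("wikiid", ["enwiki", "frwiki"], make_per_wiki_transform)
# result = transform(df)  # each transformer first filters F.col("wikiid") == item, then applies the factory's transform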
Example 4: cache_to_disk
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def cache_to_disk(temp_dir: str, partition_by: str) -> Transformer:
    """Write a dataframe to disk partitioned by a column.

    Writes out the source dataframe partitioned by the provided
    column. The intention is for downstream tasks to construct
    a dataframe per partitioned value; doing so allows the
    downstream data frames to read individual columns for specific
    wikis directly from disk.

    Cleaning up temp_dir is the caller's responsibility and must
    be done after all transformations have executed to completion,
    likely after closing the SparkContext.

    TODO: This emits the same number of partitions for each value of the
    partition column, while some may need 1 partition and others 1000.
    We would need count estimates to do that partitioning, though.
    """
    def transform(df: DataFrame) -> DataFrame:
        df.write.partitionBy(partition_by).parquet(temp_dir)
        return df.sql_ctx.read.parquet(temp_dir)
    return transform
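
A minimal usage sketch, assuming a writable scratch directory; as the docstring notes, cleanup of that directory stays with the caller (the partition column and input DataFrame are hypothetical):

import shutil
import tempfile

temp_dir = tempfile.mkdtemp()
cached = cache_to_disk(temp_dir, partition_by="wikiid")
# df_cached = cached(df)   # downstream reads now come from the partitioned parquet files
# ... run all transformations to completion ...
# shutil.rmtree(temp_dir)  # only after everything has finished executing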
Example 5: group_k_fold
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def group_k_fold(df, num_folds, output_column='fold'):
    """
    Generate group k-fold splits. The fold a row belongs to is
    written to the column named by the output_column parameter.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    num_folds : int
    output_column : str, optional

    Returns
    -------
    pyspark.sql.DataFrame
        Input data frame with a 'fold' column indicating fold membership.
        Normalized queries are distributed equally across the folds.
    """
    return (
        split(df, [1. / num_folds] * num_folds, output_column)
        .withColumn(output_column, mjolnir.spark.add_meta(df._sc, F.col(output_column), {
            'num_folds': num_folds,
        })))
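
A hedged usage sketch; group_k_fold relies on the project-internal split and mjolnir.spark.add_meta helpers, so only the call pattern is shown (df is a hypothetical input DataFrame):

from pyspark.sql import functions as F

df_with_folds = group_k_fold(df, num_folds=5)
# Rows sharing a normalized query land in the same fold, so filtering on the
# fold column yields a query-disjoint holdout set.
holdout = df_with_folds.where(F.col("fold") == 0)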
Example 6: test_split
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_split(spark):
    df = (
        spark
        .range(1, 100 * 100)
        # convert into 100 "queries" with 100 values each. We need a
        # sufficiently large number of queries, or the split won't have
        # enough data for the partitions to even out.
        .select(F.lit('foowiki').alias('wikiid'),
                (F.col('id') / 100).cast('int').alias('norm_query_id')))
    with_folds = mjolnir.training.tuning.split(df, (0.8, 0.2)).collect()

    fold_0 = [row for row in with_folds if row.fold == 0]
    fold_1 = [row for row in with_folds if row.fold == 1]

    # Check the folds are pretty close to the requested proportions
    total_len = float(len(with_folds))
    assert 0.8 == pytest.approx(len(fold_0) / total_len, abs=0.015)
    assert 0.2 == pytest.approx(len(fold_1) / total_len, abs=0.015)

    # Check each norm query is found on only one side of the split
    queries_in_0 = set([row.norm_query_id for row in fold_0])
    queries_in_1 = set([row.norm_query_id for row in fold_1])
    assert len(queries_in_0.intersection(queries_in_1)) == 0
Example 7: select_features
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def select_features(
    wiki: str,
    num_features: int,
    metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` col with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform
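
The final VectorAssembler step is standard pyspark.ml; a minimal self-contained sketch of just that part (column names invented for illustration):

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 0.5, 2.0), (2, 1.5, 3.0)], ["id", "f1", "f2"])
assembler = VectorAssembler(inputCols=["f1", "f2"], outputCol="features")
# Pack the selected numeric columns into one vector column, then drop the originals
out = assembler.transform(df).drop("f1", "f2")
out.show()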
Example 8: wrap_function_cols
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def wrap_function_cols(self, name, package_name=None, object_name=None, java_class_instance=None, doc=""):
    """Utility method for wrapping a Scala/Java function that returns a Spark SQL Column.

    This assumes that the function being wrapped takes a list of Spark SQL Column objects as its arguments.
    """
    def _(*cols):
        jcontainer = self.get_java_container(package_name=package_name, object_name=object_name, java_class_instance=java_class_instance)
        # Ensure that every argument is a Column before extracting the underlying Java column
        col_args = [col._jc if isinstance(col, Column) else _make_col(col)._jc for col in cols]
        function = getattr(jcontainer, name)
        args = col_args
        jc = function(*args)
        return Column(jc)
    _.__name__ = name
    _.__doc__ = doc
    return _
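
A hedged sketch of how the generated wrapper might be used; the helper object, package, and Scala function name below are all hypothetical:

from pyspark.sql import functions as F

# `helper` stands for an instance of the class that defines wrap_function_cols
my_func = helper.wrap_function_cols("myColumnFunc", package_name="com.example.sql", object_name="MyFunctions")
# The wrapper accepts Column arguments (or values coercible via _make_col) and returns a Column
# df.select(my_func(F.col("a"), F.col("b")).alias("result"))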
Example 9: smvTopNRecs
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvTopNRecs(self, maxElems, *cols):
    """For each group, return the top N records according to a given ordering

    Example:
        >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

        This keeps the 3 largest amt records for each id.

    Args:
        maxElems (int): maximum number of records per group
        cols (\*str): columns defining the ordering

    Returns:
        (DataFrame): the result of taking the top records from each group
    """
    return DataFrame(self.sgd.smvTopNRecs(maxElems, smv_copy_array(self.df._sc, *cols)), self.df.sql_ctx)
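
For comparison, a plain-PySpark sketch of the same top-N-per-group idea using a window function (this is not the SMV implementation; df, id, and amt are hypothetical):

from pyspark.sql import Window, functions as F

w = Window.partitionBy("id").orderBy(F.col("amt").desc())
# Keep the 3 largest amt rows for each id
top3 = (df.withColumn("_rn", F.row_number().over(w))
          .where(F.col("_rn") <= 3)
          .drop("_rn"))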
Example 10: topNValsByFreq
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def topNValsByFreq(self, n, col):
    """Get the top N most frequent values in Column col

    Args:
        n (int): maximum number of values
        col (Column): which column to get values from

    Example:
        >>> df.topNValsByFreq(1, col("cid"))

        This returns the single most frequent value in the cid column.

    Returns:
        (list(object)): most frequent values (type depends on schema)
    """
    topNdf = DataFrame(self._jDfHelper._topNValsByFreq(n, col._jc), self._sql_ctx)
    return [list(r)[0] for r in topNdf.collect()]
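
For comparison, a plain-PySpark sketch that collects the single most frequent value of a column (df and the cid column are hypothetical):

from pyspark.sql import functions as F

top_vals = [row["cid"] for row in
            df.groupBy("cid").count()
            .orderBy(F.col("count").desc())
            .limit(1)
            .collect()]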
Example 11: smvSelectPlus
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvSelectPlus(self, *cols):
    """Select all the columns of the current DataFrame plus the supplied expressions

    The new columns are added to the end of the current column list.

    Args:
        cols (\*Column): expressions to add to the DataFrame

    Example:
        >>> df.smvSelectPlus((col("price") * col("count")).alias("amt"))

    Returns:
        (DataFrame): the resulting DataFrame after adding the new columns
    """
    jdf = self._jDfHelper.smvSelectPlus(_to_seq(cols, _jcol))
    return DataFrame(jdf, self._sql_ctx)
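
In plain PySpark the same effect can be approximated by selecting "*" alongside the new expression, or with withColumn (df, price, and count are hypothetical):

from pyspark.sql import functions as F

df_plus = df.select("*", (F.col("price") * F.col("count")).alias("amt"))
# equivalently: df.withColumn("amt", F.col("price") * F.col("count"))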
Example 12: smvDupeCheck
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvDupeCheck(self, keys, n=10000):
    """For a given list of potential keys, report duplicated records with their duplication counts and all columns.

    Null values are allowed in the potential keys, so duplication on null-valued keys is also reported.

    Args:
        keys (list(string)): the list of key columns the duplicate check is applied to
        n (integer): number of rows from the input data used for the duplication check, defaults to 10000

    Returns:
        (DataFrame): the key columns, a "_N" column, and the remaining columns, for records whose key values occur more than once;
        "_N" holds the duplication count of that record's key values
    """
    dfTopN = self.df.limit(n).cache()

    res = dfTopN.groupBy(*keys)\
        .agg(F.count(F.lit(1)).alias('_N'))\
        .where(F.col('_N') > 1)\
        .smvJoinByKey(dfTopN, keys, 'inner', True)\
        .orderBy(*keys)

    dfTopN.unpersist()
    return res
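
A plain-PySpark sketch of the same duplicate-check idea using a window count instead of the SMV join helper (df and the key list are hypothetical):

from pyspark.sql import Window, functions as F

keys = ["id"]
w = Window.partitionBy(*keys)
# Attach the per-key count, then keep only rows whose key appears more than once
dupes = (df.withColumn("_N", F.count(F.lit(1)).over(w))
           .where(F.col("_N") > 1)
           .orderBy(*keys))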
Example 13: smvPlusWeeks
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvPlusWeeks(self, delta):
    """Add N weeks to a `Timestamp` or `Date` column

    Args:
        delta (int or Column): the number of weeks to add

    Example:
        >>> df.select(col("dob").smvPlusWeeks(3))

    Returns:
        (Column): TimestampType. The incremented Timestamp, or null if the input is null.

    **Note**: even if the input is DateType, the output is TimestampType.
    """
    if isinstance(delta, int):
        jdelta = delta
    elif isinstance(delta, Column):
        jdelta = delta._jc
    else:
        raise RuntimeError("delta parameter must be either an int or a Column")
    jc = self._jColumnHelper.smvPlusWeeks(jdelta)
    return Column(jc)
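
For comparison, a plain-PySpark sketch that adds weeks to a date column without the SMV helper (the DataFrame and its dob column are made up for illustration):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2020-01-15",)], ["dob"]).select(F.to_date("dob").alias("dob"))
# date_add shifts a date by whole days, so 7 * N adds N weeks
df.select(F.date_add(F.col("dob"), 7 * 3).alias("dob_plus_3w")).show()

Unlike smvPlusWeeks, date_add returns a DateType rather than a TimestampType.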
Example 14: smvTimestampToStr
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvTimestampToStr(self, timezone, fmt):
    """Build a string from a timestamp and a timezone

    Args:
        timezone (string or Column): the timezone follows the rules in
            https://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html#forID-java.lang.String-
            It can be a string like "America/Los_Angeles" or "+1000". If it is null, the current system time zone is used.
        fmt (string): the format is the same as the Java `Date` format

    Example:
        >>> df.select(col("ts").smvTimestampToStr("America/Los_Angeles", "yyyy-MM-dd HH:mm:ss"))

    Returns:
        (Column): StringType. The converted string in the given format
    """
    if is_string(timezone):
        jtimezone = timezone
    elif isinstance(timezone, Column):
        jtimezone = timezone._jc
    else:
        raise RuntimeError("timezone parameter must be either a string or a Column")
    jc = self._jColumnHelper.smvTimestampToStr(jtimezone, fmt)
    return Column(jc)
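
A rough plain-PySpark sketch of rendering a timestamp as a string in a target timezone; the exact semantics may differ from the SMV helper (df and its ts column are hypothetical):

from pyspark.sql import functions as F

# Treat ts as UTC, shift it into the target zone, then format it as a string
df_str = df.select(
    F.date_format(
        F.from_utc_timestamp(F.col("ts"), "America/Los_Angeles"),
        "yyyy-MM-dd HH:mm:ss"
    ).alias("ts_la"))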
Example 15: test_smvDedupByKey_with_column
# Required module: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_smvDedupByKey_with_column(self):
    schema = "a:Integer; b:Double; c:String"
    df = self.createDF(
        schema,
        """1,2.0,hello;
        1,3.0,hello;
        2,10.0,hello2;
        2,11.0,hello3"""
    )
    r1 = df.smvDedupByKey(col("a"))
    expect = self.createDF(
        schema,
        """1,2.0,hello;
        2,10.0,hello2"""
    )
    self.should_be_same(expect, r1)
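
For comparison, plain PySpark offers dropDuplicates, which also keeps one row per key but makes no guarantee about which row survives, unlike the deterministic expectation encoded in this test:

# Keeps a single (arbitrary) row for each distinct value of column "a"
deduped = df.dropDuplicates(["a"])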