This article collects typical usage examples of the Python method pyspark.sql.functions.col. If you are wondering how exactly functions.col is used, how to call it, or what real usages look like, the curated code samples below may help. You can also read more about the module this method belongs to, pyspark.sql.functions.
The following shows 15 code examples of functions.col, sorted by popularity by default.
Example 1: smvPlusYears
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvPlusYears(self, delta):
    """Add N years to a `Timestamp` or `Date` column

    Args:
        delta (int or Column): the number of years to add

    Example:
        >>> df.select(col("dob").smvPlusYears(3))

    Returns:
        (Column): TimestampType. The incremented Timestamp, or null if the input is null.

    **Note**: even if the input is DateType, the output is TimestampType
    """
    if isinstance(delta, int):
        jdelta = delta
    elif isinstance(delta, Column):
        jdelta = delta._jc
    else:
        raise RuntimeError("delta parameter must be either an int or a Column")
    jc = self._jColumnHelper.smvPlusYears(jdelta)
    return Column(jc)
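For readers without SMV installed, a roughly similar effect can be obtained with the built-in `add_months` function. This is only a minimal sketch (the DataFrame `df` and its `dob` column are assumptions), and note that `add_months` keeps DateType for a date input instead of promoting it to TimestampType:

from pyspark.sql import functions as F

# Sketch only: a DataFrame `df` with a DateType/TimestampType column "dob" is assumed.
years_to_add = 3
df_plus = df.withColumn("dob_plus_3y", F.add_months(F.col("dob"), 12 * years_to_add))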
Example 2: test_readImages
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_readImages(self):
    # Test that reading images with a custom decode function works
    imageDF = imageIO._readImagesWithCustomFn(
        "file/path", decode_f=imageIO.PIL_decode, numPartition=2, sc=self.binaryFilesMock)
    self.assertTrue("image" in imageDF.schema.names)
    # The DF should have 2 images and 1 null.
    self.assertEqual(imageDF.count(), 3)
    validImages = imageDF.filter(col("image").isNotNull())
    self.assertEqual(validImages.count(), 2)
    img = validImages.first().image
    self.assertEqual(img.height, array.shape[0])
    self.assertEqual(img.width, array.shape[1])
    self.assertEqual(imageIO.imageTypeByOrdinal(img.mode).nChannels, array.shape[2])
    # array comes out of PIL and is in RGB order
    self.assertEqual(img.data, array.tobytes())
Example 3: for_each_item
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def for_each_item(
        col_name: str,
        items: List[_LT],
        transformer_factory: Callable[[_LT], Transformer],
        mapper=map
) -> Transformer:
    """Run a transformation for each value in a list of values"""
    # A lambda inside the list comprehension would capture `item`
    # by name; use a proper function to ensure item is captured
    # from a unique context.
    def restrict_to_item(item: _LT) -> Transformer:
        return lambda df: df.where(F.col(col_name) == item)

    transformers = [seq_transform([
        restrict_to_item(item),
        transformer_factory(item)
    ]) for item in items]
    return par_transform(transformers, mapper)
# Shared transformations
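A hypothetical usage sketch of `for_each_item` (not from the original source): the transformer factory below is just a placeholder, and `seq_transform`/`par_transform` are assumed to come from the same module as the function above.

from pyspark.sql import DataFrame

def train_per_wiki(wiki: str):
    # Placeholder transformer factory; a real one would fit a per-wiki model.
    def transform(df: DataFrame) -> DataFrame:
        return df
    return transform

per_wiki = for_each_item('wikiid', ['enwiki', 'frwiki'], train_per_wiki)
# result_df = per_wiki(input_df)  # runs the restricted pipeline once per wiki value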
Example 4: cache_to_disk
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def cache_to_disk(temp_dir: str, partition_by: str) -> Transformer:
    """Write a dataframe to disk partitioned by a column.

    Writes out the source dataframe partitioned by the provided
    column. The intention is for downstream tasks to construct
    a dataframe per partitioned value. When doing so, this allows
    the downstream data frames to read individual columns for specific
    wikis directly from disk.

    Cleaning up temp_dir is the caller's responsibility and must
    be done after all transformations have executed to completion,
    likely after closing the SparkContext.

    TODO: This emits the same number of partitions for each value of the
    partition column, while some may need 1 partition and others 1000.
    We would need count estimates to do that partitioning, though.
    """
    def transform(df: DataFrame) -> DataFrame:
        df.write.partitionBy(partition_by).parquet(temp_dir)
        return df.sql_ctx.read.parquet(temp_dir)
    return transform
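A minimal usage sketch, not part of the original module; it assumes a DataFrame `df` with a `wikiid` column and a locally writable temp location:

import os
import shutil
import tempfile

base_dir = tempfile.mkdtemp()
temp_dir = os.path.join(base_dir, 'cached_df')  # must not already exist when Spark writes
try:
    cached = cache_to_disk(temp_dir, partition_by='wikiid')(df)
    # Downstream per-wiki transformers can now read just their own partition from disk.
    print(cached.where(cached.wikiid == 'enwiki').count())
finally:
    # Clean up only after all work against `cached` has completed.
    shutil.rmtree(base_dir)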
Example 5: group_k_fold
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def group_k_fold(df, num_folds, output_column='fold'):
    """
    Generate group k-fold splits. The fold a row belongs to is
    assigned to the column identified by the output_column parameter.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    num_folds : int
    output_column : str, optional

    Returns
    -------
    pyspark.sql.DataFrame
        Input data frame with a 'fold' column indicating fold membership.
        Normalized queries are equally distributed to each fold.
    """
    return (
        split(df, [1. / num_folds] * num_folds, output_column)
        .withColumn(output_column, mjolnir.spark.add_meta(df._sc, F.col(output_column), {
            'num_folds': num_folds,
        })))
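A hypothetical usage sketch (it assumes a mjolnir-style DataFrame `df` with normalized query ids, and uses the `group_k_fold` function above):

from pyspark.sql import functions as F

with_folds = group_k_fold(df, num_folds=5)
# Hold out fold 0 for evaluation and train on the remaining folds.
train_df = with_folds.where(F.col('fold') != 0)
test_df = with_folds.where(F.col('fold') == 0)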
Example 6: test_split
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_split(spark):
    df = (
        spark
        .range(1, 100 * 100)
        # Convert into 100 "queries" with 100 values each. We need a
        # sufficiently large number of queries, or the split won't have
        # enough data for partitions to even out.
        .select(F.lit('foowiki').alias('wikiid'),
                (F.col('id') / 100).cast('int').alias('norm_query_id')))
    with_folds = mjolnir.training.tuning.split(df, (0.8, 0.2)).collect()

    fold_0 = [row for row in with_folds if row.fold == 0]
    fold_1 = [row for row in with_folds if row.fold == 1]

    # Check the folds are pretty close to the requested proportions
    total_len = float(len(with_folds))
    assert 0.8 == pytest.approx(len(fold_0) / total_len, abs=0.015)
    assert 0.2 == pytest.approx(len(fold_1) / total_len, abs=0.015)

    # Check each norm query is only found on one side of the split
    queries_in_0 = set([row.norm_query_id for row in fold_0])
    queries_in_1 = set([row.norm_query_id for row in fold_1])
    assert len(queries_in_0.intersection(queries_in_1)) == 0
Example 7: select_features
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def select_features(
        wiki: str,
        num_features: int,
        metadata: Dict
) -> mt.Transformer:
    def transform(df: DataFrame) -> DataFrame:
        # Compute the "best" features, per some metric
        sc = df.sql_ctx.sparkSession.sparkContext
        features = metadata['input_feature_meta']['features']
        selected = mjolnir.feature_engineering.select_features(
            sc, df, features, num_features, algo='mrmr')
        metadata['wiki_features'][wiki] = selected

        # Rebuild the `features` column with only the selected features
        keep_cols = metadata['default_cols'] + selected
        df_selected = df.select(*keep_cols)
        assembler = VectorAssembler(
            inputCols=selected, outputCol='features')
        return assembler.transform(df_selected).drop(*selected)
    return transform
Example 8: wrap_function_cols
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def wrap_function_cols(self, name, package_name=None, object_name=None, java_class_instance=None, doc=""):
    """Utility method for wrapping a scala/java function that returns a spark sql Column.

    This assumes that the function being wrapped takes a list of spark sql Column objects as its arguments.
    """
    def _(*cols):
        jcontainer = self.get_java_container(package_name=package_name, object_name=object_name, java_class_instance=java_class_instance)
        # Ensure that every argument is passed down as a java Column
        col_args = [col._jc if isinstance(col, Column) else _make_col(col)._jc for col in cols]
        function = getattr(jcontainer, name)
        args = col_args
        jc = function(*args)
        return Column(jc)
    _.__name__ = name
    _.__doc__ = doc
    return _
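A hypothetical usage sketch; the wrapper object (`helpers`) and the JVM package, object and function names below are invented for illustration and are not part of any real library.

# `helpers` is assumed to be an instance of the class that defines wrap_function_cols.
concat_cols = helpers.wrap_function_cols(
    "concatCols",
    package_name="com.example.spark.udfs",
    object_name="ColumnFunctions",
    doc="Concatenate the given string columns into one column.")

# df.select(concat_cols(col("first_name"), col("last_name")))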
Example 9: smvTopNRecs
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvTopNRecs(self, maxElems, *cols):
    """For each group, return the top N records according to a given ordering

    Example:
        >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

        This will keep the 3 largest amt records for each id

    Args:
        maxElems (int): maximum number of records per group
        cols (\*str): columns defining the ordering

    Returns:
        (DataFrame): result of taking top records from groups
    """
    return DataFrame(self.sgd.smvTopNRecs(maxElems, smv_copy_array(self.df._sc, *cols)), self.df.sql_ctx)
Example 10: topNValsByFreq
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def topNValsByFreq(self, n, col):
    """Get the top N most frequent values in Column col

    Args:
        n (int): maximum number of values
        col (Column): which column to get values from

    Example:
        >>> df.topNValsByFreq(1, col("cid"))

        will return the single most frequent value in the cid column

    Returns:
        (list(object)): most frequent values (type depends on schema)
    """
    topNdf = DataFrame(self._jDfHelper._topNValsByFreq(n, col._jc), self._sql_ctx)
    return [list(r)[0] for r in topNdf.collect()]
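For comparison, roughly the same result with only built-in PySpark. This is a sketch: a DataFrame `df` with a `cid` column is assumed, whereas the SMV helper above performs this work on the JVM side.

from pyspark.sql import functions as F

n = 1
top_vals = [row['cid'] for row in
            df.groupBy('cid').count().orderBy(F.desc('count')).limit(n).collect()]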
Example 11: smvSelectPlus
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvSelectPlus(self, *cols):
    """Select all the current columns of the DataFrame plus the supplied expressions

    The new columns are added to the end of the current column list.

    Args:
        cols (\*Column): expressions to add to the DataFrame

    Example:
        >>> df.smvSelectPlus((col("price") * col("count")).alias("amt"))

    Returns:
        (DataFrame): the resulting DataFrame with the new columns appended
    """
    jdf = self._jDfHelper.smvSelectPlus(_to_seq(cols, _jcol))
    return DataFrame(jdf, self._sql_ctx)
Example 12: smvDupeCheck
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvDupeCheck(self, keys, n=10000):
    """For a given list of potential keys, check for duplicated records, reporting the number of duplications along with all columns.

    Null values are allowed in the potential keys, so duplication on null-valued keys will also be reported.

    Args:
        keys (list(string)): the list of key columns the duplicate check is applied to
        n (integer): number of rows from the input data to check for duplications, defaults to 10000

    Returns:
        (DataFrame): the key columns + a "_N" column + the remaining columns, restricted to records whose
            key values occur more than once, where "_N" holds the duplication count for that record's keys
    """
    dfTopN = self.df.limit(n).cache()

    res = dfTopN.groupBy(*keys)\
        .agg(F.count(F.lit(1)).alias('_N'))\
        .where(F.col('_N') > 1)\
        .smvJoinByKey(dfTopN, keys, 'inner', True)\
        .orderBy(*keys)

    dfTopN.unpersist()

    return res
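The same idea expressed with only built-in PySpark, as a rough sketch: a DataFrame `df` and the key list `['id']` are assumptions, and unlike `smvJoinByKey` above, a plain inner join will not match rows whose key values are null.

from pyspark.sql import functions as F

keys = ['id']
sample = df.limit(10000).cache()
dupe_counts = (sample.groupBy(*keys)
               .agg(F.count(F.lit(1)).alias('_N'))
               .where(F.col('_N') > 1))
dupes = dupe_counts.join(sample, on=keys, how='inner').orderBy(*keys)
sample.unpersist()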
Example 13: smvPlusWeeks
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvPlusWeeks(self, delta):
    """Add N weeks to a `Timestamp` or `Date` column

    Args:
        delta (int or Column): the number of weeks to add

    Example:
        >>> df.select(col("dob").smvPlusWeeks(3))

    Returns:
        (Column): TimestampType. The incremented Timestamp, or null if the input is null.

    **Note**: even if the input is DateType, the output is TimestampType
    """
    if isinstance(delta, int):
        jdelta = delta
    elif isinstance(delta, Column):
        jdelta = delta._jc
    else:
        raise RuntimeError("delta parameter must be either an int or a Column")
    jc = self._jColumnHelper.smvPlusWeeks(jdelta)
    return Column(jc)
Example 14: smvTimestampToStr
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def smvTimestampToStr(self, timezone, fmt):
    """Build a string from a timestamp and a timezone

    Args:
        timezone (string or Column): the timezone follows the rules in
            https://www.joda.org/joda-time/apidocs/org/joda/time/DateTimeZone.html#forID-java.lang.String-
            It can be a string like "America/Los_Angeles" or "+1000". If it is null, the current system time zone is used.
        fmt (string): the format is the same as the Java `Date` format

    Example:
        >>> df.select(col("ts").smvTimestampToStr("America/Los_Angeles", "yyyy-MM-dd HH:mm:ss"))

    Returns:
        (Column): StringType. The converted string in the given format
    """
    if is_string(timezone):
        jtimezone = timezone
    elif isinstance(timezone, Column):
        jtimezone = timezone._jc
    else:
        raise RuntimeError("timezone parameter must be either a string or a Column")
    jc = self._jColumnHelper.smvTimestampToStr(jtimezone, fmt)
    return Column(jc)
Example 15: test_smvDedupByKey_with_column
# Required import: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import col [as alias]
def test_smvDedupByKey_with_column(self):
    schema = "a:Integer; b:Double; c:String"
    df = self.createDF(
        schema,
        """1,2.0,hello;
           1,3.0,hello;
           2,10.0,hello2;
           2,11.0,hello3"""
    )
    r1 = df.smvDedupByKey(col("a"))
    expect = self.createDF(
        schema,
        """1,2.0,hello;
           2,10.0,hello2"""
    )
    self.should_be_same(expect, r1)
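For reference, a rough built-in alternative (a sketch, not part of the test above): `dropDuplicates` keeps an arbitrary row per key, whereas the row kept by `smvDedupByKey` is determined by SMV, so the two only agree when it does not matter which duplicate survives.

# Assumes a DataFrame `df` that has a column "a" to deduplicate on.
deduped = df.dropDuplicates(['a'])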