This article collects typical usage examples of the Python method pyspark.sql.functions.explode. If you are wondering what functions.explode does, how to call it, or what it looks like in real code, the curated samples below should help. You can also explore further usage examples from the module the method lives in, pyspark.sql.functions.

The 12 code examples of functions.explode shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
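
Before the examples, here is a minimal, self-contained sketch of what explode does: it emits one output row per element of an array (or per entry of a map) column, repeating the other columns alongside it. All names and data below are made up for illustration.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# A toy DataFrame with an array column (column names are illustrative only).
df = spark.createDataFrame(
    [("q1", [10, 11]), ("q2", [12])],
    ["query", "hit_page_ids"],
)

# explode() emits one row per array element; the other columns are repeated.
exploded = df.withColumn("page_id", F.explode("hit_page_ids")).drop("hit_page_ids")
exploded.show()
# +-----+-------+
# |query|page_id|
# +-----+-------+
# |   q1|     10|
# |   q1|     11|
# |   q2|     12|
# +-----+-------+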

Example 1: resample_clicks_to_query_page

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def resample_clicks_to_query_page(
        df_cluster: DataFrame,
        random_seed: Optional[int],
        samples_per_wiki: int
) -> mt.Transformer:
    # Resamples the click log by proxy of resampling clusters, such
    # that a complete cluster is either included or excluded from the
    # resulting dataset.
    # TODO: Evaluate alternative resampling, such as perhaps only dropping from
    # clusters where all clicks were to the top result (implying an "easy" search).
    mt.check_schema(df_cluster, mt.QueryClustering)
    return mt.seq_transform([
        # Grab only the parts of the query log we need to make the resulting sampled QueryPage
        lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
        mt.join_cluster_by_query(df_cluster),
        # [1] is because sample returns a tuple of (page_counts, df)
        mt.temp_rename_col('cluster_id', 'norm_query_id', lambda df: mjolnir.sampling.sample(
            df, random_seed, samples_per_wiki)[1]),
        lambda df: df.withColumn(
            'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
    ])

Example 2: _flatten_dataset

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def _flatten_dataset(ds):
    '''Flattens the original hierarchical data schema into a simple row-based
    schema. Some less useful data are excluded.

    Parameters
    ----------
    ds : dataset
        the original Spark dataset

    Returns
    -------
    dataset
        flattened dataset
    '''
    ds = ds.withColumn("structures", explode(ds.result.structures))
    return ds.select(col("query.ac"), col("result.sequence"), \
                     col("structures.from"), col("structures.to"), \
                     col("structures.qmean"), col("structures.qmean_norm"), \
                     col("structures.gmqe"), col("structures.coverage"), \
                     col("structures.oligo-state"), col("structures.method"), \
                     col("structures.template"), col("structures.identity"), \
                     col("structures.similarity"), col("structures.coordinates"), \
                     col("result.md5"), col("structures.md5"))
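
Example 2 combines two steps: explode an array-of-structs column, then reach into the resulting struct with dotted column paths. A stripped-down sketch with invented field names and values:

from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import col, explode

spark = SparkSession.builder.getOrCreate()

# Toy nested rows: each record carries a list of structure entries (names are illustrative).
df = spark.createDataFrame([
    Row(ac="P12345", structures=[Row(qmean=-2.1, coverage=0.9),
                                 Row(qmean=-3.4, coverage=0.5)]),
])

# Explode the array, then select individual struct fields via dotted paths.
flat = (df.withColumn("structures", explode("structures"))
          .select(col("ac"), col("structures.qmean"), col("structures.coverage")))
flat.show()
# +------+-----+--------+
# |    ac|qmean|coverage|
# +------+-----+--------+
# |P12345| -2.1|     0.9|
# |P12345| -3.4|     0.5|
# +------+-----+--------+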

Example 3: cluster_within_norm_query_groups

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def cluster_within_norm_query_groups(df: DataFrame) -> DataFrame:
    make_groups = F.udf(_make_query_groups, T.ArrayType(T.StructType([
        T.StructField('query', T.StringType(), nullable=False),
        T.StructField('norm_query_group_id', T.IntegerType(), nullable=False),
    ])))
    return (
        df
        .groupBy('wikiid', 'norm_query')
        .agg(F.collect_list(F.struct('query', 'hit_page_ids')).alias('source'))
        .select(
            'wikiid', 'norm_query',
            F.explode(make_groups('source')).alias('group'))
        .select('wikiid', 'norm_query', 'group.query', 'group.norm_query_group_id'))

Example 4: with_unique_cluster_id

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def with_unique_cluster_id(df: DataFrame) -> DataFrame:
    return (
        df
        .groupby('wikiid', 'norm_query', 'norm_query_group_id')
        .agg(F.collect_list('query').alias('queries'))
        .select(
            'wikiid', 'queries',
            F.monotonically_increasing_id().alias('cluster_id'))
        .select('wikiid', F.explode('queries').alias('query'), 'cluster_id'))
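
Examples 3 and 4 rely on a common pairing: F.collect_list gathers rows into a per-group array so that a group-level value (here a cluster id) can be attached, and F.explode then unpacks the array back into one row per original element. A minimal round-trip sketch with invented data and column names:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Invented click-log rows: several queries per wiki.
df = spark.createDataFrame(
    [("enwiki", "apple pie"), ("enwiki", "apple pies"), ("enwiki", "banana")],
    ["wikiid", "query"],
)

clusters = (
    df
    .groupBy("wikiid")
    .agg(F.collect_list("query").alias("queries"))        # rows -> one array per group
    .withColumn("cluster_id", F.monotonically_increasing_id())
    .select("wikiid", F.explode("queries").alias("query"), "cluster_id"))  # array -> rows again
clusters.show()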

Example 5: __get_entity_to_chain_id

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def __get_entity_to_chain_id():
    # get entityID to strandId mapping
    query = "SELECT pdbid, entity_id, pdbx_strand_id FROM entity_poly"
    mapping: DataFrame = pdbjMineDataset.get_dataset(query)

    # split the one-to-many relationship into multiple records: 'A,B' -> [A, B] -> explode into separate rows
    mapping = mapping.withColumn("chainId", split(mapping.pdbx_strand_id, ","))
    mapping = mapping.withColumn("chainId", explode("chainId"))

    # create a structureChainId column, e.g. 1XYZ + A -> 1XYZ.A
    mapping = mapping.withColumn("pdbChainId", concat_ws(".", mapping.structureId, mapping.chainId))

    return mapping.select(mapping.entity_id, mapping.structureId, mapping.pdbChainId)
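
The split-then-explode pattern in Example 5 is useful whenever one column packs several values into a single delimited string. A minimal sketch with made-up data (the real code obtains its DataFrame from pdbjMineDataset):

from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, concat_ws

spark = SparkSession.builder.getOrCreate()

# Toy data: one row per entity, chains packed as a comma-separated string (illustrative values).
df = spark.createDataFrame([("1XYZ", "A,B"), ("2ABC", "C")], ["structureId", "pdbx_strand_id"])

df = (df.withColumn("chainId", explode(split("pdbx_strand_id", ",")))
        .withColumn("pdbChainId", concat_ws(".", "structureId", "chainId")))
df.show()
# +-----------+--------------+-------+----------+
# |structureId|pdbx_strand_id|chainId|pdbChainId|
# +-----------+--------------+-------+----------+
# |       1XYZ|           A,B|      A|    1XYZ.A|
# |       1XYZ|           A,B|      B|    1XYZ.B|
# |       2ABC|             C|      C|    2ABC.C|
# +-----------+--------------+-------+----------+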

Example 6: _flatten_dataframe

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def _flatten_dataframe(df):
    # Note: exploding two array columns one after the other produces a row
    # for every combination of their elements.
    return df.withColumn("pdbPosition", explode(col("residueMapping.pdbPosition"))) \
             .withColumn("pdbAminoAcid", explode(col("residueMapping.pdbAminoAcid")))

Example 7: flatten_dataset

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def flatten_dataset(dataset: DataFrame):
    tmp = dataset
    for field in tmp.schema.fields:
        if isinstance(field.dataType, ArrayType):
            print(field.name, field.dataType)
            # col(field.name) resolves the column by its name string;
            # tmp.field.name would look for a column literally named "field".
            tmp = tmp.withColumn(field.name, explode(col(field.name)))
    return tmp

Example 8: _flatten_dataframe

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def _flatten_dataframe(df):
    return df.withColumn("variationId", explode(df.hits._id)) \
             .select(col("variationId"), col("uniprotId"))

Example 9: get_distinct_keys

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def get_distinct_keys(df, col_name, is_col_arr_map=False):
    """ Return a list of distinct keys.

    Set is_col_arr_map to True if the column is an array of maps;
    otherwise the column is assumed to be a map.
    """
    if is_col_arr_map:
        df = df.select(explode(col_name).alias(col_name))
    df = df.select(explode(map_keys(col_name)))
    return df.distinct().rdd.flatMap(lambda x: x).collect()
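
Example 9 works on map-typed columns rather than arrays: map_keys pulls the keys out as an array, and explode turns that array into rows. A small sketch with invented data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, map_keys

spark = SparkSession.builder.getOrCreate()

# Toy rows whose "props" column is a map (illustrative only).
df = spark.createDataFrame(
    [({"color": "red", "size": "L"},), ({"color": "blue"},)],
    ["props"],
)

# One row per map key, then de-duplicate and pull the values back to the driver.
keys = (df.select(explode(map_keys("props")))
          .distinct()
          .rdd.flatMap(lambda x: x)
          .collect())
print(keys)  # e.g. ['color', 'size']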

Example 10: ms_explode_addons

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def ms_explode_addons(ms):
    """
    Explodes the active_addons object in the ms DataFrame
    and selects the relevant fields.

    :param ms: a subset of main_summary
    :return: SparkDF
    """
    addons_df = (
        ms.select(MS_FIELDS + [fun.explode("active_addons").alias("addons")])
        .select(MS_FIELDS + ADDON_FIELDS)
        .withColumn("app_version", fun.substring("app_version", 1, 2))
    )
    return addons_df

Example 11: transform

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def transform(longitudinal_addons):
    # Only for logging, not used, but may be interesting for later analysis.
    guid_set_unique = (
        longitudinal_addons.withColumn(
            "exploded", F.explode(longitudinal_addons.installed_addons)
        )
        .select("exploded")  # noqa: E501 - long lines
        .rdd.flatMap(lambda x: x)
        .distinct()
        .collect()
    )
    logging.info(
        "Number of unique guids co-installed in sample: " + str(len(guid_set_unique))
    )

    restructured = longitudinal_addons.rdd.flatMap(
        lambda x: key_all(x.installed_addons)
    ).toDF(["key_addon", "coinstalled_addons"])

    # Explode the list of co-installs and count pair occurrences.
    addon_co_installations = (
        restructured.select(
            "key_addon", F.explode("coinstalled_addons").alias("coinstalled_addon")
        )  # noqa: E501 - long lines
        .groupBy("key_addon", "coinstalled_addon")
        .count()
    )

    # Collect the set of (coinstalled_addon, count) pairs for each key_addon.
    combine_and_map_cols = F.udf(
        lambda x, y: (x, y),
        StructType([StructField("id", StringType()), StructField("n", LongType())]),
    )

    # Spark functions are sometimes long and unwieldy. Tough luck.
    # Ignore E128 and E501 long line errors
    addon_co_installations_collapsed = (
        addon_co_installations.select(  # noqa: E128
            "key_addon",
            combine_and_map_cols("coinstalled_addon", "count").alias(  # noqa: E501
                "id_n"
            ),
        )
        .groupby("key_addon")
        .agg(F.collect_list("id_n").alias("coinstallation_counts"))
    )
    logging.info(addon_co_installations_collapsed.printSchema())
    logging.info("Collecting final result of co-installations.")
    return addon_co_installations_collapsed

Example 12: run

# Required imports: from pyspark.sql import functions [as alias]
# Or: from pyspark.sql.functions import explode [as alias]
def run(self, initial_scaffolds):
    randomized_scaffold_udf = psf.udf(self._generate_func, pst.ArrayType(pst.StringType()))
    get_attachment_points_udf = psf.udf(usc.get_attachment_points, pst.ArrayType(pst.IntegerType()))
    remove_attachment_point_numbers_udf = psf.udf(usc.remove_attachment_point_numbers, pst.StringType())

    results_df = self._initialize_results(initial_scaffolds)
    scaffolds_df = results_df.select("smiles", "scaffold", "decorations")
    i = 0
    while scaffolds_df.count() > 0:
        # generate randomized SMILES
        self._log("info", "Starting iteration #%d.", i)
        scaffolds_df = scaffolds_df.withColumn("randomized_scaffold", randomized_scaffold_udf("smiles"))\
            .select(
                "smiles", "scaffold", "decorations",
                psf.explode("randomized_scaffold").alias("randomized_scaffold"))\
            .withColumn("attachment_points", get_attachment_points_udf("randomized_scaffold"))\
            .withColumn("randomized_scaffold", remove_attachment_point_numbers_udf("randomized_scaffold"))\
            .withColumn("id", psf.monotonically_increasing_id())\
            .persist()
        self._log("info", "Generated %d randomized SMILES from %d scaffolds.",
                  scaffolds_df.count(), scaffolds_df.select("smiles").distinct().count())

        # sample each randomized scaffold N times
        scaffolds = scaffolds_df.select("id", "randomized_scaffold")\
            .rdd.map(lambda row: (row["id"], row["randomized_scaffold"])).toLocalIterator()
        self._sample_and_write_scaffolds_to_disk(scaffolds, scaffolds_df.count())
        self._log("info", "Sampled %d scaffolds.", scaffolds_df.count())

        # merge decorated molecules
        joined_df = self._join_results(scaffolds_df).persist()
        if joined_df.count() > 0:
            self._log("info", "Joined %d -> %d (valid) -> %d unique sampled scaffolds",
                      scaffolds_df.count(), joined_df.agg(psf.sum("count")).head()[0], joined_df.count())

        scaffolds_df = joined_df.join(results_df, on="smiles", how="left_anti")\
            .select("smiles", "scaffold", "decorations")\
            .where("smiles LIKE '%*%'")
        self._log("info", "Obtained %d scaffolds for next iteration.", scaffolds_df.count())

        results_df = results_df.union(joined_df)\
            .groupBy("smiles")\
            .agg(
                psf.first("scaffold").alias("scaffold"),
                psf.first("decorations").alias("decorations"),
                psf.sum("count").alias("count"))\
            .persist()
        i += 1
    return results_df