This article collects typical usage examples of pyspark.sql.DataFrame in Python. If you have been wondering what sql.DataFrame does, how to call it, or what real code that uses it looks like, the curated examples below should help. You can also read further about the enclosing module, pyspark.sql, for context.
The following 15 code examples of sql.DataFrame are shown, ordered roughly by popularity.
Example 1: append_features
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def append_features(df, *cols):
"""Append features from columns to the features vector.
Parameters
----------
df : pyspark.sql.DataFrame
cols : list of str
Returns
-------
pyspark.sql.DataFrame
"""
def add_features(feat, *other):
raw = feat.toArray()
return Vectors.dense(np.append(raw, list(map(float, other))))
add_features_udf = F.udf(add_features, VectorUDT())
    new_feat_list = df.schema['features'].metadata['features'] + list(cols)  # cols arrives as a tuple
return df.withColumn('features', mjolnir.spark.add_meta(
df._sc, add_features_udf('features', *cols), {'features': new_feat_list}))
Example 2: zero_features
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def zero_features(df, *feature_names):
"""Zero out features in the feature vector.
Parameters
----------
df : pyspark.sql.DataFrame
feature_names : list of str
Returns
-------
pyspark.sql.DataFrame
"""
features = df.schema['features'].metadata['features']
idxs = [features.index(name) for name in feature_names]
def zero_features(feat):
raw = feat.toArray()
for idx in idxs:
raw[idx] = 0.
return Vectors.dense(raw)
zero_features_udf = F.udf(zero_features, VectorUDT())
return df.withColumn('features', mjolnir.spark.add_meta(
df._sc, zero_features_udf('features'), {'features': features}))
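A minimal usage sketch for the two helpers above, assuming the module-level imports they rely on (numpy, pyspark.sql.functions as F, the pyspark.ml.linalg vector types and the mjolnir package) are available; the feature-name metadata both helpers read is attached by hand here via Column.alias:

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense([12.3, 0.7]), 2.0), (Vectors.dense([4.1, 0.2]), 5.0)],
    ['features', 'incoming_links'])

# Both helpers look up feature names in the column metadata, the same
# metadata that mjolnir.spark.add_meta writes.
df = df.withColumn(
    'features',
    F.col('features').alias('features', metadata={'features': ['bm25', 'popularity']}))

df = append_features(df, 'incoming_links')  # vector grows to [bm25, popularity, incoming_links]
df = zero_features(df, 'popularity')        # popularity entries forced to 0.0
print(df.schema['features'].metadata['features'])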
Example 3: explode_features
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def explode_features(df, features=None):
"""Convert feature vector into individual columns
Parameters
----------
df : pyspark.sql.DataFrame
features : list of str or None
Returns
-------
pyspark.sql.DataFrame
"""
if features is None:
features = df.schema['features'].metadata['features']
def extract_feature(features, idx):
return float(features[idx])
extract_feature_udf = F.udf(extract_feature, pyspark.sql.types.FloatType())
cols = [extract_feature_udf('features', F.lit(idx)).alias(name) for idx, name in enumerate(features)]
return df.select('*', *cols)
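A quick sketch of calling explode_features with an explicit feature-name list, which skips the metadata lookup entirely (assumes the snippet above and its module imports are in scope):

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, Vectors.dense([22.5, 0.3])), (2, Vectors.dense([18.1, 0.9]))],
    ['page_id', 'features'])

# Adds one FloatType column per feature name next to the original columns.
explode_features(df, features=['bm25', 'popularity']).show()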
Example 4: resample_clicks_to_query_page
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def resample_clicks_to_query_page(
df_cluster: DataFrame,
random_seed: Optional[int],
samples_per_wiki: int
) -> mt.Transformer:
# Resamples the click log by proxy of resampling clusters, such
# that a complete cluster is either included or excluded from the
# resulting dataset.
# TODO: Evaluate alternative resampling, such as perhaps only dropping from
# clusters where all clicks were to the top result (implying an "easy" search).
mt.check_schema(df_cluster, mt.QueryClustering)
return mt.seq_transform([
# Grab only the parts of the query log we need to make the resulting sampled QueryPage
lambda df: df.select('query', 'wikiid', 'session_id', 'hit_page_ids'),
mt.join_cluster_by_query(df_cluster),
# [1] is because sample returns a tuple of (page_counts, df)
mt.temp_rename_col('cluster_id', 'norm_query_id', lambda df: mjolnir.sampling.sample(
df, random_seed, samples_per_wiki)[1]),
lambda df: df.withColumn(
'page_id', F.explode('hit_page_ids')).drop('hit_page_ids')
])
Example 5: transform
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def transform(
query_clicks: HivePartition,
query_clustering: HivePartition,
samples_per_wiki: int,
random_seed: Optional[int],
wikis: List[str],
brokers: str,
topic_request: str,
topic_response: str,
feature_set: str,
**kwargs
) -> DataFrame:
transformer = mt.seq_transform([
mt.restrict_wikis(wikis),
resample_clicks_to_query_page(
query_clustering.df, random_seed, samples_per_wiki),
feature_vectors.transformer(
brokers, topic_request, topic_response, feature_set)
])
return transformer(query_clicks.df)
Example 6: require_output_table
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def require_output_table(
self, partition_spec_spec, metadata_fn=None,
mode='overwrite',
):
@self._post_process_transform.append
def post(df: DataFrame, kwargs: Dict):
mt.write_partition(
df, kwargs['output_table'], kwargs['output_path'],
self._resolve_partition_spec(kwargs, partition_spec_spec),
mode=mode)
if metadata_fn is not None:
spark = df.sql_ctx.sparkSession
metadata = metadata_fn(spark.read.parquet(kwargs['output_path']))
write_metadata(kwargs['output_path'], metadata)
self.add_argument('--output-table', required=True)
self.add_argument('--output-path', required=True)
Example 7: collect_features
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def collect_features(
kafka_config: ClientConfig, feature_set: str
) -> mt.Transformer:
def transform(df: DataFrame) -> DataFrame:
df_features, fnames_accu = mjolnir.features.collect(
df,
model='featureset:' + feature_set,
brokers=kafka_config,
indices=mt.ContentIndices())
# Collect the accumulator to get feature names
df_features.cache().count()
# Future transformations have to be extra careful to not lose this metadata
return _add_meta(df_features, 'features', {
'feature_set': feature_set,
'features': _check_features(fnames_accu),
'collected_at': datetime.datetime.now().isoformat()
})
return transform
Example 8: select_features
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def select_features(
wiki: str,
num_features: int,
metadata: Dict
) -> mt.Transformer:
def transform(df: DataFrame) -> DataFrame:
# Compute the "best" features, per some metric
sc = df.sql_ctx.sparkSession.sparkContext
features = metadata['input_feature_meta']['features']
selected = mjolnir.feature_engineering.select_features(
sc, df, features, num_features, algo='mrmr')
metadata['wiki_features'][wiki] = selected
# Rebuild the `features` col with only the selected features
keep_cols = metadata['default_cols'] + selected
df_selected = df.select(*keep_cols)
assembler = VectorAssembler(
inputCols=selected, outputCol='features')
return assembler.transform(df_selected).drop(*selected)
return transform
Example 9: transformer
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def transformer(
df_label: DataFrame,
temp_dir: str,
wikis: List[str],
num_features: int
) -> mt.Transformer:
mt.check_schema(df_label, mt.LabeledQueryPage)
# Hack to transfer metadata between transformations. This is populated in
# time since `select_features` does direct computation of the features.
metadata = cast(Dict, {'wiki_features': {}})
return mt.seq_transform([
mt.restrict_wikis(wikis),
mt.join_labels(df_label),
explode_features(metadata),
mt.cache_to_disk(temp_dir, partition_by='wikiid'),
mt.for_each_item('wikiid', wikis, lambda wiki: select_features(
wiki, num_features, metadata)),
attach_feature_metadata(metadata),
# While we used the labels for selecting features, they are not part of the feature vectors.
# Allow them to be joined with any other label set for export to training.
lambda df: df.drop('cluster_id', 'label'),
lambda df: df.repartition(200, 'wikiid', 'query'),
])
Example 10: convert_svmrank_to_xgboost
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
def convert_one(row: Row) -> Row:
# For now place the .xgb right next to the svmrank files. Naming/path
# options could be added if needed later.
out_path = row.path + '.xgb'
_convert_xgboost_remote(row.path, out_path)
return Row(**dict(
row.asDict(),
vec_format='xgboost',
path=out_path))
# Each row represents potentially gigabytes, convince spark
# to create a partition per row.
rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema) # type: ignore
# Return both the xgb and svmrank datasets since
# we aren't purging the related files. df is safe to reuse since
# svmrank conversion returns a new dataframe with no lineage.
return df.union(df_xgb)
Example 11: test_gaussian_mixture_summary
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def test_gaussian_mixture_summary(self):
data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
(Vectors.sparse(1, [], []),)]
df = self.spark.createDataFrame(data, ["features"])
gmm = GaussianMixture(k=2)
model = gmm.fit(df)
self.assertTrue(model.hasSummary)
s = model.summary
self.assertTrue(isinstance(s.predictions, DataFrame))
self.assertEqual(s.probabilityCol, "probability")
self.assertTrue(isinstance(s.probability, DataFrame))
self.assertEqual(s.featuresCol, "features")
self.assertEqual(s.predictionCol, "prediction")
self.assertTrue(isinstance(s.cluster, DataFrame))
self.assertEqual(len(s.clusterSizes), 2)
self.assertEqual(s.k, 2)
self.assertEqual(s.numIter, 3)
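For reference, the same summary can be inspected outside the test harness; a small standalone sketch using the public pyspark.ml API on toy data:

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import GaussianMixture

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),)],
    ['features'])

model = GaussianMixture(k=2, seed=1).fit(df)
summary = model.summary           # available because the model was just fit
summary.predictions.show()        # per-row prediction and probability, as a DataFrame
print(summary.clusterSizes, summary.k)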
Example 12: assignClusters
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def assignClusters(self, dataset):
"""
Run the PIC algorithm and returns a cluster assignment for each input vertex.
:param dataset:
A dataset with columns src, dst, weight representing the affinity matrix,
which is the matrix A in the PIC paper. Suppose the src column value is i,
the dst column value is j, the weight column value is similarity s,,ij,,
which must be nonnegative. This is a symmetric matrix and hence
s,,ij,, = s,,ji,,. For any (i, j) with nonzero similarity, there should be
either (i, j, s,,ij,,) or (j, i, s,,ji,,) in the input. Rows with i = j are
ignored, because we assume s,,ij,, = 0.0.
:return:
A dataset that contains columns of vertex id and the corresponding cluster for
the id. The schema of it will be:
- id: Long
- cluster: Int
.. versionadded:: 2.4.0
"""
self._transfer_params_to_java()
jdf = self._java_obj.assignClusters(dataset._jdf)
return DataFrame(jdf, dataset.sql_ctx)
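In PySpark 2.4+ the public entry point for this method is pyspark.ml.clustering.PowerIterationClustering; a minimal sketch:

from pyspark.sql import SparkSession
from pyspark.ml.clustering import PowerIterationClustering

spark = SparkSession.builder.getOrCreate()

# Affinity matrix as (src, dst, weight) rows; one direction per pair is enough.
affinities = spark.createDataFrame(
    [(0, 1, 1.0), (1, 2, 1.0), (0, 2, 1.0), (3, 4, 1.0), (4, 5, 1.0)],
    ['src', 'dst', 'weight'])

pic = PowerIterationClustering(k=2, maxIter=10, weightCol='weight')
assignments = pic.assignClusters(affinities)  # DataFrame with columns id and cluster
assignments.orderBy('id').show()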
Example 13: _py2java
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def _py2java(sc, obj):
""" Convert Python object into Java """
if isinstance(obj, RDD):
obj = _to_java_object_rdd(obj)
elif isinstance(obj, DataFrame):
obj = obj._jdf
elif isinstance(obj, SparkContext):
obj = obj._jsc
elif isinstance(obj, list):
obj = [_py2java(sc, x) for x in obj]
elif isinstance(obj, JavaObject):
pass
elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
pass
else:
data = bytearray(PickleSerializer().dumps(obj))
obj = sc._jvm.org.apache.spark.ml.python.MLSerDe.loads(data)
return obj
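A short illustration of the Spark-native branches above; it assumes the rest of the module this helper was copied from (notably _to_java_object_rdd and the Python 2/3 aliases for long and unicode) is importable:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
df = spark.createDataFrame([(1, 'a')], ['id', 'label'])

# DataFrames and RDDs are unwrapped to their JVM handles rather than pickled.
jdf = _py2java(sc, df)       # the py4j object behind the DataFrame (df._jdf)
jrdd = _py2java(sc, df.rdd)  # converted through _to_java_object_rdd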
Example 14: _prepare
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def _prepare(cls, ratings):
if isinstance(ratings, RDD):
pass
elif isinstance(ratings, DataFrame):
ratings = ratings.rdd
else:
raise TypeError("Ratings should be represented by either an RDD or a DataFrame, "
"but got %s." % type(ratings))
first = ratings.first()
if isinstance(first, Rating):
pass
elif isinstance(first, (tuple, list)):
ratings = ratings.map(lambda x: Rating(*x))
else:
raise TypeError("Expect a Rating or a tuple/list, but got %s." % type(first))
return ratings
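In the upstream pyspark.mllib.recommendation module this helper lives on ALS and is called by ALS.train, so either an RDD of Rating objects or an equivalent DataFrame can be passed to training; a sketch:

from pyspark.sql import SparkSession
from pyspark.mllib.recommendation import ALS, Rating

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

ratings_rdd = sc.parallelize([Rating(0, 0, 4.0), Rating(0, 1, 2.0), Rating(1, 1, 3.0)])
ratings_df = spark.createDataFrame(ratings_rdd)  # columns: user, product, rating

# Both forms are accepted; the DataFrame is converted back to an RDD of Rating rows.
model = ALS.train(ratings_df, rank=5, iterations=5, seed=42)
print(model.predict(0, 1))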
Example 15: _py2java
# Required import: from pyspark import sql [as alias]
# Or: from pyspark.sql import DataFrame [as alias]
def _py2java(sc, obj):
""" Convert Python object into Java """
if isinstance(obj, RDD):
obj = _to_java_object_rdd(obj)
elif isinstance(obj, DataFrame):
obj = obj._jdf
elif isinstance(obj, SparkContext):
obj = obj._jsc
elif isinstance(obj, list):
obj = [_py2java(sc, x) for x in obj]
elif isinstance(obj, JavaObject):
pass
elif isinstance(obj, (int, long, float, bool, bytes, unicode)):
pass
else:
data = bytearray(PickleSerializer().dumps(obj))
obj = sc._jvm.org.apache.spark.mllib.api.python.SerDe.loads(data)
return obj