This article collects typical usage examples of the Python method pyspark.sql.functions.rand. If you have been wondering what functions.rand does, how to use it, or what it looks like in real code, the curated examples below may help. Note that pyspark.sql.functions is a module, not a class; you can also explore usage examples for its other methods.
Below are 4 code examples of functions.rand, ordered by popularity.
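Before the examples, a minimal sketch of the method itself may help; the local SparkSession setup here is an assumption made for illustration and is not part of the examples below:

from pyspark.sql import SparkSession
from pyspark.sql.functions import rand

# Hypothetical setup: a small local session just to demonstrate rand.
spark = SparkSession.builder.master("local[2]").appName("rand-demo").getOrCreate()

# rand(seed=None) produces a column of independent uniform samples from [0.0, 1.0).
# With a seed, the values are reproducible for the same data partitioning.
df = spark.range(5).withColumn("u", rand(seed=42))
df.show()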
Example 1: cross_validation_split
# Required import: from pyspark.sql import functions [as alias]
# or: from pyspark.sql.functions import rand [as alias]
def cross_validation_split(dataset, k_folds):
    """
    Splits a DataFrame into k_folds folds, returning a list of DataFrames.
    """
    dataset_split = []
    h = 1.0 / k_folds
    # Attach a column of uniform randoms in [0, 1); each row's value decides its fold.
    df = dataset.select("*", rand().alias("rand"))
    for i in range(k_folds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df["rand"] >= validateLB) & (df["rand"] < validateUB)
        fold = df.filter(condition).cache()
        dataset_split.append(fold)
    return dataset_split
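A hypothetical call, assuming a DataFrame df is already available. Because the fold boundaries are cut from a uniform random column, fold sizes are only approximately equal, and each returned fold still carries the helper "rand" column:

folds = cross_validation_split(df, k_folds=5)
for i, fold in enumerate(folds):
    print(i, fold.count())  # counts vary slightly around df.count() / 5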
Example 2: _transform
# Required import: from pyspark.sql import functions [as alias]
# or: from pyspark.sql.functions import rand [as alias]
def _transform(self, dataset):
    # rand(0) uses a fixed seed, so the induced error is reproducible
    # for the same data partitioning.
    return dataset.withColumn("prediction",
                              dataset.feature + (rand(0) * self.getInducedError()))
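The same column expression can be written outside the transformer; in this minimal sketch, df and the inducedError value are assumptions standing in for the transformer's state:

from pyspark.sql.functions import rand

inducedError = 1.0  # stand-in for self.getInducedError()
noisy = df.withColumn("prediction", df["feature"] + rand(0) * inducedError)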
Example 3: _fit
# Required import: from pyspark.sql import functions [as alias]
# or: from pyspark.sql.functions import rand [as alias]
# Also used here: numpy as np, multiprocessing.pool.ThreadPool, and the
# pyspark.ml.tuning helpers _parallelFitTasks and CrossValidatorModel
def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    # A seeded random column assigns each row to one of nFolds equal-width buckets.
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels
    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]
    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()
        # Fit every candidate param map on this fold in parallel.
        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j] += (metric / nFolds)
            if collectSubModelsParam:
                subModels[i][j] = subModel
        validation.unpersist()
        train.unpersist()
    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    # Refit the best param map on the full dataset.
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels))
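The logic above mirrors pyspark.ml.tuning.CrossValidator, which is normally driven through its public API. A minimal sketch with stock pyspark.ml components, where the column names, grid values, and train_df are assumptions:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3,
                    seed=42)
cvModel = cv.fit(train_df)  # train_df: assumed DataFrame with "features"/"label" columns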
Example 4: shuffle
# Required import: from pyspark.sql import functions [as alias]
# or: from pyspark.sql.functions import rand [as alias]
def shuffle(dataset):
    """Shuffles the rows in the specified Spark DataFrame.

    # Arguments
        dataset: dataframe. A Spark DataFrame.
    """
    # orderBy(rand()) sorts on random keys, producing a globally shuffled row order.
    dataset = dataset.orderBy(rand())
    dataset.cache()
    return dataset
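Note that orderBy(rand()) performs a full sort on random keys, i.e. a global shuffle, which can be expensive on large inputs; the cache() only pays off if the shuffled result is reused. A hypothetical usage, with df assumed:

shuffled = shuffle(df)
shuffled.count()  # materializes the cached, shuffled DataFrame for later reuse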