本文整理匯總了Python中pyspark.sql.functions.rand方法的典型用法代碼示例。如果您正苦於以下問題:Python functions.rand方法的具體用法?Python functions.rand怎麼用?Python functions.rand使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在模塊pyspark.sql.functions的用法示例。
在下文中一共展示了functions.rand方法的4個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: cross_validation_split
# 需要導入模塊: from pyspark.sql import functions [as 別名]
# 或者: from pyspark.sql.functions import rand [as 別名]
def cross_validation_split(dataset, k_folds, seed=None):
    """Split a Spark DataFrame into ``k_folds`` random folds.

    Every row is tagged with a value drawn by ``rand`` and placed in the
    fold whose half-open interval ``[i * h, (i + 1) * h)`` contains that
    value, where ``h = 1.0 / k_folds``.

    Args:
        dataset: Spark DataFrame to split.
        k_folds: Number of folds to produce.
        seed: Optional seed forwarded to ``rand``. When ``None`` (the
            default) ``rand()`` is called without a seed, exactly as before.
            NOTE(review): a fixed seed only reproduces the same split when
            the partitioning/order of ``dataset`` is itself stable.

    Returns:
        A list of ``k_folds`` cached DataFrames. Each fold still carries the
        helper column ``"rand"`` that was used for the assignment.
    """
    fold_width = 1.0 / k_folds
    # Only pass a seed through when the caller supplied one, so the default
    # call path is unchanged.
    random_col = rand() if seed is None else rand(seed)
    tagged = dataset.select("*", random_col.alias("rand"))

    folds = []
    for fold_index in range(k_folds):
        lower = fold_index * fold_width
        upper = (fold_index + 1) * fold_width
        in_fold = (tagged["rand"] >= lower) & (tagged["rand"] < upper)
        folds.append(tagged.filter(in_fold).cache())
    return folds
示例2: _transform
# 需要導入模塊: from pyspark.sql import functions [as 別名]
# 或者: from pyspark.sql.functions import rand [as 別名]
def _transform(self, dataset):
    """Return ``dataset`` with an added ``"prediction"`` column.

    The prediction is the ``feature`` column plus ``rand(0)`` scaled by
    ``self.getInducedError()``. The seed passed to ``rand`` is the literal 0.
    """
    base = dataset.feature
    noise = rand(0) * self.getInducedError()
    return dataset.withColumn("prediction", base + noise)
示例3: _fit
# 需要導入模塊: from pyspark.sql import functions [as 別名]
# 或者: from pyspark.sql.functions import rand [as 別名]
def _fit(self, dataset):
    """Cross-validate every param map, then refit the best one on ``dataset``.

    Rows are assigned to folds through a seeded ``rand`` column named
    ``self.uid + "_rand"``. For each fold, the rows inside the fold's interval
    form the validation set and all remaining rows form the training set.
    The metric of each param map is averaged over the folds; the best index
    is picked with ``argmax`` or ``argmin`` depending on
    ``evaluator.isLargerBetter()``.

    Args:
        dataset: Spark DataFrame to fit on.

    Returns:
        The result of ``self._copyValues`` applied to a
        ``CrossValidatorModel`` built from the best model, the averaged
        metrics and the collected sub-models (``None`` unless
        ``getCollectSubModels()`` is truthy).
    """
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels

    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]

    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    try:
        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()
            try:
                tasks = _parallelFitTasks(est, train, eva, validation, epm,
                                          collectSubModelsParam)
                # Results arrive in completion order; each one carries the
                # index j of the param map it belongs to.
                for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
                    metrics[j] += (metric / nFolds)
                    if collectSubModelsParam:
                        subModels[i][j] = subModel
            finally:
                # Release the cached folds even when a task raised.
                validation.unpersist()
                train.unpersist()
    finally:
        # Let the worker threads exit instead of leaking them per call.
        pool.close()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels))
示例4: shuffle
# 需要導入模塊: from pyspark.sql import functions [as 別名]
# 或者: from pyspark.sql.functions import rand [as 別名]
def shuffle(dataset):
    """Shuffles the rows in the specified Spark Dataframe.

    The rows are ordered by a freshly generated ``rand()`` column and
    ``cache()`` is called on the reordered dataframe before it is returned.

    # Arguments
        dataset: dataframe. A Spark Dataframe.

    # Returns
        The dataframe produced by ``orderBy(rand())``.
    """
    shuffled = dataset.orderBy(rand())
    shuffled.cache()
    return shuffled