此页面包含配方预处理蓝图的详细信息。如果 x
是配方,则这是 mold()
默认使用的蓝图。
用法
default_recipe_blueprint(
intercept = FALSE,
allow_novel_levels = FALSE,
fresh = TRUE,
strings_as_factors = TRUE,
composition = "tibble"
)
# S3 method for recipe
mold(x, data, ..., blueprint = NULL)
参数
- intercept
-
一个逻辑值。处理后的数据中是否应该包含截距?该信息由
mold
和forge
函数列表中的process
函数使用。 - allow_novel_levels
-
一个逻辑值。在预测时是否应该允许新的因子水平?此信息由
forge
函数列表中的clean
函数使用,并传递给scream()
。 - fresh
-
当调用
prep()
时是否应该重新训练已经训练过的操作? - strings_as_factors
-
调用
prep()
时是否应该将字符列转换为因子? - composition
-
"tibble"、"matrix" 或 "dgCMatrix" 用于已处理预测变量的格式。如果选择 "matrix" 或 "dgCMatrix",则在应用预处理方法后,所有预测变量都必须为数值;否则会抛出错误。
- x
-
从
recipes::recipe()
创建的未准备的配方。 - data
-
包含结果和预测变量的 DataFrame 或矩阵。
- ...
-
未使用。
- blueprint
-
预处理
blueprint
。如果保留为NULL
,则使用default_recipe_blueprint()
。
Mold
当mold()
与默认配方蓝图一起使用时:
-
它调用
recipes::prep()
来准备配方。 -
它调用
recipes::juice()
来提取结果和预测变量。这些作为 tibbles 返回。 -
如果
intercept = TRUE
,则向预测变量添加截距列。
Forge
当forge()
与默认配方蓝图一起使用时:
-
它调用
shrink()
将new_data
修剪为仅所需的列,并将new_data
强制为 tibble。 -
它调用
scream()
对new_data
的列结构进行验证。 -
它使用训练期间使用的准备配方在
new_data
上调用recipes::bake()
。 -
如果
intercept = TRUE
,它将截距列添加到new_data
上。
例子
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: ‘recipes’
#> The following object is masked from ‘package:stats’:
#>
#> step
# ---------------------------------------------------------------------------
# Setup
train <- iris[1:100, ]
test <- iris[101:150, ]
# ---------------------------------------------------------------------------
# Recipes example
# Create a recipe that logs a predictor
rec <- recipe(Species ~ Sepal.Length + Sepal.Width, train) %>%
step_log(Sepal.Length)
processed <- mold(rec, train)
# Sepal.Length has been logged
processed$predictors
#> # A tibble: 100 × 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 1.63 3.5
#> 2 1.59 3
#> 3 1.55 3.2
#> 4 1.53 3.1
#> 5 1.61 3.6
#> 6 1.69 3.9
#> 7 1.53 3.4
#> 8 1.61 3.4
#> 9 1.48 2.9
#> 10 1.59 3.1
#> # ℹ 90 more rows
processed$outcomes
#> # A tibble: 100 × 1
#> Species
#> <fct>
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
#> 6 setosa
#> 7 setosa
#> 8 setosa
#> 9 setosa
#> 10 setosa
#> # ℹ 90 more rows
# The underlying blueprint is a prepped recipe
processed$blueprint$recipe
#>
#> ── Recipe ────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 2
#>
#> ── Training information
#> Training data contained 100 data points and no incomplete rows.
#>
#> ── Operations
#> • Log transformation on: Sepal.Length | Trained
# Call forge() with the blueprint and the test data
# to have it preprocess the test data in the same way
forge(test, processed$blueprint)
#> $predictors
#> # A tibble: 50 × 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 1.84 3.3
#> 2 1.76 2.7
#> 3 1.96 3
#> 4 1.84 2.9
#> 5 1.87 3
#> 6 2.03 3
#> 7 1.59 2.5
#> 8 1.99 2.9
#> 9 1.90 2.5
#> 10 1.97 3.6
#> # ℹ 40 more rows
#>
#> $outcomes
#> NULL
#>
#> $extras
#> $extras$roles
#> NULL
#>
#>
# Use `outcomes = TRUE` to also extract the preprocessed outcome!
# This logged the Sepal.Length column of `new_data`
forge(test, processed$blueprint, outcomes = TRUE)
#> $predictors
#> # A tibble: 50 × 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 1.84 3.3
#> 2 1.76 2.7
#> 3 1.96 3
#> 4 1.84 2.9
#> 5 1.87 3
#> 6 2.03 3
#> 7 1.59 2.5
#> 8 1.99 2.9
#> 9 1.90 2.5
#> 10 1.97 3.6
#> # ℹ 40 more rows
#>
#> $outcomes
#> # A tibble: 50 × 1
#> Species
#> <fct>
#> 1 virginica
#> 2 virginica
#> 3 virginica
#> 4 virginica
#> 5 virginica
#> 6 virginica
#> 7 virginica
#> 8 virginica
#> 9 virginica
#> 10 virginica
#> # ℹ 40 more rows
#>
#> $extras
#> $extras$roles
#> NULL
#>
#>
# ---------------------------------------------------------------------------
# With an intercept
# You can add an intercept with `intercept = TRUE`
processed <- mold(rec, train, blueprint = default_recipe_blueprint(intercept = TRUE))
processed$predictors
#> # A tibble: 100 × 3
#> `(Intercept)` Sepal.Length Sepal.Width
#> <int> <dbl> <dbl>
#> 1 1 1.63 3.5
#> 2 1 1.59 3
#> 3 1 1.55 3.2
#> 4 1 1.53 3.1
#> 5 1 1.61 3.6
#> 6 1 1.69 3.9
#> 7 1 1.53 3.4
#> 8 1 1.61 3.4
#> 9 1 1.48 2.9
#> 10 1 1.59 3.1
#> # ℹ 90 more rows
# But you also could have used a recipe step
rec2 <- step_intercept(rec)
mold(rec2, iris)$predictors
#> # A tibble: 150 × 3
#> intercept Sepal.Length Sepal.Width
#> <int> <dbl> <dbl>
#> 1 1 1.63 3.5
#> 2 1 1.59 3
#> 3 1 1.55 3.2
#> 4 1 1.53 3.1
#> 5 1 1.61 3.6
#> 6 1 1.69 3.9
#> 7 1 1.53 3.4
#> 8 1 1.61 3.4
#> 9 1 1.48 2.9
#> 10 1 1.59 3.1
#> # ℹ 140 more rows
# ---------------------------------------------------------------------------
# Matrix output for predictors
# You can change the `composition` of the predictor data set
bp <- default_recipe_blueprint(composition = "dgCMatrix")
processed <- mold(rec, train, blueprint = bp)
class(processed$predictors)
#> [1] "dgCMatrix"
#> attr(,"package")
#> [1] "Matrix"
# ---------------------------------------------------------------------------
# Non standard roles
# If you have custom recipes roles, they are assumed to be required at
# `bake()` time when passing in `new_data`. This is an assumption that both
# recipes and hardhat make, meaning that those roles are required at
# `forge()` time as well.
rec_roles <- recipe(train) %>%
update_role(Sepal.Width, new_role = "predictor") %>%
update_role(Species, new_role = "outcome") %>%
update_role(Sepal.Length, new_role = "id") %>%
update_role(Petal.Length, new_role = "important")
processed_roles <- mold(rec_roles, train)
# The custom roles will be in the `mold()` result in case you need
# them for modeling.
processed_roles$extras
#> $roles
#> $roles$id
#> # A tibble: 100 × 1
#> Sepal.Length
#> <dbl>
#> 1 5.1
#> 2 4.9
#> 3 4.7
#> 4 4.6
#> 5 5
#> 6 5.4
#> 7 4.6
#> 8 5
#> 9 4.4
#> 10 4.9
#> # ℹ 90 more rows
#>
#> $roles$important
#> # A tibble: 100 × 1
#> Petal.Length
#> <dbl>
#> 1 1.4
#> 2 1.4
#> 3 1.3
#> 4 1.5
#> 5 1.4
#> 6 1.7
#> 7 1.4
#> 8 1.5
#> 9 1.4
#> 10 1.5
#> # ℹ 90 more rows
#>
#> $roles$`NA`
#> # A tibble: 100 × 1
#> Petal.Width
#> <dbl>
#> 1 0.2
#> 2 0.2
#> 3 0.2
#> 4 0.2
#> 5 0.2
#> 6 0.4
#> 7 0.3
#> 8 0.2
#> 9 0.2
#> 10 0.1
#> # ℹ 90 more rows
#>
#>
# And they are in the `forge()` result
forge(test, processed_roles$blueprint)$extras
#> $roles
#> $roles$id
#> # A tibble: 50 × 1
#> Sepal.Length
#> <dbl>
#> 1 6.3
#> 2 5.8
#> 3 7.1
#> 4 6.3
#> 5 6.5
#> 6 7.6
#> 7 4.9
#> 8 7.3
#> 9 6.7
#> 10 7.2
#> # ℹ 40 more rows
#>
#> $roles$important
#> # A tibble: 50 × 1
#> Petal.Length
#> <dbl>
#> 1 6
#> 2 5.1
#> 3 5.9
#> 4 5.6
#> 5 5.8
#> 6 6.6
#> 7 4.5
#> 8 6.3
#> 9 5.8
#> 10 6.1
#> # ℹ 40 more rows
#>
#> $roles$`NA`
#> # A tibble: 50 × 1
#> Petal.Width
#> <dbl>
#> 1 2.5
#> 2 1.9
#> 3 2.1
#> 4 1.8
#> 5 2.2
#> 6 2.1
#> 7 1.7
#> 8 1.8
#> 9 1.8
#> 10 2.5
#> # ℹ 40 more rows
#>
#>
# If you remove a column with a custom role from the test data, then you
# won't be able to `forge()` even though this recipe technically didn't
# use that column in any steps
test2 <- test
test2$Petal.Length <- NULL
try(forge(test2, processed_roles$blueprint))
#> Error in validate_column_names(data, cols) :
#> The following required columns are missing: 'Petal.Length'.
# Most of the time, if you find yourself in the above scenario, then we
# suggest that you remove `Petal.Length` from the data that is supplied to
# the recipe. If that isn't an option, you can declare that that column
# isn't required at `bake()` time by using `update_role_requirements()`
rec_roles <- update_role_requirements(rec_roles, "important", bake = FALSE)
processed_roles <- mold(rec_roles, train)
forge(test2, processed_roles$blueprint)
#> $predictors
#> # A tibble: 50 × 1
#> Sepal.Width
#> <dbl>
#> 1 3.3
#> 2 2.7
#> 3 3
#> 4 2.9
#> 5 3
#> 6 3
#> 7 2.5
#> 8 2.9
#> 9 2.5
#> 10 3.6
#> # ℹ 40 more rows
#>
#> $outcomes
#> NULL
#>
#> $extras
#> $extras$roles
#> $extras$roles$id
#> # A tibble: 50 × 1
#> Sepal.Length
#> <dbl>
#> 1 6.3
#> 2 5.8
#> 3 7.1
#> 4 6.3
#> 5 6.5
#> 6 7.6
#> 7 4.9
#> 8 7.3
#> 9 6.7
#> 10 7.2
#> # ℹ 40 more rows
#>
#> $extras$roles$important
#> # A tibble: 50 × 0
#>
#> $extras$roles$`NA`
#> # A tibble: 50 × 1
#> Petal.Width
#> <dbl>
#> 1 2.5
#> 2 1.9
#> 3 2.1
#> 4 1.8
#> 5 2.2
#> 6 2.1
#> 7 1.7
#> 8 1.8
#> 9 1.8
#> 10 2.5
#> # ℹ 40 more rows
#>
#>
#>
相关用法
- R hardhat default_formula_blueprint 默认公式蓝图
- R hardhat default_xy_blueprint 默认 XY 蓝图
- R hardhat delete_response 从术语对象中删除响应
- R hardhat validate_prediction_size 确保预测具有正确的行数
- R hardhat is_blueprint x 是预处理蓝图吗?
- R hardhat validate_column_names 确保数据包含所需的列名
- R hardhat update_blueprint 更新预处理蓝图
- R hardhat weighted_table 加权表
- R hardhat validate_outcomes_are_univariate 确保结果是单变量
- R hardhat get_levels 从 DataFrame 中提取因子水平
- R hardhat add_intercept_column 向数据添加截距列
- R hardhat is_frequency_weights x 是频率权重向量吗?
- R hardhat model_offset 提取模型偏移
- R hardhat standardize 标准化结果
- R hardhat model_matrix 构建设计矩阵
- R hardhat is_importance_weights x 是重要性权重向量吗?
- R hardhat run-mold 根据蓝图 Mold()
- R hardhat get_data_classes 从 DataFrame 或矩阵中提取数据类
- R hardhat fct_encode_one_hot 将一个因子编码为 one-hot 指标矩阵
- R hardhat new_frequency_weights 构建频率权重向量
- R hardhat validate_no_formula_duplication 确保公式中不出现重复项
- R hardhat shrink 仅对所需列进行子集化
- R hardhat validate_outcomes_are_numeric 确保结果都是数字
- R hardhat scream 😱 尖叫。
- R hardhat frequency_weights 频率权重
注:本文由纯净天空筛选整理自Davis Vaughan等大神的英文原创作品 Default recipe blueprint。非经特殊声明,原始代码版权归原作者所有,本译文未经允许或授权,请勿转载或复制。