R hardhat default_recipe_blueprint 默认配方蓝图

此页面包含配方预处理蓝图的详细信息。如果 x 是配方，则这是 mold() 默认使用的蓝图。

用法

default_recipe_blueprint(
  intercept = FALSE,
  allow_novel_levels = FALSE,
  fresh = TRUE,
  strings_as_factors = TRUE,
  composition = "tibble"
)

# S3 method for recipe
mold(x, data, ..., blueprint = NULL)

参数

intercept: 逻辑值。处理后的数据中是否应包含截距？此信息由 mold 和 forge 函数列表中的 process 函数使用。
allow_novel_levels: 逻辑值。预测时是否允许出现新的因子水平？此信息由 forge 函数列表中的 clean 函数使用，并传递给 scream()。
fresh: 当调用 prep() 时是否应该重新训练已经训练过的操作？
strings_as_factors: 调用 prep() 时是否应该将字符列转换为因子？
composition: "tibble"、"matrix" 或 "dgCMatrix" 用于已处理预测变量的格式。如果选择 "matrix" 或 "dgCMatrix"，则在应用预处理方法后，所有预测变量都必须为数值；否则会抛出错误。
x: 由 recipes::recipe() 创建的、尚未经过 prep() 处理的配方。
data: 包含结果变量和预测变量的数据框（data frame）或矩阵。
...: 未使用。
blueprint: 预处理blueprint。如果保留为NULL，则使用default_recipe_blueprint()。

返回值

对于 default_recipe_blueprint()，返回一个配方蓝图。

Mold

当mold()与默认配方蓝图一起使用时：

它调用 recipes::prep() 来准备配方。
它调用 recipes::juice() 来提取结果和预测变量。这些作为 tibbles 返回。
如果 intercept = TRUE ，则向预测变量添加截距列。

Forge

当forge()与默认配方蓝图一起使用时：

它调用 shrink() 将 new_data 修剪为仅所需的列，并将 new_data 强制为 tibble。
它调用 scream() 对 new_data 的列结构进行验证。
它使用训练期间使用的准备配方在 new_data 上调用 recipes::bake()。
如果 intercept = TRUE ，它将截距列添加到 new_data 上。

例子

library(recipes)
#> Loading required package: dplyr
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
#> 
#> Attaching package: ‘recipes’
#> The following object is masked from ‘package:stats’:
#> 
#>     step

# ---------------------------------------------------------------------------
# Setup

train <- iris[1:100, ]
test <- iris[101:150, ]

# ---------------------------------------------------------------------------
# Recipes example

# Create a recipe that logs a predictor
rec <- recipe(Species ~ Sepal.Length + Sepal.Width, train) %>%
  step_log(Sepal.Length)

processed <- mold(rec, train)

# Sepal.Length has been logged
processed$predictors
#> # A tibble: 100 × 2
#>    Sepal.Length Sepal.Width
#>           <dbl>       <dbl>
#>  1         1.63         3.5
#>  2         1.59         3  
#>  3         1.55         3.2
#>  4         1.53         3.1
#>  5         1.61         3.6
#>  6         1.69         3.9
#>  7         1.53         3.4
#>  8         1.61         3.4
#>  9         1.48         2.9
#> 10         1.59         3.1
#> # ℹ 90 more rows

processed$outcomes
#> # A tibble: 100 × 1
#>    Species
#>    <fct>  
#>  1 setosa 
#>  2 setosa 
#>  3 setosa 
#>  4 setosa 
#>  5 setosa 
#>  6 setosa 
#>  7 setosa 
#>  8 setosa 
#>  9 setosa 
#> 10 setosa 
#> # ℹ 90 more rows

# The underlying blueprint is a prepped recipe
processed$blueprint$recipe
#> 
#> ── Recipe ────────────────────────────────────────────────────────────────
#> 
#> ── Inputs 
#> Number of variables by role
#> outcome:   1
#> predictor: 2
#> 
#> ── Training information 
#> Training data contained 100 data points and no incomplete rows.
#> 
#> ── Operations 
#> • Log transformation on: Sepal.Length | Trained

# Call forge() with the blueprint and the test data
# to have it preprocess the test data in the same way
forge(test, processed$blueprint)
#> $predictors
#> # A tibble: 50 × 2
#>    Sepal.Length Sepal.Width
#>           <dbl>       <dbl>
#>  1         1.84         3.3
#>  2         1.76         2.7
#>  3         1.96         3  
#>  4         1.84         2.9
#>  5         1.87         3  
#>  6         2.03         3  
#>  7         1.59         2.5
#>  8         1.99         2.9
#>  9         1.90         2.5
#> 10         1.97         3.6
#> # ℹ 40 more rows
#> 
#> $outcomes
#> NULL
#> 
#> $extras
#> $extras$roles
#> NULL
#> 
#> 

# Use `outcomes = TRUE` to also extract the preprocessed outcome!
# This logged the Sepal.Length column of `new_data`
forge(test, processed$blueprint, outcomes = TRUE)
#> $predictors
#> # A tibble: 50 × 2
#>    Sepal.Length Sepal.Width
#>           <dbl>       <dbl>
#>  1         1.84         3.3
#>  2         1.76         2.7
#>  3         1.96         3  
#>  4         1.84         2.9
#>  5         1.87         3  
#>  6         2.03         3  
#>  7         1.59         2.5
#>  8         1.99         2.9
#>  9         1.90         2.5
#> 10         1.97         3.6
#> # ℹ 40 more rows
#> 
#> $outcomes
#> # A tibble: 50 × 1
#>    Species  
#>    <fct>    
#>  1 virginica
#>  2 virginica
#>  3 virginica
#>  4 virginica
#>  5 virginica
#>  6 virginica
#>  7 virginica
#>  8 virginica
#>  9 virginica
#> 10 virginica
#> # ℹ 40 more rows
#> 
#> $extras
#> $extras$roles
#> NULL
#> 
#> 

# ---------------------------------------------------------------------------
# With an intercept

# You can add an intercept with `intercept = TRUE`
processed <- mold(rec, train, blueprint = default_recipe_blueprint(intercept = TRUE))

processed$predictors
#> # A tibble: 100 × 3
#>    `(Intercept)` Sepal.Length Sepal.Width
#>            <int>        <dbl>       <dbl>
#>  1             1         1.63         3.5
#>  2             1         1.59         3  
#>  3             1         1.55         3.2
#>  4             1         1.53         3.1
#>  5             1         1.61         3.6
#>  6             1         1.69         3.9
#>  7             1         1.53         3.4
#>  8             1         1.61         3.4
#>  9             1         1.48         2.9
#> 10             1         1.59         3.1
#> # ℹ 90 more rows

# But you also could have used a recipe step
rec2 <- step_intercept(rec)

mold(rec2, iris)$predictors
#> # A tibble: 150 × 3
#>    intercept Sepal.Length Sepal.Width
#>        <int>        <dbl>       <dbl>
#>  1         1         1.63         3.5
#>  2         1         1.59         3  
#>  3         1         1.55         3.2
#>  4         1         1.53         3.1
#>  5         1         1.61         3.6
#>  6         1         1.69         3.9
#>  7         1         1.53         3.4
#>  8         1         1.61         3.4
#>  9         1         1.48         2.9
#> 10         1         1.59         3.1
#> # ℹ 140 more rows

# ---------------------------------------------------------------------------
# Matrix output for predictors

# You can change the `composition` of the predictor data set
bp <- default_recipe_blueprint(composition = "dgCMatrix")
processed <- mold(rec, train, blueprint = bp)
class(processed$predictors)
#> [1] "dgCMatrix"
#> attr(,"package")
#> [1] "Matrix"

# ---------------------------------------------------------------------------
# Non standard roles

# If you have custom recipes roles, they are assumed to be required at
# `bake()` time when passing in `new_data`. This is an assumption that both
# recipes and hardhat make, meaning that those roles are required at
# `forge()` time as well.
rec_roles <- recipe(train) %>%
  update_role(Sepal.Width, new_role = "predictor") %>%
  update_role(Species, new_role = "outcome") %>%
  update_role(Sepal.Length, new_role = "id") %>%
  update_role(Petal.Length, new_role = "important")

processed_roles <- mold(rec_roles, train)

# The custom roles will be in the `mold()` result in case you need
# them for modeling.
processed_roles$extras
#> $roles
#> $roles$id
#> # A tibble: 100 × 1
#>    Sepal.Length
#>           <dbl>
#>  1          5.1
#>  2          4.9
#>  3          4.7
#>  4          4.6
#>  5          5  
#>  6          5.4
#>  7          4.6
#>  8          5  
#>  9          4.4
#> 10          4.9
#> # ℹ 90 more rows
#> 
#> $roles$important
#> # A tibble: 100 × 1
#>    Petal.Length
#>           <dbl>
#>  1          1.4
#>  2          1.4
#>  3          1.3
#>  4          1.5
#>  5          1.4
#>  6          1.7
#>  7          1.4
#>  8          1.5
#>  9          1.4
#> 10          1.5
#> # ℹ 90 more rows
#> 
#> $roles$`NA`
#> # A tibble: 100 × 1
#>    Petal.Width
#>          <dbl>
#>  1         0.2
#>  2         0.2
#>  3         0.2
#>  4         0.2
#>  5         0.2
#>  6         0.4
#>  7         0.3
#>  8         0.2
#>  9         0.2
#> 10         0.1
#> # ℹ 90 more rows
#> 
#> 

# And they are in the `forge()` result
forge(test, processed_roles$blueprint)$extras
#> $roles
#> $roles$id
#> # A tibble: 50 × 1
#>    Sepal.Length
#>           <dbl>
#>  1          6.3
#>  2          5.8
#>  3          7.1
#>  4          6.3
#>  5          6.5
#>  6          7.6
#>  7          4.9
#>  8          7.3
#>  9          6.7
#> 10          7.2
#> # ℹ 40 more rows
#> 
#> $roles$important
#> # A tibble: 50 × 1
#>    Petal.Length
#>           <dbl>
#>  1          6  
#>  2          5.1
#>  3          5.9
#>  4          5.6
#>  5          5.8
#>  6          6.6
#>  7          4.5
#>  8          6.3
#>  9          5.8
#> 10          6.1
#> # ℹ 40 more rows
#> 
#> $roles$`NA`
#> # A tibble: 50 × 1
#>    Petal.Width
#>          <dbl>
#>  1         2.5
#>  2         1.9
#>  3         2.1
#>  4         1.8
#>  5         2.2
#>  6         2.1
#>  7         1.7
#>  8         1.8
#>  9         1.8
#> 10         2.5
#> # ℹ 40 more rows
#> 
#> 

# If you remove a column with a custom role from the test data, then you
# won't be able to `forge()` even though this recipe technically didn't
# use that column in any steps
test2 <- test
test2$Petal.Length <- NULL
try(forge(test2, processed_roles$blueprint))
#> Error in validate_column_names(data, cols) : 
#>   The following required columns are missing: 'Petal.Length'.

# Most of the time, if you find yourself in the above scenario, then we
# suggest that you remove `Petal.Length` from the data that is supplied to
# the recipe. If that isn't an option, you can declare that that column
# isn't required at `bake()` time by using `update_role_requirements()`
rec_roles <- update_role_requirements(rec_roles, "important", bake = FALSE)
processed_roles <- mold(rec_roles, train)
forge(test2, processed_roles$blueprint)
#> $predictors
#> # A tibble: 50 × 1
#>    Sepal.Width
#>          <dbl>
#>  1         3.3
#>  2         2.7
#>  3         3  
#>  4         2.9
#>  5         3  
#>  6         3  
#>  7         2.5
#>  8         2.9
#>  9         2.5
#> 10         3.6
#> # ℹ 40 more rows
#> 
#> $outcomes
#> NULL
#> 
#> $extras
#> $extras$roles
#> $extras$roles$id
#> # A tibble: 50 × 1
#>    Sepal.Length
#>           <dbl>
#>  1          6.3
#>  2          5.8
#>  3          7.1
#>  4          6.3
#>  5          6.5
#>  6          7.6
#>  7          4.9
#>  8          7.3
#>  9          6.7
#> 10          7.2
#> # ℹ 40 more rows
#> 
#> $extras$roles$important
#> # A tibble: 50 × 0
#> 
#> $extras$roles$`NA`
#> # A tibble: 50 × 1
#>    Petal.Width
#>          <dbl>
#>  1         2.5
#>  2         1.9
#>  3         2.1
#>  4         1.8
#>  5         2.2
#>  6         2.1
#>  7         1.7
#>  8         1.8
#>  9         1.8
#> 10         2.5
#> # ℹ 40 more rows
#> 
#> 
#>

源代码：R/blueprint-recipe-default.R、R/mold.R

相关用法

注：本文由纯净天空筛选整理自Davis Vaughan等大神的英文原创作品 Default recipe blueprint。非经特殊声明，原始代码版权归原作者所有，本译文未经允许或授权，请勿转载或复制。