此頁面包含配方預處理藍圖的詳細信息。如果 x
是配方,則這是 mold()
默認使用的藍圖。
用法
default_recipe_blueprint(
intercept = FALSE,
allow_novel_levels = FALSE,
fresh = TRUE,
strings_as_factors = TRUE,
composition = "tibble"
)
# S3 method for recipe
mold(x, data, ..., blueprint = NULL)
參數
- intercept
-
一個邏輯值。處理後的數據中是否應該包含截距?該信息由
mold
和forge
函數列表中的process
函數使用。 - allow_novel_levels
-
一個邏輯值。在預測時是否應該允許新的因子水平?此信息由
forge
函數列表中的clean
函數使用,並傳遞給scream()
。 - fresh
-
當調用
prep()
時是否應該重新訓練已經訓練過的操作? - strings_as_factors
-
調用
prep()
時是否應該將字符列轉換為因子? - composition
-
"tibble"、"matrix" 或 "dgCMatrix" 用於已處理預測變量的格式。如果選擇 "matrix" 或 "dgCMatrix",則在應用預處理方法後,所有預測變量都必須為數值;否則會拋出錯誤。
- x
-
從
recipes::recipe()
創建的未準備的配方。 - data
-
包含結果和預測變量的 DataFrame 或矩陣。
- ...
-
未使用。
- blueprint
-
預處理
blueprint
。如果保留為NULL
,則使用default_recipe_blueprint()
。
Mold(模具)
當mold()
與默認配方藍圖一起使用時:
-
它調用
recipes::prep()
來準備配方。 -
它調用
recipes::juice()
來提取結果和預測變量。這些作為 tibbles 返回。 -
如果
intercept = TRUE
,則向預測變量添加截距列。
Forge(鍛造)
當forge()
與默認配方藍圖一起使用時:
-
它調用
shrink()
將new_data
修剪為僅所需的列,並將new_data
強制為 tibble。 -
它調用
scream()
對new_data
的列結構進行驗證。 -
它使用訓練期間使用的準備配方在
new_data
上調用recipes::bake()
。 -
如果
intercept = TRUE
,它將截距列添加到new_data
上。
例子
library(recipes)
#> Loading required package: dplyr
#>
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#>
#> filter, lag
#> The following objects are masked from ‘package:base’:
#>
#> intersect, setdiff, setequal, union
#>
#> Attaching package: ‘recipes’
#> The following object is masked from ‘package:stats’:
#>
#> step
# ---------------------------------------------------------------------------
# Setup
train <- iris[1:100, ]
test <- iris[101:150, ]
# ---------------------------------------------------------------------------
# Recipes example
# Create a recipe that logs a predictor
rec <- recipe(Species ~ Sepal.Length + Sepal.Width, train) %>%
step_log(Sepal.Length)
processed <- mold(rec, train)
# Sepal.Length has been logged
processed$predictors
#> # A tibble: 100 × 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 1.63 3.5
#> 2 1.59 3
#> 3 1.55 3.2
#> 4 1.53 3.1
#> 5 1.61 3.6
#> 6 1.69 3.9
#> 7 1.53 3.4
#> 8 1.61 3.4
#> 9 1.48 2.9
#> 10 1.59 3.1
#> # ℹ 90 more rows
processed$outcomes
#> # A tibble: 100 × 1
#> Species
#> <fct>
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
#> 6 setosa
#> 7 setosa
#> 8 setosa
#> 9 setosa
#> 10 setosa
#> # ℹ 90 more rows
# The underlying blueprint is a prepped recipe
processed$blueprint$recipe
#>
#> ── Recipe ────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 2
#>
#> ── Training information
#> Training data contained 100 data points and no incomplete rows.
#>
#> ── Operations
#> • Log transformation on: Sepal.Length | Trained
# Call forge() with the blueprint and the test data
# to have it preprocess the test data in the same way
forge(test, processed$blueprint)
#> $predictors
#> # A tibble: 50 × 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 1.84 3.3
#> 2 1.76 2.7
#> 3 1.96 3
#> 4 1.84 2.9
#> 5 1.87 3
#> 6 2.03 3
#> 7 1.59 2.5
#> 8 1.99 2.9
#> 9 1.90 2.5
#> 10 1.97 3.6
#> # ℹ 40 more rows
#>
#> $outcomes
#> NULL
#>
#> $extras
#> $extras$roles
#> NULL
#>
#>
# Use `outcomes = TRUE` to also extract the preprocessed outcome!
# This logged the Sepal.Length column of `new_data`
forge(test, processed$blueprint, outcomes = TRUE)
#> $predictors
#> # A tibble: 50 × 2
#> Sepal.Length Sepal.Width
#> <dbl> <dbl>
#> 1 1.84 3.3
#> 2 1.76 2.7
#> 3 1.96 3
#> 4 1.84 2.9
#> 5 1.87 3
#> 6 2.03 3
#> 7 1.59 2.5
#> 8 1.99 2.9
#> 9 1.90 2.5
#> 10 1.97 3.6
#> # ℹ 40 more rows
#>
#> $outcomes
#> # A tibble: 50 × 1
#> Species
#> <fct>
#> 1 virginica
#> 2 virginica
#> 3 virginica
#> 4 virginica
#> 5 virginica
#> 6 virginica
#> 7 virginica
#> 8 virginica
#> 9 virginica
#> 10 virginica
#> # ℹ 40 more rows
#>
#> $extras
#> $extras$roles
#> NULL
#>
#>
# ---------------------------------------------------------------------------
# With an intercept
# You can add an intercept with `intercept = TRUE`
processed <- mold(rec, train, blueprint = default_recipe_blueprint(intercept = TRUE))
processed$predictors
#> # A tibble: 100 × 3
#> `(Intercept)` Sepal.Length Sepal.Width
#> <int> <dbl> <dbl>
#> 1 1 1.63 3.5
#> 2 1 1.59 3
#> 3 1 1.55 3.2
#> 4 1 1.53 3.1
#> 5 1 1.61 3.6
#> 6 1 1.69 3.9
#> 7 1 1.53 3.4
#> 8 1 1.61 3.4
#> 9 1 1.48 2.9
#> 10 1 1.59 3.1
#> # ℹ 90 more rows
# But you also could have used a recipe step
rec2 <- step_intercept(rec)
mold(rec2, iris)$predictors
#> # A tibble: 150 × 3
#> intercept Sepal.Length Sepal.Width
#> <int> <dbl> <dbl>
#> 1 1 1.63 3.5
#> 2 1 1.59 3
#> 3 1 1.55 3.2
#> 4 1 1.53 3.1
#> 5 1 1.61 3.6
#> 6 1 1.69 3.9
#> 7 1 1.53 3.4
#> 8 1 1.61 3.4
#> 9 1 1.48 2.9
#> 10 1 1.59 3.1
#> # ℹ 140 more rows
# ---------------------------------------------------------------------------
# Matrix output for predictors
# You can change the `composition` of the predictor data set
bp <- default_recipe_blueprint(composition = "dgCMatrix")
processed <- mold(rec, train, blueprint = bp)
class(processed$predictors)
#> [1] "dgCMatrix"
#> attr(,"package")
#> [1] "Matrix"
# ---------------------------------------------------------------------------
# Non standard roles
# If you have custom recipes roles, they are assumed to be required at
# `bake()` time when passing in `new_data`. This is an assumption that both
# recipes and hardhat make, meaning that those roles are required at
# `forge()` time as well.
rec_roles <- recipe(train) %>%
update_role(Sepal.Width, new_role = "predictor") %>%
update_role(Species, new_role = "outcome") %>%
update_role(Sepal.Length, new_role = "id") %>%
update_role(Petal.Length, new_role = "important")
processed_roles <- mold(rec_roles, train)
# The custom roles will be in the `mold()` result in case you need
# them for modeling.
processed_roles$extras
#> $roles
#> $roles$id
#> # A tibble: 100 × 1
#> Sepal.Length
#> <dbl>
#> 1 5.1
#> 2 4.9
#> 3 4.7
#> 4 4.6
#> 5 5
#> 6 5.4
#> 7 4.6
#> 8 5
#> 9 4.4
#> 10 4.9
#> # ℹ 90 more rows
#>
#> $roles$important
#> # A tibble: 100 × 1
#> Petal.Length
#> <dbl>
#> 1 1.4
#> 2 1.4
#> 3 1.3
#> 4 1.5
#> 5 1.4
#> 6 1.7
#> 7 1.4
#> 8 1.5
#> 9 1.4
#> 10 1.5
#> # ℹ 90 more rows
#>
#> $roles$`NA`
#> # A tibble: 100 × 1
#> Petal.Width
#> <dbl>
#> 1 0.2
#> 2 0.2
#> 3 0.2
#> 4 0.2
#> 5 0.2
#> 6 0.4
#> 7 0.3
#> 8 0.2
#> 9 0.2
#> 10 0.1
#> # ℹ 90 more rows
#>
#>
# And they are in the `forge()` result
forge(test, processed_roles$blueprint)$extras
#> $roles
#> $roles$id
#> # A tibble: 50 × 1
#> Sepal.Length
#> <dbl>
#> 1 6.3
#> 2 5.8
#> 3 7.1
#> 4 6.3
#> 5 6.5
#> 6 7.6
#> 7 4.9
#> 8 7.3
#> 9 6.7
#> 10 7.2
#> # ℹ 40 more rows
#>
#> $roles$important
#> # A tibble: 50 × 1
#> Petal.Length
#> <dbl>
#> 1 6
#> 2 5.1
#> 3 5.9
#> 4 5.6
#> 5 5.8
#> 6 6.6
#> 7 4.5
#> 8 6.3
#> 9 5.8
#> 10 6.1
#> # ℹ 40 more rows
#>
#> $roles$`NA`
#> # A tibble: 50 × 1
#> Petal.Width
#> <dbl>
#> 1 2.5
#> 2 1.9
#> 3 2.1
#> 4 1.8
#> 5 2.2
#> 6 2.1
#> 7 1.7
#> 8 1.8
#> 9 1.8
#> 10 2.5
#> # ℹ 40 more rows
#>
#>
# If you remove a column with a custom role from the test data, then you
# won't be able to `forge()` even though this recipe technically didn't
# use that column in any steps
test2 <- test
test2$Petal.Length <- NULL
try(forge(test2, processed_roles$blueprint))
#> Error in validate_column_names(data, cols) :
#> The following required columns are missing: 'Petal.Length'.
# Most of the time, if you find yourself in the above scenario, then we
# suggest that you remove `Petal.Length` from the data that is supplied to
# the recipe. If that isn't an option, you can declare that that column
# isn't required at `bake()` time by using `update_role_requirements()`
rec_roles <- update_role_requirements(rec_roles, "important", bake = FALSE)
processed_roles <- mold(rec_roles, train)
forge(test2, processed_roles$blueprint)
#> $predictors
#> # A tibble: 50 × 1
#> Sepal.Width
#> <dbl>
#> 1 3.3
#> 2 2.7
#> 3 3
#> 4 2.9
#> 5 3
#> 6 3
#> 7 2.5
#> 8 2.9
#> 9 2.5
#> 10 3.6
#> # ℹ 40 more rows
#>
#> $outcomes
#> NULL
#>
#> $extras
#> $extras$roles
#> $extras$roles$id
#> # A tibble: 50 × 1
#> Sepal.Length
#> <dbl>
#> 1 6.3
#> 2 5.8
#> 3 7.1
#> 4 6.3
#> 5 6.5
#> 6 7.6
#> 7 4.9
#> 8 7.3
#> 9 6.7
#> 10 7.2
#> # ℹ 40 more rows
#>
#> $extras$roles$important
#> # A tibble: 50 × 0
#>
#> $extras$roles$`NA`
#> # A tibble: 50 × 1
#> Petal.Width
#> <dbl>
#> 1 2.5
#> 2 1.9
#> 3 2.1
#> 4 1.8
#> 5 2.2
#> 6 2.1
#> 7 1.7
#> 8 1.8
#> 9 1.8
#> 10 2.5
#> # ℹ 40 more rows
#>
#>
#>
相關用法
- R hardhat default_formula_blueprint 默認公式藍圖
- R hardhat default_xy_blueprint 默認 XY 藍圖
- R hardhat delete_response 從術語對象中刪除響應
- R hardhat validate_prediction_size 確保預測具有正確的行數
- R hardhat is_blueprint x 是預處理藍圖嗎?
- R hardhat validate_column_names 確保數據包含所需的列名
- R hardhat update_blueprint 更新預處理藍圖
- R hardhat weighted_table 加權表
- R hardhat validate_outcomes_are_univariate 確保結果是單變量
- R hardhat get_levels 從 DataFrame 中提取因子水平
- R hardhat add_intercept_column 向數據添加截距列
- R hardhat is_frequency_weights x 是頻率權重向量嗎?
- R hardhat model_offset 提取模型偏移
- R hardhat standardize 標準化結果
- R hardhat model_matrix 構建設計矩陣
- R hardhat is_importance_weights x 是重要性權重向量嗎?
- R hardhat run-mold 根據藍圖 Mold()
- R hardhat get_data_classes 從 DataFrame 或矩陣中提取數據類
- R hardhat fct_encode_one_hot 將一個因子編碼為 one-hot 指標矩陣
- R hardhat new_frequency_weights 構建頻率權重向量
- R hardhat validate_no_formula_duplication 確保公式中不出現重複項
- R hardhat shrink 僅對所需列進行子集化
- R hardhat validate_outcomes_are_numeric 確保結果都是數字
- R hardhat scream 😱 尖叫。
- R hardhat frequency_weights 頻率權重
注:本文由純淨天空篩選整理自Davis Vaughan等大神的英文原創作品 Default recipe blueprint。非經特殊聲明,原始代碼版權歸原作者所有,本譯文未經允許或授權,請勿轉載或複製。