step_ns()
function in your recipesplines::ns
functionThe dataset consists of different biomass fuels characterized by the amount of certain molecules and the corresponding “higher heating value” (HHV). We are going to try to predict HHV based on the quantity 5 molecules.
# A tibble: 536 × 8
oxygen nitrogen sulfur HHV carbon_ns_1 carbon_ns_2 hydrogen_ns_1 hydrogen…¹
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 42.9 0.41 0 20.0 0.549 -0.220 0.561 -0.220
2 41.3 0.2 0 19.2 0.547 -0.222 0.564 -0.216
3 46.2 0.11 0.02 18.3 0.536 -0.231 0.567 -0.210
4 35.6 3.3 0.16 18.2 0.512 -0.239 0.530 -0.248
5 40.7 1 0.02 18.4 0.527 -0.235 0.552 -0.233
6 40.2 2.04 0.1 18.5 0.515 -0.238 0.565 -0.213
7 38.2 2.68 0.2 18.7 0.530 -0.233 0.572 -0.196
8 39.7 1.7 0.2 18.3 0.518 -0.238 0.564 -0.216
9 40.9 0.8 0 18.6 0.543 -0.226 0.556 -0.228
10 40 1.2 0.1 18.9 0.530 -0.234 0.569 -0.203
# … with 526 more rows, and abbreviated variable name ¹hydrogen_ns_2
Let’s increase the degree of the spline
library(tidymodels)
rec_splines <- recipe(
HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
data = biomass) |>
step_ns(carbon, hydrogen,
deg_free = 3)
expanded_biomass <- prep(rec_splines, biomass) |>
bake(biomass)
expanded_biomass
# A tibble: 536 × 10
oxygen nitrogen sulfur HHV carbon…¹ carbo…² carbo…³ hydro…⁴ hydro…⁵ hydro…⁶
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 42.9 0.41 0 20.0 0.221 0.536 -0.272 0.228 0.534 -0.279
2 41.3 0.2 0 19.2 0.210 0.540 -0.275 0.245 0.528 -0.275
3 46.2 0.11 0.02 18.3 0.144 0.564 -0.290 0.274 0.518 -0.269
4 35.6 3.3 0.16 18.2 0.0361 0.597 -0.308 0.0529 0.582 -0.304
5 40.7 1 0.02 18.4 0.101 0.579 -0.298 0.159 0.556 -0.291
6 40.2 2.04 0.1 18.5 0.0472 0.595 -0.307 0.260 0.523 -0.272
7 38.2 2.68 0.2 18.7 0.118 0.573 -0.296 0.325 0.499 -0.256
8 39.7 1.7 0.2 18.3 0.0586 0.591 -0.305 0.245 0.528 -0.275
9 40.9 0.8 0 18.6 0.183 0.550 -0.282 0.187 0.547 -0.286
10 40 1.2 0.1 18.9 0.115 0.574 -0.296 0.302 0.508 -0.262
# … with 526 more rows, and abbreviated variable names ¹carbon_ns_1,
# ²carbon_ns_2, ³carbon_ns_3, ⁴hydrogen_ns_1, ⁵hydrogen_ns_2, ⁶hydrogen_ns_3
For a natural cubic spline, what happens beyond the “boundary knots”?
The default interior knots are set as follows:
deg_free |
# of interior knots | quantile |
---|---|---|
2 | 1 | 50% |
3 | 2 | 33.3%, 66.7% |
4 | 3 | 25%, 50%, 75% |
5 | 4 | 20%, 40%, 60%, 80% |
6 | 5 | 16.7%, 33.3%, 50%, 66.7%, 83.3% |
wf <- workflow() |>
add_recipe(rec_splines) |>
add_model(linear_reg())
fit_splines <- fit(wf, data = biomass)
tidy(fit_splines)
# A tibble: 10 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 8.30 0.890 9.32 3.07e- 19
2 oxygen 0.0252 0.00919 2.74 6.39e- 3
3 nitrogen 0.0307 0.0558 0.550 5.82e- 1
4 sulfur 0.251 0.134 1.87 6.14e- 2
5 carbon_ns_1 14.1 0.546 25.8 1.51e- 95
6 carbon_ns_2 19.8 1.54 12.9 2.97e- 33
7 carbon_ns_3 23.9 0.721 33.2 2.67e-131
8 hydrogen_ns_1 1.84 0.519 3.54 4.36e- 4
9 hydrogen_ns_2 7.70 1.30 5.91 6.01e- 9
10 hydrogen_ns_3 4.04 1.26 3.21 1.43e- 3
ggplot2
helper functionYou don’t need to know what is going on here and I will provide this for your labs!
ggplot_pdp <- function(obj, x) {
p <-
as_tibble(obj$agr_profiles) %>%
mutate(`_label_` = stringr::str_remove(`_label_`, "^[^_]*_")) %>%
ggplot(aes(`_x_`, `_yhat_`)) +
geom_line(data = as_tibble(obj$cp_profiles),
aes(x = {{ x }}, group = `_ids_`),
linewidth = 0.5, alpha = 0.05, color = "gray50")
num_colors <- n_distinct(obj$agr_profiles$`_label_`)
if (num_colors > 1) {
p <- p + geom_line(aes(color = `_label_`), linewidth = 1.2, alpha = 0.8)
} else {
p <- p + geom_line(color = "cornflower blue", linewidth = 1.2, alpha = 0.8)
}
p
}
workflow()
allows us to iteratively update our model without having to change the recipe after tuning.select_best
and finalize_workflow
recipe_splines <- recipe(
HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur,
data = biomass) |>
step_ns(carbon, hydrogen, deg_free = tune())
splines_wf <- workflow() |>
add_recipe(recipe_splines) |>
add_model(linear_reg())
cv <- vfold_cv(biomass)
splines_tuned <- tune_grid(
splines_wf,
cv
)
show_best(splines_tuned, "rmse")
# A tibble: 5 × 7
deg_free .metric .estimator mean n std_err .config
<int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 rmse standard 1.41 10 0.183 Preprocessor1_Model1
2 3 rmse standard 1.42 10 0.184 Preprocessor4_Model1
3 4 rmse standard 1.46 10 0.202 Preprocessor9_Model1
4 6 rmse standard 1.48 10 0.211 Preprocessor3_Model1
5 9 rmse standard 1.48 10 0.219 Preprocessor8_Model1
How many degrees of freedom does the best model have?
best_params <- select_best(splines_tuned, "rmse")
splines_wf_final <- splines_wf |>
finalize_workflow(best_params)
final_fit <- fit(splines_wf_final,
data = biomass)
final_explainer <- explain_tidymodels(
final_fit,
data = dplyr::select(biomass, -HHV),
y = biomass$HHV,
verbose = FALSE
)
carbon_profile <- model_profile(
final_explainer,
variables = "carbon"
)