library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.2 ✔ purrr 0.3.4
## ✔ tibble 3.0.1 ✔ dplyr 1.0.0
## ✔ tidyr 1.1.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.0 ──
## ✔ broom 0.5.6 ✔ rsample 0.0.7
## ✔ dials 0.0.7 ✔ tune 0.1.0
## ✔ infer 0.5.2 ✔ workflows 0.1.1
## ✔ parsnip 0.1.1 ✔ yardstick 0.0.6
## ✔ recipes 0.1.13
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(bikeHelpR)
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
con <- DBI::dbConnect(odbc::odbc(), "Content DB")
pins::board_register_rsconnect(server = Sys.getenv("CONNECT_SERVER"),
key = Sys.getenv("CONNECT_API_KEY"))
all_days <- tbl(con, "bike_model_data")
n_days_test <- 2
months_train <- 6
dates <- all_days %>%
count(date) %>%
arrange(desc(date)) %>%
head(n_days_test + 1) %>%
pull(date) %>%
as.Date()
split_date <- dates[n_days_test + 1]
start_train_date <- split_date - dmonths(months_train)
test_dates <- dates[1:n_days_test]
test_dates_str <- paste(test_dates, collapse = " and ")
print(glue::glue(
"Using data on or before {min(test_dates)} as training, data from {test_dates_str} to test."
))
## Using data on or before 2023-02-27 as training, data from 2023-02-28 and 2023-02-27 to test.
train_dat <- all_days %>%
dplyr::filter(
date <= split_date,
date >= start_train_date
) %>%
dplyr::collect()
recipe <- recipe(n_bikes ~ ., data = train_dat) %>%
step_dummy(dow) %>%
prep(train_dat, retain = FALSE)
Make recipe for model:
# downsample if working interactively
if (interactive()) {
train_dat <- dplyr::sample_frac(train_dat, 0.5)
}
train_mat <- recipe %>%
bake(train_dat)
mod <- parsnip::xgb_train(
train_mat %>% select(-n_bikes, -id, -date),
train_mat %>% pull(n_bikes),
nrounds = ifelse(interactive(), 50, 500)
)
## [08:31:20] WARNING: amalgamation/../src/objective/regression_obj.cu:170: reg:linear is now deprecated in favor of reg:squarederror.
test_date_start <- min(test_dates)
test_dat <- all_days %>%
filter(date >= test_date_start) %>%
collect()
preds <- bake(recipe, test_dat) %>%
select(-n_bikes, -id, -date) %>%
as.matrix() %>%
predict(mod, .)
results <- test_dat %>%
mutate(preds = preds)
oos_metrics(results$n_bikes, results$preds)
## # A tibble: 1 x 4
## rmse mae ccc r2
## <dbl> <dbl> <dbl> <dbl>
## 1 3.43 2.65 0.795 0.685
model_details <- list(
model = mod,
train_date = today(),
train_window_start = start_train_date,
split_date = split_date,
recipe = recipe
)
pins::pin(model_details,
"bike_model_rxgb",
"Model of Capitol Citybikes Available per Station",
board = "rsconnect")
DBI::dbDisconnect(con)