<- DBI::dbConnect(odbc::odbc(), "Content DB") con
Model Step 1 - Train and Deploy Model
This notebook trains a model to predict the number of bikes at a given bike docking station. The model is trained using the bike_model_data table from Content DB. The trained model is then:
- pinned to Posit Connect
- deployed as a plumber API to Posit Connect using vetiver.
Get data
Connect to the database:
Split the data into a train/test split:
<- tbl(con, "bike_model_data")
all_days
# Get a vector that contains all of the dates.
<- all_days %>%
dates distinct(date) %>%
collect() %>%
arrange(desc(date)) %>%
pull(date) %>%
as.Date()
# Split the data into test and train.
<- 2
n_days_test <- 10
n_days_to_train
<- dates[n_days_test + 1]
train_end_date <- train_end_date - n_days_to_train
train_start_date
# Training data split.
<- all_days %>%
train_data filter(
>= train_start_date,
date <= train_end_date
date %>%
) distinct() %>%
collect()
print(glue::glue(
"The model will be trained on data from {start} to {end} ",
"({num_obs} observations). ",
start = min(train_data$date),
end = max(train_data$date),
num_obs = scales::comma(nrow(train_data)),
))## The model will be trained on data from 2023-07-23 to 2023-08-02 (35,110 observations).
# Test data split.
<- all_days %>%
test_data filter(date > train_end_date) %>%
distinct() %>%
collect()
print(glue::glue(
"The model will be tested on data from {start} to {end} ",
"({num_obs} observations). ",
start = min(test_data$date),
end = max(test_data$date),
num_obs = scales::comma(nrow(test_data)),
))## The model will be tested on data from 2023-08-03 to 2023-08-04 (5,586 observations).
Train the model
Data preprocessing
Define a recipe to clean the data.
# Define a recipe to clean the data.
<-
recipe_spec recipe(n_bikes ~ ., data = train_data) %>%
step_dummy(dow) %>%
step_integer(id, date)
# Preview the cleaned training data.
%>%
recipe_spec prep(train_data) %>%
bake(head(train_data)) %>%
glimpse()
## Rows: 6
## Columns: 13
## $ id <dbl> 1, 1, 1, 1, 1, 1
## $ hour <dbl> 0, 0, 0, 0, 0, 0
## $ date <dbl> 1, 2, 3, 4, 5, 6
## $ month <dbl> 7, 7, 7, 7, 7, 7
## $ lat <dbl> 38.89483, 38.89483, 38.89483, 38.89483, 38.89483, 38.894…
## $ lon <dbl> -76.98763, -76.98763, -76.98763, -76.98763, -76.98763, -…
## $ n_bikes <dbl> 7, 9, 14, 6, 9, 6
## $ dow_Monday <dbl> 0, 1, 0, 0, 0, 0
## $ dow_Saturday <dbl> 0, 0, 0, 0, 0, 0
## $ dow_Sunday <dbl> 1, 0, 0, 0, 0, 0
## $ dow_Thursday <dbl> 0, 0, 0, 0, 1, 0
## $ dow_Tuesday <dbl> 0, 0, 1, 0, 0, 0
## $ dow_Wednesday <dbl> 0, 0, 0, 1, 0, 0
Fit model
Fit a random forest model:
<-
model_spec rand_forest() %>%
set_mode("regression") %>%
set_engine("ranger")
<-
model_workflow workflow() %>%
add_recipe(recipe_spec) %>%
add_model(model_spec)
<- fit(model_workflow, data = train_data)
model_fit
model_fit## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_dummy()
## • step_integer()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Regression
## Number of trees: 500
## Sample size: 35110
## Number of independent variables: 12
## Mtry: 3
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 10.08887
## R squared (OOB): 0.6248028
Model evaluation
<- predict(model_fit, test_data)
predictions
<- test_data %>%
results mutate(preds = predictions$.pred)
oos_metrics(results$n_bikes, results$preds)
## # A tibble: 1 × 4
## rmse mae ccc r2
## <dbl> <dbl> <dbl> <dbl>
## 1 4.18 3.32 0.430 0.309
Model deployment
vetiver
Create a vetiver
model object.
<- "bike_predict_model_r"
model_name <- glue("sam.edwardes/{model_name}")
pin_name
# Get the train and test data ranges. This will be passed into the pin metadata
# so that other scripts can access this information.
<- list(
date_metadata train_dates = c(
as.character(min(train_data$date)),
as.character(max(train_data$date))
),test_dates = c(
as.character(min(test_data$date)),
as.character(max(test_data$date))
)
)
print(date_metadata)
## $train_dates
## [1] "2023-07-23" "2023-08-02"
##
## $test_dates
## [1] "2023-08-03" "2023-08-04"
# Create the vetiver model.
<- vetiver_model(
v
model_fit,
model_name,versioned = TRUE,
save_ptype = train_data %>%
head(1) %>%
select(-n_bikes),
metadata = date_metadata
)
v##
## ── bike_predict_model_r ─ <butchered_workflow> model for deployment
## A ranger regression modeling workflow using 7 features
pins
Save the model as a pin to Posit Connect:
# Use Posit Connect as a board.
<- pins::board_rsconnect(
board server = Sys.getenv("CONNECT_SERVER"),
key = Sys.getenv("CONNECT_API_KEY"),
versioned = TRUE
)# Write the model to the board.
%>%
board vetiver_pin_write(vetiver_model = v)
plumber
Then, deploy the model as a plumber API to Posit Connect.
# Add server
::addServer(
rsconnecturl = "https://colorado.posit.co/rsc/__api__",
name = "colorado"
)
# Add account
::connectApiUser(
rsconnectaccount = "sam.edwardes",
server = "colorado",
apiKey = Sys.getenv("CONNECT_API_KEY"),
)
# Deploy to Connect
vetiver_deploy_rsconnect(
board = board,
name = pin_name,
appId = "11314",
launch.browser = FALSE,
appTitle = "Bike Predict - Model - API",
predict_args = list(debug = FALSE),
account = "sam.edwardes",
server = "colorado"
)## Preparing to deploy api...DONE
## Uploading bundle for api: 11314...DONE
## Deploying bundle: 116018 for api: 11314 ...
## Api successfully deployed to https://colorado.posit.co/rsc/bike-predict-r-api/
::dbDisconnect(con) DBI