ETL Step 2 - Tidy data

Published

June 28, 2024

This notebook tidies the raw raw_data_table from the Content DB database. The tidy data is written back to the Content DB database to the bike_model_data table.

Get data from database

Connect to Content DB to get the bike_raw_data and bike_station_info table.

con <- odbc::dbConnect(odbc::odbc(), "Content DB", timeout = 10)

bike_raw_data <- tbl(con, "bike_raw_data")
glimpse(bike_raw_data)
Rows: ??
Columns: 12
Database: postgres  [soleng@localhost:/rds]
$ station_id                <chr> "08250172-1f3f-11e7-bf6b-3863bb334450", "5b7…
$ num_bikes_available       <dbl> 27, 16, 1, 15, 12, 14, 14, 7, 2, 1, 16, 0, 8…
$ num_ebikes_available      <dbl> 12, 0, 1, 1, 0, 0, 0, 0, 0, 0, 4, 0, 1, 1, 2…
$ num_bikes_disabled        <dbl> 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,…
$ num_docks_available       <dbl> 0, 3, 14, 2, 5, 17, 1, 8, 17, 17, 0, 18, 11,…
$ num_docks_disabled        <dbl> 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ is_installed              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_renting                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_returning              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ last_reported             <dbl> 1702397888, 1702397888, 1702397892, 17023978…
$ eightd_has_available_keys <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ time                      <dttm> 2023-12-12 16:18:49, 2023-12-12 16:18:49, 2…
bike_station_info <- tbl(con, "bike_station_info")
glimpse(bike_station_info)
Rows: ??
Columns: 5
Database: postgres  [soleng@localhost:/rds]
$ station_id   <chr> "e823318f-8406-4230-8072-1667f5de78d4", "08252b28-1f3f-11…
$ name         <chr> "Reston Town Center Metro North", "Gallaudet / 8th St & F…
$ lat          <dbl> 38.95369, 38.90509, 38.96454, 38.84697, 38.90138, 38.9440…
$ lon          <dbl> -77.35972, -76.99410, -77.07513, -77.30335, -76.94188, -7…
$ last_updated <dttm> 2024-06-28 08:01:09, 2024-06-28 08:01:09, 2024-06-28 08:…

Tidy the data

Apply the data tidying steps. All of the tidying steps are performed in SQL. The results are written back to Content DB to the bike_model_data table.

if (odbc::dbExistsTable(con, "bike_model_data")) {
  odbc::dbRemoveTable(con, "bike_model_data")
}

# Build a SQL query to tidy the data.
query <- bike_raw_data %>% 
    group_by(
        id = station_id, 
        hour = hour(time), 
        date = date(time), 
        month = month(time), 
        dow = TRIM(to_char(time, "Day"))
    ) %>%
    summarize(
        n_bikes = mean(num_bikes_available, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    inner_join(
        select(bike_station_info, id = station_id, lat, lon)
    ) %>%
    dbplyr::sql_render() %>%
    stringr::str_replace("SELECT", "CREATE TABLE bike_model_data AS SELECT")

# Execute the SQL query.
odbc::dbSendQuery(con, query)
<OdbcResult>
  SQL  CREATE TABLE bike_model_data AS SELECT "LHS"."id" AS "id", "hour", "date", "month", "dow", "n_bikes", "lat", "lon"
FROM (SELECT "id", "hour", "date", "month", "dow", AVG("num_bikes_available") AS "n_bikes"
FROM (SELECT "station_id", "num_bikes_available", "num_ebikes_available", "num_bikes_disabled", "num_docks_available", "num_docks_disabled", "is_installed", "is_renting", "is_returning", "last_reported", "eightd_has_available_keys", "time", "station_id" AS "id", EXTRACT(hour FROM "time") AS "hour", date("time") AS "date", EXTRACT(MONTH FROM "time") AS "month", TRIM(to_char("time", 'Day')) AS "dow"
FROM "bike_raw_data") "q01"
GROUP BY "id", "hour", "date", "month", "dow") "LHS"
INNER JOIN (SELECT "station_id" AS "id", "lat", "lon"
FROM "bike_station_info") "RHS"
ON ("LHS"."id" = "RHS"."id")

  ROWS Fetched: 0 [complete]
       Changed: 3321813
# Preview the table.
tbl(con, "bike_model_data")
# Source:   table<bike_model_data> [?? x 8]
# Database: postgres [soleng@localhost:/rds]
   id                            hour date       month dow   n_bikes   lat   lon
   <chr>                        <dbl> <date>     <dbl> <chr>   <dbl> <dbl> <dbl>
 1 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-22     3 Wedn…       2  38.9 -76.9
 2 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-23     3 Thur…       2  38.9 -76.9
 3 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-24     3 Frid…       2  38.9 -76.9
 4 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-25     3 Satu…       2  38.9 -76.9
 5 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-26     3 Sund…       2  38.9 -76.9
 6 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-27     3 Mond…       2  38.9 -76.9
 7 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-28     3 Tues…       2  38.9 -76.9
 8 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-29     3 Wedn…       2  38.9 -76.9
 9 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-30     3 Thur…       2  38.9 -76.9
10 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-31     3 Frid…       2  38.9 -76.9
# … with more rows
odbc::dbDisconnect(con)
print("Complete 🎉")
[1] "Complete 🎉"