3 Data transformation
3.1 Group and sort records
Learn how to use group_by()
and arrange()
to better understand aggregated data
- How many flights are there per month?
flights %>%
group_by(month) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## month n
## <dbl> <S3: integer64>
## 1 10.0 556205
## 2 4.00 598126
## 3 12.0 544958
## 4 5.00 606293
## 5 2.00 569236
## 6 7.00 627931
## 7 11.0 523272
## 8 9.00 540908
## 9 1.00 605765
## 10 3.00 616090
## # ... with more rows
- Order the results by the month number by using
arrange()
flights %>%
group_by(month) %>%
tally() %>%
arrange(month)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: month
## month n
## <dbl> <S3: integer64>
## 1 1.00 605765
## 2 2.00 569236
## 3 3.00 616090
## 4 4.00 598126
## 5 5.00 606293
## 6 6.00 608665
## 7 7.00 627931
## 8 8.00 612279
## 9 9.00 540908
## 10 10.0 556205
## # ... with more rows
- Order the results by the number of flights, starting with the month with most flights by using
desc()
inside thearrange()
command
flights %>%
group_by(month) %>%
tally() %>%
arrange(desc(n))
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## month n
## <dbl> <S3: integer64>
## 1 7.00 627931
## 2 3.00 616090
## 3 8.00 612279
## 4 6.00 608665
## 5 5.00 606293
## 6 1.00 605765
## 7 4.00 598126
## 8 2.00 569236
## 9 10.0 556205
## 10 12.0 544958
## # ... with more rows
3.2 Answering questions with dplyr
Quick review of how to translate questions into dplyr
code
- Which are the top 4 months with the most flight activity?
flights %>%
group_by(month) %>%
tally() %>%
arrange(desc(n)) %>%
head(4)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## month n
## <dbl> <S3: integer64>
## 1 7.00 627931
## 2 3.00 616090
## 3 8.00 612279
## 4 6.00 608665
- What were the top 5 calendar days with most flight activity?
flights %>%
group_by(month, dayofmonth) %>%
tally() %>%
arrange(desc(n)) %>%
head(5)
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Groups: month
## # Ordered by: desc(n)
## month dayofmonth n
## <dbl> <dbl> <S3: integer64>
## 1 7.00 18.0 21128
## 2 7.00 11.0 21125
## 3 7.00 25.0 21102
## 4 7.00 10.0 21058
## 5 7.00 17.0 21055
- Which are the top 5 carriers (airlines) with the most flights?
flights %>%
group_by(carriername) %>%
tally() %>%
arrange(desc(n)) %>%
head(5)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## carriername n
## <chr> <S3: integ>
## 1 Southwest Airlines Co. 1201754
## 2 American Airlines Inc. 604885
## 3 Skywest Airlines Inc. 567159
## 4 American Eagle Airlines Inc. 490693
## 5 US Airways Inc. (Merged with America West 9/05. Reporting f… 453589
- Figure the percent ratio of flights per month
flights %>%
group_by(month) %>%
tally() %>%
arrange(desc(n)) %>%
mutate(percent = n/sum(n, na.rm = TRUE))
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## month n percent
## <dbl> <S3: integer64> <dbl>
## 1 7.00 627931 0.0896
## 2 3.00 616090 0.0879
## 3 8.00 612279 0.0873
## 4 6.00 608665 0.0868
## 5 5.00 606293 0.0865
## 6 1.00 605765 0.0864
## 7 4.00 598126 0.0853
## 8 2.00 569236 0.0812
## 9 10.0 556205 0.0793
## 10 12.0 544958 0.0777
## # ... with more rows
- Figure the percent ratio of flights per carrier
flights %>%
group_by(carriername) %>%
tally() %>%
arrange(desc(n)) %>%
mutate(percent = n/sum(n, na.rm = TRUE))
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## # Ordered by: desc(n)
## carriername n percent
## <chr> <S3: inte> <dbl>
## 1 Southwest Airlines Co. 1201754 0.171
## 2 American Airlines Inc. 604885 0.0863
## 3 Skywest Airlines Inc. 567159 0.0809
## 4 American Eagle Airlines Inc. 490693 0.0700
## 5 US Airways Inc. (Merged with America West 9/05. Rep… 453589 0.0647
## 6 Delta Air Lines Inc. 451931 0.0645
## 7 United Air Lines Inc. 449515 0.0641
## 8 Expressjet Airlines Inc. 374510 0.0534
## 9 Northwest Airlines Inc. 347652 0.0496
## 10 Continental Air Lines Inc. 298455 0.0426
## # ... with more rows
3.3 Aggregate mulitple columns
Practice using summarise _
functions
- Use
summarise_all()
to send the same function to all fields
flights %>%
select(depdelay, arrdelay) %>%
summarise_all(mean, na.rm = TRUE)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## depdelay arrdelay
## <dbl> <dbl>
## 1 9.78 7.99
- Use
summarise_at()
to pre-select the fields that will receive the function
flights %>%
summarise_at(c("depdelay", "arrdelay"), mean, na.rm = TRUE)
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## depdelay arrdelay
## <dbl> <dbl>
## 1 9.78 7.99
- Use
summarise_if()
to summarize only if the field meets a criterion
flights %>%
summarise_if(is.numeric,mean, na.rm = TRUE)
## Applying predicate on the first 100 rows
## # Source: lazy query [?? x 30]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## carrierdelay originlat originlong destlat destlong flightid year month
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3.43 36.9 -95.1 36.9 -95.1 3504864 2008 6.38
## # ... with 22 more variables: dayofmonth <dbl>, dayofweek <dbl>,
## # deptime <dbl>, crsdeptime <dbl>, arrtime <dbl>, crsarrtime <dbl>,
## # flightnum <dbl>, actualelapsedtime <dbl>, crselapsedtime <dbl>,
## # airtime <dbl>, arrdelay <dbl>, depdelay <dbl>, distance <dbl>,
## # taxiin <dbl>, taxiout <dbl>, cancelled <dbl>, diverted <dbl>,
## # weatherdelay <dbl>, nasdelay <dbl>, securitydelay <dbl>,
## # lateaircraftdelay <dbl>, score <dbl>
- Combine with
group_by()
to create more complex results
flights %>%
select(month, depdelay, arrdelay) %>%
group_by(month) %>%
summarise_all(mean, na.rm = TRUE)
## # Source: lazy query [?? x 3]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## month depdelay arrdelay
## <dbl> <dbl> <dbl>
## 1 10.0 3.78 0.412
## 2 4.00 8.06 6.68
## 3 12.0 16.8 16.1
## 4 5.00 7.56 5.91
## 5 2.00 13.2 12.6
## 6 7.00 11.6 9.78
## 7 11.0 5.38 2.00
## 8 9.00 3.89 0.684
## 9 1.00 11.1 9.88
## 10 3.00 12.2 10.9
## # ... with more rows
3.4 View record level data
Important tips to record preview data
How many flights in July 18th were one or more hours late?
flights %>%
filter(
depdelay >= 60,
month == 7,
dayofmonth == 18
) %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## n
## <S3: integer64>
## 1 1239
- Use
filter()
to retrieve only the needed data, andhead()
to limit the preview even further.
flights %>%
filter(
depdelay >= 60,
month == 7,
dayofmonth == 18
) %>%
head(100)
## # Source: lazy query [?? x 44]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## uniquecarrier carrierdelay carriername origin originname origincity
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 WN 0 Southwest Ai… ABQ Albuquerque… Albuquerq…
## 2 WN 42.0 Southwest Ai… ABQ Albuquerque… Albuquerq…
## 3 WN 122 Southwest Ai… ABQ Albuquerque… Albuquerq…
## 4 WN 0 Southwest Ai… ABQ Albuquerque… Albuquerq…
## 5 WN 71.0 Southwest Ai… ABQ Albuquerque… Albuquerq…
## 6 WN 14.0 Southwest Ai… ABQ Albuquerque… Albuquerq…
## 7 WN 84.0 Southwest Ai… AUS Austin-Berg… Austin
## 8 WN 56.0 Southwest Ai… AUS Austin-Berg… Austin
## 9 WN 0 Southwest Ai… BNA Nashville I… Nashville
## 10 WN 32.0 Southwest Ai… BNA Nashville I… Nashville
## # ... with more rows, and 38 more variables: originstate <chr>,
## # origincountry <chr>, originlat <dbl>, originlong <dbl>, dest <chr>,
## # destname <chr>, destcity <chr>, deststate <chr>, destcountry <chr>,
## # destlat <dbl>, destlong <dbl>, flightid <int>, year <dbl>,
## # month <dbl>, dayofmonth <dbl>, dayofweek <dbl>, deptime <dbl>,
## # crsdeptime <dbl>, arrtime <dbl>, crsarrtime <dbl>, flightnum <dbl>,
## # tailnum <chr>, actualelapsedtime <dbl>, crselapsedtime <dbl>,
## # airtime <dbl>, arrdelay <dbl>, depdelay <dbl>, distance <dbl>,
## # taxiin <dbl>, taxiout <dbl>, cancelled <dbl>, cancellationcode <chr>,
## # diverted <dbl>, weatherdelay <dbl>, nasdelay <dbl>,
## # securitydelay <dbl>, lateaircraftdelay <dbl>, score <int>
- Use
collect()
andView()
to preview the data in the IDE. Make sure to always limit the number of returned rows. https://github.com/tidyverse/tibble/issues/373
flights %>%
filter(
depdelay >= 60,
month == 7,
dayofmonth == 18
) %>%
collect() %>%
head(100) %>%
View("my_preview")
3.5 Case statements
See how to use the flexibility of case statements for special cases
- Use
case_when()
to bucket each month one of four seasons
flights %>%
mutate(
season = case_when(
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
TRUE ~ "Winter"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1620385
## 2 Spring 1820509
## 3 Winter 1719959
## 4 Summer 1848875
- Add a specific case for “Winter”
flights %>%
mutate(
season = case_when(
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
month == 12 | month <= 2 ~ "Winter"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1620385
## 2 Spring 1820509
## 3 Winter 1719959
## 4 Summer 1848875
- Append an entry for Monday at the end of the case statement
flights %>%
mutate(
season = case_when(
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
month == 12 | month <= 2 ~ "Winter",
dayofweek == 1 ~ "Monday"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1620385
## 2 Spring 1820509
## 3 Winter 1719959
## 4 Summer 1848875
- Move the “Monday” entry to the top of the case statement
flights %>%
mutate(
season = case_when(
dayofweek == 1 ~ "Monday",
month >= 3 && month <= 5 ~ "Spring",
month >= 6 && month <= 8 ~ "Summer",
month >= 9 && month <= 11 ~ "Fall",
month == 12 | month <= 2 ~ "Winter"
)
) %>%
group_by(season) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## season n
## <chr> <S3: integer64>
## 1 Fall 1376740
## 2 Spring 1554210
## 3 Winter 1464948
## 4 Summer 1577629
## 5 Monday 1036201
3.6 Data enrichment
Upload a small dataset in order to combine it with the datawarehouse data
- Load the
planes
data into memory
planes <- nycflights13::planes
- Using
DBI
, copy theplanes
data to the datawarehouse as a temporary table, and load it to a variable
dbWriteTable(con, "planes", planes, temporary = TRUE)
tbl_planes <- tbl(con, "planes")
- Create a “lazy” variable that joins the flights table to the new temp table
combined <- flights %>%
left_join(tbl_planes, by = "tailnum")
- View a sample of flights of planes with more than 100 seats
combined %>%
filter(seats > 100) %>%
head()
## # Source: lazy query [?? x 52]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## uniquecarrier carrierdelay carriername origin originname origincity
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 WN 0 Southwest Air… TUS Tucson Inte… Tucson
## 2 WN 0 Southwest Air… ABQ Albuquerque… Albuquerq…
## 3 WN 0 Southwest Air… ABQ Albuquerque… Albuquerq…
## 4 WN 0 Southwest Air… ABQ Albuquerque… Albuquerq…
## 5 WN 0 Southwest Air… ABQ Albuquerque… Albuquerq…
## 6 WN 0 Southwest Air… ABQ Albuquerque… Albuquerq…
## # ... with 46 more variables: originstate <chr>, origincountry <chr>,
## # originlat <dbl>, originlong <dbl>, dest <chr>, destname <chr>,
## # destcity <chr>, deststate <chr>, destcountry <chr>, destlat <dbl>,
## # destlong <dbl>, flightid <int>, year.x <dbl>, month <dbl>,
## # dayofmonth <dbl>, dayofweek <dbl>, deptime <dbl>, crsdeptime <dbl>,
## # arrtime <dbl>, crsarrtime <dbl>, flightnum <dbl>, tailnum <chr>,
## # actualelapsedtime <dbl>, crselapsedtime <dbl>, airtime <dbl>,
## # arrdelay <dbl>, depdelay <dbl>, distance <dbl>, taxiin <dbl>,
## # taxiout <dbl>, cancelled <dbl>, cancellationcode <chr>,
## # diverted <dbl>, weatherdelay <dbl>, nasdelay <dbl>,
## # securitydelay <dbl>, lateaircraftdelay <dbl>, score <int>,
## # year.y <int>, type <chr>, manufacturer <chr>, model <chr>,
## # engines <int>, seats <int>, speed <int>, engine <chr>
- How many flights are from McDonnel Douglas planes
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
tally()
## # Source: lazy query [?? x 1]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## n
## <S3: integer64>
## 1 137250
- See how many flights each plane McDonnel Douglas had
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
group_by(tailnum) %>%
tally()
## # Source: lazy query [?? x 2]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## tailnum n
## <chr> <S3: integer64>
## 1 N599AA 1243
## 2 N494AA 1360
## 3 N492AA 1331
## 4 N505AA 1373
## 5 N777NC 1850
## 6 N475AA 1358
## 7 N583AA 1403
## 8 N560AA 1340
## 9 N480AA 1204
## 10 N470AA 1455
## # ... with more rows
- Get the total number of planes, and the average, minimum & maximum number of flights for the manufacturer
combined %>%
filter(manufacturer == "MCDONNELL DOUGLAS") %>%
group_by(tailnum) %>%
tally() %>%
summarise(planes = n(),
avg_flights = mean(n, na.rm = TRUE),
max_flights = max(n, na.rm = TRUE),
min_flights = min(n, na.rm = TRUE))
## # Source: lazy query [?? x 4]
## # Database: postgres [rstudio_dev@localhost:/postgres]
## planes avg_flights max_flights min_flights
## <S3: integer64> <dbl> <S3: integer64> <S3: integer64>
## 1 102 1346 1850 1068
- Disconnect from the database
dbDisconnect(con)