# Libraries
library(tidyverse)
library(nycflights13)
Motivation
The best way to enhance your programming skills is to practice R programming through hands-on exercises.
In this tutorial, we will solve problems from Section 25.3.5 of the famous R for Data Science (2nd edition) by Hadley Wickham et al. Let's get started!
Loading Required Libraries
In this section, we will load tidyverse and nycflights13 packages. We will set warning and message to false to suppress warnings and notifications.
Section 25.3.5 Exercises
- Using the datasets from nycflights13, write a function that:
- Finds all flights that were cancelled (i.e. is.na(arr_time)) or delayed by more than an hour. About the author
# Subset flights data: keep flights that were cancelled (no arrival time) or
# that departed more than an hour late. `dep_delay` is recorded in minutes,
# so "more than an hour" means `dep_delay > 60`.
canceled_or_delayed_flights <-
  flights |>
  filter(is.na(arr_time) | dep_delay > 60)
# Wrap the filter in a reusable function.
# `condition` is an unquoted expression written in terms of the columns of
# `data`; it is embraced with {{ }} and handed to filter(), which returns the
# rows of `data` for which it is TRUE.
filter_severe <- function(data, condition) {
  filter(data, {{ condition }})
}
# Let's test our function: cancelled flights, or flights delayed by more than
# an hour. `dep_delay` is recorded in minutes, hence `dep_delay > 60`.
# NOTE: the table printed below was rendered with the earlier `> 1` threshold;
# re-render the document to refresh it.
canceled_or_delayed_flights <-
  flights |>
  filter_severe(is.na(arr_time) | dep_delay > 60)
# Display the results: the first five rows, rendered as a centered table
knitr::kable(
  slice_head(canceled_or_delayed_flights, n = 5),
  align = "c"
)
| year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 608 | 600 | 8 | 807 | 735 | 32 | MQ | 3768 | N9EAMQ | EWR | ORD | 139 | 719 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 611 | 600 | 11 | 945 | 931 | 14 | UA | 303 | N532UA | JFK | SFO | 366 | 2586 | 6 | 0 | 2013-01-01 06:00:00 |
In the code snippet above, we first wrote the code for solving the problem and then converted our code into a function as instructed. Next, we tested our function to ensure that it works as expected.
- Counts the number of cancelled flights and the number of flights delayed by more than an hour.
# Write a function
# Counts the rows for which the logical expression `var` is TRUE.
#   data: a data frame; if it is grouped, one count is produced per group
#   var:  an unquoted logical expression written in terms of columns of `data`
# Returns a data frame holding the count in column `n`.
summarize_severe <- function(data, var) {
  data |>
    summarize(
      # na.rm = TRUE: a comparison on a missing value yields NA, and a single
      # NA would otherwise turn the whole count into NA.
      n = sum({{ var }}, na.rm = TRUE)
    )
}
# Test the function: per destination, count the flights that were cancelled or
# delayed by more than an hour (`dep_delay` is in minutes, hence `> 60`).
df_0 <- flights |>
  group_by(dest) |>
  summarize_severe(is.na(arr_time) | dep_delay > 60)
# OR ------------------------------------------------
# We could do this:
# Counts, per group, the rows for which the logical expression `var` is TRUE.
#   data:      a data frame
#   group_var: unquoted column(s) to group by, passed to summarize()'s `.by`
#   var:       an unquoted logical expression written in terms of columns of
#              `data`
# Returns a data frame with the grouping column(s) and the count in `total`.
summarise_severe <- function(data, group_var, var) {
  data |>
    summarize(
      # na.rm = TRUE: a comparison on a missing value yields NA, and a single
      # NA would otherwise turn a group's count into NA.
      total = sum({{ var }}, na.rm = TRUE),
      .by = {{ group_var }}
    )
}
# Test the second function: per destination, count the flights that were
# cancelled or delayed by more than an hour (`dep_delay` is in minutes, hence
# `> 60`). The condition is already a single logical vector, so it needs no
# c() wrapper.
df_1 <-
  summarise_severe(
    flights,
    var = is.na(arr_time) | dep_delay > 60,
    group_var = dest
  )
In the code snippet above, we solved a problem using two approaches. Firstly, we defined the function summarize_severe() and used it along with the group_by() function to calculate the total number of canceled flights and flights delayed by over one hour. Secondly, we defined the function summarise_severe() and used it with the .by argument in the summarize() function to calculate the same total.
We observed that both approaches produced similar results (we prefer the approach with the .by argument).
- Finds all flights that were cancelled or delayed by more than a user supplied number of hours.
We solved this problem with our filter function,
filter_severe(), from problem 1 above.
# Use a user supplied number of hours. `dep_delay` is recorded in minutes, so
# the number of hours is converted to minutes before comparing.
delay_hours <- 2
df_with_user_supplied_hours <-
  flights |>
  filter_severe(
    is.na(arr_time) | dep_delay > delay_hours * 60
  )
- Summarizes the weather to compute the minimum, mean, and maximum, of a user supplied variable:
By default, across() names the resulting columns with the pattern {.col}_{.fn} (column name, then function name), for example, temp_min. You can override this behavior by setting the .names argument, for example to “{.fn}_{.col}”, which gives min_temp.
# Solution: spell out every summary by hand, i.e. the minimum, mean and
# maximum of each variable with missing values ignored, then render the
# one-row result as a centered table.
knitr::kable(
  summarize(
    weather,
    # temp
    min_temp = min(temp, na.rm = TRUE),
    mean_temp = mean(temp, na.rm = TRUE),
    max_temp = max(temp, na.rm = TRUE),
    # pressure
    min_pressure = min(pressure, na.rm = TRUE),
    mean_pressure = mean(pressure, na.rm = TRUE),
    max_pressure = max(pressure, na.rm = TRUE),
    # precip
    min_precip = min(precip, na.rm = TRUE),
    mean_precip = mean(precip, na.rm = TRUE),
    max_precip = max(precip, na.rm = TRUE),
    .groups = "drop"
  ),
  align = "c"
)
| min_temp | mean_temp | max_temp | min_pressure | mean_pressure | max_pressure | min_precip | mean_precip | max_precip |
|---|---|---|---|---|---|---|---|---|
| 10.94 | 55.26039 | 100.04 | 983.8 | 1017.899 | 1042.1 | 0 | 0.0044691 | 1.21 |
# Turn the repeated summaries into a function.
#   data: a data frame
#   vars: unquoted column selection, e.g. c(temp, pressure)
# For each selected column this computes the minimum, mean and maximum with
# missing values removed, names the results "<function>_<column>", and rounds
# every numeric column of the summary to two decimal places.
summarize_weather <- function(data, vars) {
  summarized <- summarize(
    data,
    across(
      {{ vars }},
      list(
        min = function(v) min(v, na.rm = TRUE),
        mean = function(v) mean(v, na.rm = TRUE),
        max = function(v) max(v, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  )

  mutate(summarized, across(where(is.numeric), function(v) round(v, 2)))
}
# Test the function on temp, pressure and precip; show a centered table
knitr::kable(
  summarize_weather(weather, c(temp, pressure, precip)),
  align = "c"
)
| min_temp | mean_temp | max_temp | min_pressure | mean_pressure | max_pressure | min_precip | mean_precip | max_precip |
|---|---|---|---|---|---|---|---|---|
| 10.94 | 55.26 | 100.04 | 983.8 | 1017.9 | 1042.1 | 0 | 0 | 1.21 |
In the code chunk mentioned above, typing out all the calls can be tedious, and the code can become repetitive. This is a sign that we should convert our code into a function. Fortunately, we can quickly achieve this by inserting arguments inside doubled braces.
- Converts the user supplied variable that uses clock time (e.g., dep_time, arr_time, etc.) into a decimal time (i.e. hours + (minutes / 60)).
# Solution: split the clock time into its hour part (integer division by 100)
# and its minute part (remainder), then combine them into a decimal time
select(flights, year:sched_dep_time, arr_time, sched_arr_time) |>
  mutate(
    hours = arr_time %/% 100,
    minutes = arr_time %% 100,
    decimal_time = hours + minutes / 60
  )
# A tibble: 336,776 × 10
year month day dep_time sched_dep_time arr_time sched_arr_time hours
<int> <int> <int> <int> <int> <int> <int> <dbl>
1 2013 1 1 517 515 830 819 8
2 2013 1 1 533 529 850 830 8
3 2013 1 1 542 540 923 850 9
4 2013 1 1 544 545 1004 1022 10
5 2013 1 1 554 600 812 837 8
6 2013 1 1 554 558 740 728 7
7 2013 1 1 555 600 913 854 9
8 2013 1 1 557 600 709 723 7
9 2013 1 1 557 600 838 846 8
10 2013 1 1 558 600 753 745 7
# ℹ 336,766 more rows
# ℹ 2 more variables: minutes <dbl>, decimal_time <dbl>
# Write a function
# Replaces a clock-time column with its decimal-time equivalent.
#   data:     a data frame
#   time_var: unquoted name of a clock-time column (e.g. 830 for 8:30)
# Returns `data` with `time_var` overwritten by hours + minutes / 60, rounded
# to two decimal places.
# The hours and minutes are computed inline rather than through temporary
# `hours` / `minutes` columns, so columns with those names that already exist
# in `data` are no longer overwritten and then dropped.
standardize_time <- function(data, time_var) {
  data |>
    mutate(
      # floor division gives the hours; the remainder gives the minutes
      "{{time_var}}" := round(
        floor({{ time_var }} / 100) + ({{ time_var }} %% 100) / 60,
        2
      )
    )
}
# OR ---------------------------------------------------------------------
# We could simplify it further, thanks to Zakarie Hashi for the suggestion
# Adds a `decimal_time` column derived from a clock-time column.
#   data:     a data frame
#   time_var: unquoted name of a clock-time column (e.g. 830 for 8:30)
# Returns `data` with an extra column `decimal_time` = hours + minutes / 60,
# rounded to two decimal places.
standardise_time <- function(data, time_var) {
  # hours come from floor division by 100, minutes from the remainder
  to_decimal <- function(clock) {
    round(floor(clock / 100) + (clock %% 100) / 60, 2)
  }

  mutate(data, decimal_time = to_decimal({{ time_var }}))
}
# Test the function on arr_time and show the first five rows
slice_head(standardise_time(flights, arr_time), n = 5)
# A tibble: 5 × 20
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
# ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, decimal_time <dbl>
As we demonstrated above, it is possible to complete this task with just one line of code, as suggested by Zakarie Hashi in a LinkedIn post from last year. In the function provided above, we utilized the floor() function to extract the hours and used modulo division to retrieve the remaining minutes. We then combined the outcomes and rounded our answer to two decimal places.
Conclusion
In this tutorial, we have shown you how to answer selected practice problems from section 25.3.5 exercises in R for Data Science, 2nd edition by Wickham et al. We have demonstrated various approaches for solving these questions wherever possible. Although we have highlighted some ways to solve these exercises, there are many other methods available. We encourage you to try them out and share your answers with us and our readers.
Happy Coding 💪!