library(tidyverse)
library(nycflights13)
ch_25_solutions
Prerequisites:
25.2.2 Exercises:
# I noticed all the function names in dplyr use snake case so that’s the
# naming convention I use below. I used the `ls()` function to list objects
# within a package to get all the function names.

# Proportion of elements of `x` that are NA.
perc_na <- function(x) {
  mean(is.na(x))
}

# Each element of `x` divided by the sum of `x` (NAs ignored in the sum).
ratio_of_sum <- function(x) {
  x / sum(x, na.rm = TRUE)
}

# Each element of `x` as a percentage of the sum of `x` (NAs ignored in the
# sum), rounded to one decimal place.
perc_total <- function(x) {
  round(x / sum(x, na.rm = TRUE) * 100, 1)
}
# I had trouble finding a solution cleaner than a `case_when()` statement for
# the specific cases where x is equal to Inf or -Inf.

# Rescale `x` so its finite values span [0, 1]; Inf maps to 1 and -Inf to 0.
rescale01 <- function(x) {
  # Take the range from the untouched input so the 0/1 stand-ins for
  # -Inf/Inf never take part in (or get distorted by) the rescaling.
  rng <- range(x, na.rm = TRUE, finite = TRUE)

  case_when(
    x == Inf ~ 1,
    x == -Inf ~ 0,
    # NA compares as NA above, falls through to here and stays NA.
    .default = (x - rng[1]) / (rng[2] - rng[1])
  )
}
# I used an interval since it is suited for specific spans of time.

# Whole years between each element of `birthdates` and today's date.
get_age <- function(birthdates) {
  # `%--%` builds an interval; integer division by one year gives whole years.
  (birthdates %--% today()) %/% years(1)
}
# There are multiple definitions of skewness with similar intent. While the
# function `moments::skewness` uses Pearson’s moment coefficient of skewness
# (you can check the source code of the function by using
# `moments:::skewness`), I used Pearson’s second skewness coefficient because
# that was the formula I personally knew offhand.

# Sample variance of `vector`: squared deviations from the mean, summed and
# divided by n - 1.
variance <- function(vector) {
  sum((vector - mean(vector))^2) / (length(vector) - 1)
}

# Pearson's second skewness coefficient: 3 * (mean - median) / sd.
my_skewness <- function(vector) {
  3 * (mean(vector) - median(vector)) / sqrt(variance(vector))
}
# Wouldn’t be surprised if you could come up with a more efficient way to
# write this.
sample_vector_1 <- c(NA, 1, 2, 3, NA)
sample_vector_2 <- c(NA, NA, 4, 5, NA)

# Number of positions at which both `vector_1` and `vector_2` are NA.
both_na <- function(vector_1, vector_2) {
  na_positions_1 <- which(is.na(vector_1))
  na_positions_2 <- which(is.na(vector_2))
  length(intersect(na_positions_1, na_positions_2))
}

both_na(sample_vector_1, sample_vector_2)
## [1] 2
The first function returns a logical vector indicating whether the given files are directories. The second function returns whether the given files are readable. Both functions are a bit niche but allow you to circumvent having to read the documentation of their “parent function.”
25.3.5 Exercises:
# I personally find writing a function for this a bit strange, since it takes
# the data frame as an argument but the columns called are specific to only
# that data frame.

# Keep the rows of `df` where `arr_time` is missing or `arr_delay` is above 60.
# `df` must have `arr_time` and `arr_delay` columns.
filter_severe <- function(df) {
  df |>
    filter(is.na(arr_time) | arr_delay > 60)
}

flights |> filter_severe()
## # A tibble: 36,502 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1120            944        96     1331           1213
##  6  2013     1     1     1255           1200        55     1451           1330
##  7  2013     1     1     1301           1150        71     1518           1345
##  8  2013     1     1     1337           1220        77     1649           1531
##  9  2013     1     1     1342           1320        22     1617           1504
## 10  2013     1     1     1400           1250        70     1645           1502
## # ℹ 36,492 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Summarise `df` (per group, if grouped) into two counts: rows with a missing
# `arr_time`, and rows whose `dep_delay` is above 60.
summarize_severe <- function(df) {
  df |>
    summarise(
      # is.na() never returns NA, so no na.rm is needed for this sum.
      count_canceled = sum(is.na(arr_time)),
      count_delayed = sum(dep_delay > 60, na.rm = TRUE)
    )
}

flights |>
  group_by(dest) |>
  summarize_severe()
## # A tibble: 105 × 3
##    dest  count_canceled count_delayed
##    <chr>          <int>         <int>
##  1 ABQ                0            21
##  2 ACK                0            11
##  3 ALB               21            65
##  4 ANC                0             1
##  5 ATL              342          1285
##  6 AUS               22           181
##  7 AVL               12            16
##  8 BDL               31            50
##  9 BGR               17            50
## 10 BHM               28            50
## # ℹ 95 more rows
# `dep_delay` is in minutes.

# Keep the rows of `df` where `arr_time` is missing or `dep_delay` is more
# than `hours` hours. `hours` defaults to 1 so the one-argument call
# `filter_severe(df)` keeps working.
filter_severe <- function(df, hours = 1) {
  df |>
    filter(is.na(arr_time) | dep_delay > 60 * hours)
}
# By default, the mean function returns `NA` and a warning if given an invalid
# type. Regardless, I included my own warning to showcase how you can write it
# into a function.

# Summarise column `variable` of `df` into its minimum, maximum and average
# (NAs removed). Warns first when the column is a character vector.
summarize_weather <- function(df, variable) {
  if (df |> pull({{ variable }}) |> is.character()) {
    warning("Wrong type yah nerd.")
  }

  df |>
    summarise(
      minimum = min({{ variable }}, na.rm = TRUE),
      maximum = max({{ variable }}, na.rm = TRUE),
      average = mean({{ variable }}, na.rm = TRUE)
    )
}

weather |> summarize_weather(temp)
## # A tibble: 1 × 3
##   minimum maximum average
##     <dbl>   <dbl>   <dbl>
## 1    10.9    100.    55.3

weather |> summarize_weather(origin)
## Warning in summarize_weather(weather, origin): Wrong type yah nerd.
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `average = mean(origin, na.rm = TRUE)`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## # A tibble: 1 × 3
##   minimum maximum average
##   <chr>   <chr>     <dbl>
## 1 EWR     LGA          NA
# Overwrite column `variable` of `df` with its value converted from an
# HHMM-style number to decimal hours (hours + minutes / 60), rounded to two
# decimal places.
standardize_time <- function(df, variable) {
  df |>
    mutate(
      {{ variable }} := round(
        {{ variable }} %/% 100 + {{ variable }} %% 100 / 60,
        2
      )
    )
}

flights |> standardize_time(sched_dep_time)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <dbl>     <dbl>    <int>          <int>
##  1  2013     1     1      517           5.25         2      830            819
##  2  2013     1     1      533           5.48         4      850            830
##  3  2013     1     1      542           5.67         2      923            850
##  4  2013     1     1      544           5.75        -1     1004           1022
##  5  2013     1     1      554           6           -6      812            837
##  6  2013     1     1      554           5.97        -4      740            728
##  7  2013     1     1      555           6           -5      913            854
##  8  2013     1     1      557           6           -3      709            723
##  9  2013     1     1      557           6           -3      838            846
## 10  2013     1     1      558           6           -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Add a column named `<variable>_decimal_time` to `df` holding `variable`
# converted from an HHMM-style number to decimal hours (hours + minutes / 60),
# rounded to two decimal places. The original column is left untouched.
standardize_time <- function(df, variable) {
  df |>
    mutate(
      # Glue-style name injection builds the new column name from the
      # embraced argument, replacing deparse(substitute()).
      "{{ variable }}_decimal_time" := round(
        {{ variable }} %/% 100 + {{ variable }} %% 100 / 60,
        2
      )
    )
}

flights |>
  standardize_time(sched_dep_time) |>
  colnames()
##  [1] "year"                        "month"
##  [3] "day"                         "dep_time"
##  [5] "sched_dep_time"              "dep_delay"
##  [7] "arr_time"                    "sched_arr_time"
##  [9] "arr_delay"                   "carrier"
## [11] "flight"                      "tailnum"
## [13] "origin"                      "dest"
## [15] "air_time"                    "distance"
## [17] "hour"                        "minute"
## [19] "time_hour"                   "sched_dep_time_decimal_time"