ch_25_solutions

Prerequisites:

library(tidyverse)
library(nycflights13)

25.2.2 Exercises:

I noticed that all the function names in dplyr use snake case, so that’s the naming convention I use below. I used the ls() function to list the objects within a package to get all the function names.

# Proportion of missing values in `x` (a value between 0 and 1).
perc_na <- function(x) {
  missing_flags <- is.na(x)
  sum(missing_flags) / length(missing_flags)
}

# Share of the total that each element of `x` contributes.
#
# Missing values are ignored when computing the total, so an NA element
# yields NA in the result without affecting the other ratios.
ratio_of_sum <- function(x) {
  # Divide the argument itself; the previous body referenced the base
  # function `vector`, which made every call fail.
  x / sum(x, na.rm = TRUE)
}

# Percentage of the total that each element of `x` contributes,
# rounded to one decimal place.
#
# Missing values are ignored when computing the total.
perc_total <- function(x) {
  # Use the argument `x` in the numerator; the previous body referenced the
  # base function `vector`, which made every call fail.
  round(x / sum(x, na.rm = TRUE) * 100, 1)
}

I had trouble finding a solution cleaner than a case_when() statement in the specific cases that x was equal to Inf or -Inf.

# Rescale a numeric vector to the interval [0, 1].
#
# The range is taken from the finite, non-missing values only. -Inf is mapped
# to 0 and Inf is mapped to 1; NA stays NA.
rescale01 <- function(x) {
  # Compute the range from the ORIGINAL values. Replacing the infinities with
  # 0/1 before this step would feed those placeholders into the range and
  # distort the scaling of the finite values.
  rng <- range(x, na.rm = TRUE, finite = TRUE)
  scaled <- (x - rng[1]) / (rng[2] - rng[1])

  # Only now pin the infinite inputs to the ends of the interval.
  case_when(
    x == Inf ~ 1,
    x == -Inf ~ 0,
    .default = scaled
  )
}

I used an interval since it is suited for specific spans of time.

# Age in whole years, as of today, for each date of birth in `birthdates`.
#
# Builds a lubridate interval from each birthdate to today's date and counts
# how many complete one-year periods fit into it.
get_age <- function(birthdates) {
  # The parameter was previously misspelled `birthbates`, so the body never
  # saw the supplied argument.
  (birthdates %--% today()) %/% years(1)
}

There are multiple definitions of skewness with similar intent. While the function moments::skewness uses Pearson’s moment coefficient of skewness (you can check the source code of the function by using moments:::skewness), I used Pearson’s second skewness coefficient because that was the formula I personally knew offhand.
```
# Sample variance of `vector` (n - 1 denominator).
#
# Missing values are not removed: any NA in the input gives NA.
variance <- function(vector) {
  # Operate on the declared parameter; the previous body referenced an
  # undefined `x`.
  sum((vector - mean(vector))^2) / (length(vector) - 1)
}

# Pearson's second skewness coefficient of `vector`:
# 3 * (mean - median) / standard deviation.
#
# The standard deviation is the square root of `variance()`.
my_skewness <- function(vector) {
  # Operate on the declared parameter; the previous body referenced an
  # undefined `x`.
  3 * (mean(vector) - median(vector)) / sqrt(variance(vector))
}
```

Wouldn’t be surprised if you could come up with a more efficient way to write this.

# Example inputs: both vectors are missing at positions 1 and 5.
sample_vector_1 <- c(NA_real_, 1, 2, 3, NA_real_)
sample_vector_2 <- c(NA_real_, NA_real_, 4, 5, NA_real_)

# Count the positions at which both `vector_1` and `vector_2` are missing.
both_na <- function(vector_1, vector_2) {
  na_positions_1 <- which(is.na(vector_1))
  na_positions_2 <- which(is.na(vector_2))

  # Each position appears at most once, so counting the matches equals the
  # size of the intersection.
  sum(na_positions_1 %in% na_positions_2)
}

both_na(vector_1 = sample_vector_1, vector_2 = sample_vector_2)
## [1] 2

The first function returns a logical vector indicating whether each of the given files is a directory. The second function returns whether each of the given files is readable. Both functions are a bit niche but allow you to circumvent having to read the documentation of their “parent function.”

25.3.5 Exercises:

I personally find writing a function for this a bit strange, since it takes the data frame as an argument but the columns called are specific to only that data frame.

# Keep the rows of `df` with a missing `arr_time` or an `arr_delay` above
# 60 (minutes).
filter_severe <- function(df) {
  filter(df, is.na(arr_time) | arr_delay > 60)
}

filter_severe(flights)
## # A tibble: 36,502 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1120            944        96     1331           1213
##  6  2013     1     1     1255           1200        55     1451           1330
##  7  2013     1     1     1301           1150        71     1518           1345
##  8  2013     1     1     1337           1220        77     1649           1531
##  9  2013     1     1     1342           1320        22     1617           1504
## 10  2013     1     1     1400           1250        70     1645           1502
## # ℹ 36,492 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Summarise `df` (per group, if grouped) into two counts:
#   count_canceled - rows with a missing `arr_time`
#   count_delayed  - rows with `dep_delay` above 60 (minutes)
summarize_severe <- function(df) {
  summarise(
    df,
    count_canceled = sum(is.na(arr_time), na.rm = TRUE),
    count_delayed = sum(dep_delay > 60, na.rm = TRUE)
  )
}

# One summary row per destination.
summarize_severe(group_by(flights, dest))
## # A tibble: 105 × 3
##    dest  count_canceled count_delayed
##    <chr>          <int>         <int>
##  1 ABQ                0            21
##  2 ACK                0            11
##  3 ALB               21            65
##  4 ANC                0             1
##  5 ATL              342          1285
##  6 AUS               22           181
##  7 AVL               12            16
##  8 BDL               31            50
##  9 BGR               17            50
## 10 BHM               28            50
## # ℹ 95 more rows

dep_delay is in minutes.

# Keep the rows of `df` with a missing `arr_time` or a `dep_delay` above
# `hours` hours.
#
# `dep_delay` is in minutes, so the threshold is converted with 60 * hours.
# `hours` defaults to 1 so that existing one-argument calls such as
# `filter_severe(flights)` keep working after this redefinition.
filter_severe <- function(df, hours = 1) {
  df |>
    filter(is.na(arr_time) | dep_delay > 60 * hours)
}

By default, the mean function returns NA and a warning if given an invalid type. Regardless, I included my own warning to showcase how you can write it into a function.

# Minimum, maximum and mean of one column of `df`, ignoring missing values.
#
# `variable` is an unquoted column name. A warning is raised first when the
# column holds character data; the summary is still computed afterwards.
summarize_weather <- function(df, variable) {
  column_values <- pull(df, {{ variable }})
  if (is.character(column_values)) {
    warning('Wrong type yah nerd.')
  }

  summarise(
    df,
    minimum = min({{ variable }}, na.rm = TRUE),
    maximum = max({{ variable }}, na.rm = TRUE),
    average = mean({{ variable }}, na.rm = TRUE)
  )
}

summarize_weather(weather, temp)
## # A tibble: 1 × 3
##   minimum maximum average
##     <dbl>   <dbl>   <dbl>
## 1    10.9    100.    55.3

# A character column triggers the custom warning.
summarize_weather(weather, origin)
## Warning in summarize_weather(weather, origin): Wrong type yah nerd.
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `average = mean(origin, na.rm = TRUE)`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## # A tibble: 1 × 3
##   minimum maximum average
##   <chr>   <chr>     <dbl>
## 1 EWR     LGA          NA

# Overwrite the HHMM-style column `variable` with decimal hours
# (hours + minutes / 60), rounded to two decimal places.
standardize_time <- function(df, variable) {
  mutate(
    df,
    {{ variable }} := round(
      {{ variable }} %/% 100 + {{ variable }} %% 100 / 60,
      2
    )
  )
}

flights |>
  standardize_time(sched_dep_time)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <dbl>     <dbl>    <int>          <int>
##  1  2013     1     1      517           5.25         2      830            819
##  2  2013     1     1      533           5.48         4      850            830
##  3  2013     1     1      542           5.67         2      923            850
##  4  2013     1     1      544           5.75        -1     1004           1022
##  5  2013     1     1      554           6           -6      812            837
##  6  2013     1     1      554           5.97        -4      740            728
##  7  2013     1     1      555           6           -5      913            854
##  8  2013     1     1      557           6           -3      709            723
##  9  2013     1     1      557           6           -3      838            846
## 10  2013     1     1      558           6           -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Add a column named `<variable>_decimal_time` holding the HHMM-style column
# `variable` as decimal hours (hours + minutes / 60), rounded to two decimal
# places. The original column is left untouched.
standardize_time <- function(df, variable) {
  df |>
    mutate(
      # Glue-style name injection builds the new column name from the
      # embraced argument, replacing the manual deparse(substitute()) step.
      "{{ variable }}_decimal_time" := round(
        {{ variable }} %/% 100 + {{ variable }} %% 100 / 60,
        2
      )
    )
}

# The new column is appended after the existing ones.
flights |>
  standardize_time(sched_dep_time) |>
  colnames()
##  [1] "year"                        "month"                      
##  [3] "day"                         "dep_time"                   
##  [5] "sched_dep_time"              "dep_delay"                  
##  [7] "arr_time"                    "sched_arr_time"             
##  [9] "arr_delay"                   "carrier"                    
## [11] "flight"                      "tailnum"                    
## [13] "origin"                      "dest"                       
## [15] "air_time"                    "distance"                   
## [17] "hour"                        "minute"                     
## [19] "time_hour"                   "sched_dep_time_decimal_time"