ch_25_solutions

Prerequisites:

library(tidyverse)
library(nycflights13)

25.2.5 Exercises:

  1. I noticed all the function names in dplyr use snake case, so that’s the naming convention I use below. I used the ls() function to list the objects within the package to get all the function names.

    # Proportion of missing values in `x`.
    #
    # x: a vector that may contain NA.
    # Returns a single number: the count of NA entries divided by the number
    # of entries (NaN for empty input).
    perc_na <- function(x) {
      na_flags <- is.na(x)
      sum(na_flags) / length(na_flags)
    }
    
    # Share of the total contributed by each element of `x`.
    #
    # x: a numeric vector; NA values are ignored when computing the total.
    # Returns a numeric vector the same length as `x` (NA stays NA).
    ratio_of_sum <- function(x) {
      # Divide the argument itself; the bare name `vector` would resolve to
      # the base R function and fail with a non-numeric argument error.
      x / sum(x, na.rm = TRUE)
    }
    
    # Percentage of the total contributed by each element of `x`,
    # rounded to one decimal place.
    #
    # x: a numeric vector; NA values are ignored when computing the total.
    # Returns a numeric vector the same length as `x` (NA stays NA).
    perc_total <- function(x) {
      # Use the argument `x`; `vector` is the base R function, not the data.
      round(x / sum(x, na.rm = TRUE) * 100, 1)
    }
  2. I had trouble finding a solution cleaner than a case_when() statement in the specific cases that x was equal to Inf or -Inf.

    # Rescale `x` so that its finite values span [0, 1], mapping -Inf to 0
    # and Inf to 1.
    #
    # x: a numeric vector.
    # Returns a numeric vector the same length as `x`; NA stays NA.
    rescale01 <- function(x) {
      # Rescale first, using only the finite values for the range. Swapping
      # the infinities for 0/1 before this step would feed the substitutes
      # into the rescaling, e.g. c(1, 2, 3, Inf) would send Inf to 0.
      rng <- range(x, na.rm = TRUE, finite = TRUE)
      scaled <- (x - rng[1]) / (rng[2] - rng[1])

      # Only now pin the infinite inputs to the ends of the unit interval.
      case_when(
        x == Inf ~ 1,
        x == -Inf ~ 0,
        .default = scaled
      )
    }
  3. I used an interval since it is suited for specific spans of time.

    # Age in completed years as of today.
    #
    # birthdates: a vector of dates of birth.
    # Returns the number of whole years in the interval from each birthdate
    # to today's date.
    get_age <- function(birthdates) {
      # The parameter was previously misspelled `birthbates`, so the body
      # looked up `birthdates` outside the function instead of the argument.
      (birthdates %--% today()) %/% years(1)
    }
  4. There are multiple definitions of skewness with similar intent. While the function moments::skewness uses Pearson’s moment coefficient of skewness (you can check the source code of the function by running moments:::skewness), I used Pearson’s second skewness coefficient because that was the formula I personally knew offhand.

    # Sample variance of a numeric vector (divides by n - 1).
    #
    # vector: a numeric vector without missing values.
    # Returns a single number.
    variance <- function(vector) {
      # Refer to the parameter itself; the body previously used an
      # undefined `x`.
      sum((vector - mean(vector))^2) / (length(vector) - 1)
    }
    
    # Pearson's second skewness coefficient:
    # 3 * (mean - median) / standard deviation.
    #
    # vector: a numeric vector without missing values.
    # Returns a single number; 0 when the mean equals the median.
    my_skewness <- function(vector) {
      # Refer to the parameter itself; the body previously used an
      # undefined `x`.
      3 * (mean(vector) - median(vector)) / sqrt(variance(vector))
    }
  5. Wouldn’t be surprised if you could come up with a more efficient way to write this.

    sample_vector_1 <- c(NA, 1, 2, 3, NA)
    sample_vector_2 <- c(NA, NA, 4, 5, NA)
    
    # Count the positions at which both vectors hold a missing value.
    #
    # vector_1, vector_2: vectors to compare position by position.
    # Returns a single integer.
    both_na <- function(vector_1, vector_2) {
      missing_in_1 <- which(is.na(vector_1))
      missing_in_2 <- which(is.na(vector_2))
      # A position counts when it is missing in the first vector and also
      # appears among the missing positions of the second.
      sum(missing_in_1 %in% missing_in_2)
    }
    
    both_na(sample_vector_1, sample_vector_2)
    ## [1] 2
  6. The first function returns a logical vector indicating whether each of the given files is a directory. The second function returns whether each of the given files is readable. Both functions are a bit niche but allow you to circumvent having to read the documentation of their “parent function.”

25.3.5 Exercises:

  1. I personally find writing a function for this a bit strange, since it takes the data frame as an argument but the columns called are specific to only that data frame.

    # Keep the rows of `df` that have no arrival time or whose `arr_delay`
    # exceeds 60.
    #
    # df: a data frame with `arr_time` and `arr_delay` columns.
    # Returns the matching rows of `df`.
    filter_severe <- function(df) {
      filter(df, arr_delay > 60 | is.na(arr_time))
    }
    
    flights |> filter_severe()
    ## # A tibble: 36,502 × 19
    ##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
    ##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
    ##  1  2013     1     1      811            630       101     1047            830
    ##  2  2013     1     1      848           1835       853     1001           1950
    ##  3  2013     1     1      957            733       144     1056            853
    ##  4  2013     1     1     1114            900       134     1447           1222
    ##  5  2013     1     1     1120            944        96     1331           1213
    ##  6  2013     1     1     1255           1200        55     1451           1330
    ##  7  2013     1     1     1301           1150        71     1518           1345
    ##  8  2013     1     1     1337           1220        77     1649           1531
    ##  9  2013     1     1     1342           1320        22     1617           1504
    ## 10  2013     1     1     1400           1250        70     1645           1502
    ## # ℹ 36,492 more rows
    ## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
    ## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
    ## #   hour <dbl>, minute <dbl>, time_hour <dttm>
  2. summarize_severe <- function(df) {
      # Summarise `df` (once per group if it is grouped) into two counts:
      #   count_canceled - rows with a missing `arr_time`
      #   count_delayed  - rows whose `dep_delay` exceeds 60
      summarise(
        df,
        # is.na() never yields NA, so the sum needs no na.rm here.
        count_canceled = sum(is.na(arr_time)),
        count_delayed = sum(dep_delay > 60, na.rm = TRUE)
      )
    }
    
    flights |> 
      group_by(dest) |> 
      summarize_severe()
    ## # A tibble: 105 × 3
    ##    dest  count_canceled count_delayed
    ##    <chr>          <int>         <int>
    ##  1 ABQ                0            21
    ##  2 ACK                0            11
    ##  3 ALB               21            65
    ##  4 ANC                0             1
    ##  5 ATL              342          1285
    ##  6 AUS               22           181
    ##  7 AVL               12            16
    ##  8 BDL               31            50
    ##  9 BGR               17            50
    ## 10 BHM               28            50
    ## # ℹ 95 more rows
  3. dep_delay is in minutes.

    # Keep the rows of `df` that have no arrival time or whose departure
    # delay exceeds `hours` hours.
    #
    # df:    a data frame with `arr_time` and `dep_delay` columns.
    # hours: delay threshold in hours. Defaults to 1 so that the earlier
    #        zero-argument call `filter_severe()` keeps working after this
    #        redefinition.
    # Returns the matching rows of `df`.
    filter_severe <- function(df, hours = 1) {
      df |>
        # dep_delay is in minutes, so convert the threshold from hours.
        filter(is.na(arr_time) | dep_delay > 60 * hours)
    }
  4. By default, the mean function returns NA and a warning if given an invalid type. Regardless, I included my own warning to showcase how you can write it into a function.

    # Summarise one column of `df` into its minimum, maximum and mean,
    # ignoring missing values.
    #
    # df:       a data frame.
    # variable: unquoted name of the column to summarise.
    # Returns a data frame with columns `minimum`, `maximum` and `average`.
    # Emits a warning (but still proceeds) when the column is character.
    summarize_weather <- function(df, variable) {
      column_values <- pull(df, {{ variable }})
      if (is.character(column_values)) {
        warning("Wrong type yah nerd.")
      }

      summarise(
        df,
        minimum = min({{ variable }}, na.rm = TRUE),
        maximum = max({{ variable }}, na.rm = TRUE),
        average = mean({{ variable }}, na.rm = TRUE)
      )
    }
    weather |> 
      summarize_weather(temp)
    ## # A tibble: 1 × 3
    ##   minimum maximum average
    ##     <dbl>   <dbl>   <dbl>
    ## 1    10.9    100.    55.3
    weather |> 
      summarize_weather(origin)
    ## Warning in summarize_weather(weather, origin): Wrong type yah nerd.
    ## Warning: There was 1 warning in `summarise()`.
    ## ℹ In argument: `average = mean(origin, na.rm = TRUE)`.
    ## Caused by warning in `mean.default()`:
    ## ! argument is not numeric or logical: returning NA
    ## # A tibble: 1 × 3
    ##   minimum maximum average
    ##   <chr>   <chr>     <dbl>
    ## 1 EWR     LGA          NA
  5. standardize_time <- function(df, variable) {
      # Overwrite the column `variable` with a decimal value: the integer
      # quotient by 100 plus the remainder divided by 60 (i.e. an HHMM-style
      # number becomes hours with a fractional part), rounded to 2 digits.
      mutate(
        df,
        {{ variable }} := round(
          {{ variable }} %/% 100 + ({{ variable }} %% 100) / 60,
          2
        )
      )
    }
    
    flights |> 
    standardize_time(sched_dep_time)
    ## # A tibble: 336,776 × 19
    ##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
    ##    <int> <int> <int>    <int>          <dbl>     <dbl>    <int>          <int>
    ##  1  2013     1     1      517           5.25         2      830            819
    ##  2  2013     1     1      533           5.48         4      850            830
    ##  3  2013     1     1      542           5.67         2      923            850
    ##  4  2013     1     1      544           5.75        -1     1004           1022
    ##  5  2013     1     1      554           6           -6      812            837
    ##  6  2013     1     1      554           5.97        -4      740            728
    ##  7  2013     1     1      555           6           -5      913            854
    ##  8  2013     1     1      557           6           -3      709            723
    ##  9  2013     1     1      557           6           -3      838            846
    ## 10  2013     1     1      558           6           -2      753            745
    ## # ℹ 336,766 more rows
    ## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
    ## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
    ## #   hour <dbl>, minute <dbl>, time_hour <dttm>
    # Add a decimal-hours version of an HHMM-style column as a new column
    # named `<variable>_decimal_time`, leaving the original column untouched.
    #
    # df:       a data frame.
    # variable: unquoted name of the column to convert.
    # Returns `df` with the extra column appended.
    standardize_time <- function(df, variable) {
      df |>
        mutate(
          # Glue-style name injection builds the new column name from the
          # embraced argument; no deparse(substitute()) round trip is needed.
          "{{ variable }}_decimal_time" := round(
            {{ variable }} %/% 100 + {{ variable }} %% 100 / 60,
            2
          )
        )
    }
    
    flights |> 
    standardize_time(sched_dep_time) |> 
      colnames()
    ##  [1] "year"                        "month"                      
    ##  [3] "day"                         "dep_time"                   
    ##  [5] "sched_dep_time"              "dep_delay"                  
    ##  [7] "arr_time"                    "sched_arr_time"             
    ##  [9] "arr_delay"                   "carrier"                    
    ## [11] "flight"                      "tailnum"                    
    ## [13] "origin"                      "dest"                       
    ## [15] "air_time"                    "distance"                   
    ## [17] "hour"                        "minute"                     
    ## [19] "time_hour"                   "sched_dep_time_decimal_time"