Data preparation

  • Two numeric - arr_delay, dep_delay
  • Two categories
    • season ("winter", "summer"),
    • day_hour ("morning", "not morning")
  • Three categories - origin ("EWR", "JFK", "LGA")
  • Sixteen categories - carrier

Hypothesis tests

One numerical variable (mean)

Observed stat

# Observed statistic: sample mean of dep_delay.
x_bar <- fli_small %>%
  specify(response = dep_delay) %>%
  calculate(stat = "mean")
x_bar
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  10.4
# Null distribution of the mean: 1000 replicates under the point null mu = 10.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 10) %>%
  generate(reps = 1000) %>%
  calculate(stat = "mean")
# Plot the null distribution, passing the observed mean as obs_stat.
visualize(null_distn, obs_stat = x_bar, direction = "two_sided")

# Two-sided p-value for the observed mean.
get_pvalue(null_distn, obs_stat = x_bar, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.801

One numerical variable (standardized mean \(t\))

Observed stat

# Observed standardized mean (t). The hypothesized mean is declared before
# calculate() so the observed t is measured against mu = 8, the same null
# value used to build the simulated distribution below. Without it the
# statistic is presumably computed against a mean of 0 (6.93 is far larger
# than a sample mean of 10.4 versus 8 would give) and is not comparable to
# the null distribution.
( t_bar <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  calculate(stat = "t") )
# NOTE(review): the printed values in this section (stat 6.93, p-value 0)
# were rendered before mu = 8 was supplied to the observed statistic;
# re-knit the document to refresh them.
## Response: dep_delay (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  6.93
# Null distribution of t: 1000 replicates under the point null mu = 8.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  generate(reps = 1000) %>%
  calculate(stat = "t")
null_distn %>% 
  visualize(obs_stat = t_bar, direction = "two_sided")

null_distn %>%
  get_pvalue(obs_stat = t_bar, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1      0.

One numerical variable (median)

Observed stat

# Observed statistic: sample median of dep_delay.
x_tilde <- fli_small %>%
  specify(response = dep_delay) %>%
  calculate(stat = "median")
x_tilde
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1   -2.
# Null distribution of the median: 1000 replicates under the point null
# med = -1.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", med = -1) %>%
  generate(reps = 1000) %>%
  calculate(stat = "median")
# Plot the null distribution, passing the observed median as obs_stat.
visualize(null_distn, obs_stat = x_tilde, direction = "two_sided")

# Two-sided p-value for the observed median.
get_pvalue(null_distn, obs_stat = x_tilde, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1  0.0760

One categorical (one proportion)

Observed stat

# Observed statistic: proportion of flights with day_hour == "morning".
p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.466
# Null distribution of the proportion: 1000 replicates under the point null
# p = 0.5.
null_distn <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  hypothesize(null = "point", p = 0.5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")
# Plot the null distribution, passing the observed proportion as obs_stat.
visualize(null_distn, obs_stat = p_hat, direction = "two_sided")

# Two-sided p-value for the observed proportion.
get_pvalue(null_distn, obs_stat = p_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.101

Logical variables will be coerced to factors:

# Same one-proportion null distribution, but with a logical response column:
# day_hour_logical is TRUE for morning flights, and the success level is
# given as the string "TRUE".
null_distn <- fli_small %>%
  mutate(day_hour_logical = day_hour == "morning") %>%
  specify(response = day_hour_logical, success = "TRUE") %>%
  hypothesize(null = "point", p = 0.5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")

One categorical variable (standardized proportion \(z\))

Not yet implemented.

Two categorical (2 level) variables

Observed stat

# Observed statistic: difference in morning proportions, winter minus summer
# (the order argument fixes the direction of the subtraction).
d_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
d_hat
## # A tibble: 1 x 1
##      stat
##     <dbl>
## 1 -0.0205
# Null distribution under independence of day_hour and season, 1000 replicates.
null_distn <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
# Plot the null distribution, passing the observed difference as obs_stat.
visualize(null_distn, obs_stat = d_hat, direction = "two_sided")

# Two-sided p-value for the observed difference in proportions.
get_pvalue(null_distn, obs_stat = d_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.650

Two categorical (2 level) variables (z)

Standardized observed stat

# Observed statistic: standardized (z) difference in morning proportions,
# winter minus summer.
z_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("winter", "summer"))
z_hat
## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 -0.460
# Null distribution of z under independence, 1000 replicates.
null_distn <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "z", order = c("winter", "summer"))
# Plot the null distribution, passing the observed z as obs_stat.
visualize(null_distn, obs_stat = z_hat, direction = "two_sided")

# Two-sided p-value for the observed z statistic.
get_pvalue(null_distn, obs_stat = z_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.633

Note the similarities in this plot and the previous one.

One categorical (>2 level) - GoF

Observed stat

Note the need to add in the hypothesized values here to compute the observed statistic.

# Observed chi-square goodness-of-fit statistic for origin. The hypothesized
# proportions are supplied through hypothesize() before calculate(), because
# the observed statistic cannot be computed without them. Defining Chisq_hat
# here also means visualize() and get_pvalue() below refer to this section's
# statistic rather than to an undefined (or stale) object.
( Chisq_hat <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(null = "point", 
              p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% 
  calculate(stat = "Chisq") )
## Response: origin (factor)
## Null Hypothesis: point
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  10.4
# Null distribution of the chi-square statistic: 1000 simulated replicates
# under the same hypothesized proportions.
null_distn <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(null = "point", 
              p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% 
  generate(reps = 1000, type = "simulate") %>% 
  calculate(stat = "Chisq")
null_distn %>% 
  visualize(obs_stat = Chisq_hat, direction = "greater")

null_distn %>%
  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1 0.00500

Two categorical (>2 level) variables

Observed stat

# Observed statistic: chi-square statistic for day_hour versus origin.
Chisq_hat <- fli_small %>%
  specify(formula = day_hour ~ origin) %>%
  calculate(stat = "Chisq")
Chisq_hat
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  9.03
# Null distribution under independence: 1000 permutation replicates.
null_distn <- fli_small %>%
  specify(day_hour ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")
# Plot the null distribution, passing the observed chi-square as obs_stat.
visualize(null_distn, obs_stat = Chisq_hat, direction = "greater")

# Right-tailed p-value for the observed chi-square statistic.
get_pvalue(null_distn, obs_stat = Chisq_hat, direction = "greater")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1 0.00700

One numerical variable, one categorical (2 levels) (diff in means)

Observed stat

# Observed statistic: difference in mean dep_delay, summer minus winter.
d_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  2.27
# Null distribution under independence: 1000 permutation replicates.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
# Plot the null distribution, passing the observed difference as obs_stat.
visualize(null_distn, obs_stat = d_hat, direction = "two_sided")

# Two-sided p-value for the observed difference in means.
get_pvalue(null_distn, obs_stat = d_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.456

One numerical variable, one categorical (2 levels) (t)

Standardized observed stat

# Observed statistic: two-sample t for dep_delay by season, summer minus
# winter.
t_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat
## Response: dep_delay (numeric)
## Explanatory: season (factor)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.754
# Null distribution of t under independence: 1000 permutation replicates.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("summer", "winter"))
# Plot the null distribution, passing the observed t as obs_stat.
visualize(null_distn, obs_stat = t_hat, direction = "two_sided")

# Two-sided p-value for the observed t statistic.
get_pvalue(null_distn, obs_stat = t_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.487

Note the similarities in this plot and the previous one.

One numerical variable, one categorical (2 levels) (diff in medians)

Observed stat

# Observed statistic: difference in median dep_delay, summer minus winter.
d_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))
d_hat
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1    2.
# Null distribution under independence: 1000 permutation replicates.
# The formula in specify() could alternatively be written with named
# arguments: response = dep_delay, explanatory = season.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))
# Plot the null distribution, passing the observed difference as obs_stat.
visualize(null_distn, obs_stat = d_hat, direction = "two_sided")

# Two-sided p-value for the observed difference in medians.
get_pvalue(null_distn, obs_stat = d_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1  0.0290

One numerical, one categorical (>2 levels) - ANOVA

Observed stat

# Observed statistic: ANOVA F for arr_delay across the origin groups.
F_hat <- fli_small %>%
  specify(arr_delay ~ origin) %>%
  calculate(stat = "F")
F_hat
## Response: arr_delay (numeric)
## Explanatory: origin (factor)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  1.08
# Null distribution of F under independence: 1000 permutation replicates.
null_distn <- fli_small %>%
  specify(arr_delay ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")
# Plot the null distribution, passing the observed F as obs_stat.
visualize(null_distn, obs_stat = F_hat, direction = "greater")

# Right-tailed p-value for the observed F statistic.
get_pvalue(null_distn, obs_stat = F_hat, direction = "greater")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.353

Two numerical vars - SLR

Observed stat

# Observed statistic: slope for arr_delay ~ dep_delay.
slope_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")
slope_hat
## Response: arr_delay (numeric)
## Explanatory: dep_delay (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  1.02
# Null distribution of the slope under independence: 1000 permutation
# replicates.
null_distn <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")
# Plot the null distribution, passing the observed slope as obs_stat.
visualize(null_distn, obs_stat = slope_hat, direction = "two_sided")

# Two-sided p-value for the observed slope.
get_pvalue(null_distn, obs_stat = slope_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1      0.

Two numerical vars - correlation

Observed stat

# Observed statistic: correlation between arr_delay and dep_delay.
correlation_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat
## Response: arr_delay (numeric)
## Explanatory: dep_delay (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.894
# Null distribution of the correlation under independence: 1000 permutation
# replicates.
null_distn <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")
# Plot the null distribution, passing the observed correlation as obs_stat.
visualize(null_distn, obs_stat = correlation_hat, direction = "two_sided")

# Two-sided p-value for the observed correlation.
get_pvalue(null_distn, obs_stat = correlation_hat, direction = "two_sided")
## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1      0.

Two numerical vars - SLR (t)

Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.

Confidence intervals

One numerical (one mean)

Point estimate

# Point estimate: sample mean of arr_delay.
x_bar <- fli_small %>%
  specify(response = arr_delay) %>%
  calculate(stat = "mean")
x_bar
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  4.57
# Bootstrap distribution of the mean: 1000 resamples.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1   1.44    7.82
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed mean as its point estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = x_bar)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1  1.27  7.88
visualize(boot, endpoints = standard_error_ci, direction = "between")

One numerical (one mean - standardized)

Point estimate

# Point estimate: one-sample t statistic for arr_delay.
t_hat <- fli_small %>%
  specify(response = arr_delay) %>%
  calculate(stat = "t")
t_hat
## Response: arr_delay (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  2.68
# Bootstrap distribution of t: 1000 resamples.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.934    4.36
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed t as its point estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.914  4.44
visualize(boot, endpoints = standard_error_ci, direction = "between")

One categorical (one proportion)

Point estimate

# Point estimate: proportion of flights with day_hour == "morning".
p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.466
# Bootstrap distribution of the proportion: 1000 resamples.
boot <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.420   0.508
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed proportion as its point
# estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = p_hat)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.422 0.510
visualize(boot, endpoints = standard_error_ci, direction = "between")

One categorical variable (standardized proportion \(z\))

Not yet implemented.

One numerical variable, one categorical (2 levels) (diff in means)

Point estimate

# Point estimate: difference in mean arr_delay, summer minus winter.
d_hat <- fli_small %>%
  specify(arr_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat
## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 -0.745
# Bootstrap distribution of the difference in means: 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  -7.17    6.08
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed difference as its point
# estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 -7.30  5.81
visualize(boot, endpoints = standard_error_ci, direction = "between")

One numerical variable, one categorical (2 levels) (t)

Standardized point estimate

# Standardized point estimate: two-sample t for arr_delay by season, summer
# minus winter.
t_hat <- fli_small %>%
  specify(arr_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat
## Response: arr_delay (numeric)
## Explanatory: season (factor)
## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 -0.218
# Bootstrap distribution of t: 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t", order = c("summer", "winter"))
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  -2.24    1.72
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed t as its point estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 -2.18  1.75
visualize(boot, endpoints = standard_error_ci, direction = "between")

Two categorical variables (diff in proportions)

Point estimate

# Point estimate: difference in morning proportions, summer minus winter.
d_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
d_hat
## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 0.0205
# Bootstrap distribution of the difference in proportions: 1000 resamples.
boot <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##    `2.5%` `97.5%`
##     <dbl>   <dbl>
## 1 -0.0648   0.108
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed difference as its point
# estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat)
standard_error_ci
## # A tibble: 1 x 2
##     lower upper
##     <dbl> <dbl>
## 1 -0.0676 0.109
visualize(boot, endpoints = standard_error_ci, direction = "between")

Two categorical variables (z)

Standardized point estimate

# Standardized point estimate: z for the difference in morning proportions,
# summer minus winter.
z_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("summer", "winter"))
z_hat
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.460
# Bootstrap distribution of z: 1000 resamples.
boot <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "z", order = c("summer", "winter"))
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  -1.48    2.50
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed z as its point estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = z_hat)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 -1.52  2.44
visualize(boot, endpoints = standard_error_ci, direction = "between")

Two numerical vars - SLR

Point estimate

# Point estimate: slope for arr_delay ~ dep_delay.
slope_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")
slope_hat
## Response: arr_delay (numeric)
## Explanatory: dep_delay (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  1.02
# Bootstrap distribution of the slope: 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope")
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
percentile_ci <- get_ci(boot)
percentile_ci
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.973    1.07
visualize(boot, endpoints = percentile_ci, direction = "between")

# Interval of type "se", which takes the observed slope as its point estimate.
standard_error_ci <- get_ci(boot, type = "se", point_estimate = slope_hat)
standard_error_ci
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.965  1.07
visualize(boot, endpoints = standard_error_ci, direction = "between")

Two numerical vars - correlation

Point estimate

# Point estimate: correlation between arr_delay and dep_delay.
( correlation_hat <- fli_small %>% 
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation") )
## Response: arr_delay (numeric)
## Explanatory: dep_delay (numeric)
## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.894
# Bootstrap distribution of the correlation: 1000 resamples.
boot <- fli_small %>%
   specify(arr_delay ~ dep_delay) %>% 
   generate(reps = 1000, type = "bootstrap") %>%
   calculate(stat = "correlation")
# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# columns in the output below).
( percentile_ci <- get_ci(boot) )
## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.850   0.922
boot %>% visualize(endpoints = percentile_ci, direction = "between")

# Interval of type "se" for the correlation. It must be recomputed here:
# otherwise standard_error_ci still holds the interval from an earlier
# section and the plot below would not match the printed endpoints.
( standard_error_ci <- get_ci(boot, type = "se", 
                              point_estimate = correlation_hat) )
## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.858 0.931
boot %>% visualize(endpoints = standard_error_ci, direction = "between")

Two numerical vars - t

Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.