Data preparation

library(nycflights13)
library(dplyr)
library(ggplot2)
library(stringr)
library(infer)
# Fix the RNG seed so the 500-row sample drawn below is reproducible.
set.seed(2017)

# Draw 500 complete-case flights, derive two 2-level categorical variables,
# and keep only the six columns used in the examples.
fli_small <- flights %>%
  na.omit() %>%
  sample_n(size = 500) %>%
  mutate(
    # Months 4-9 are labelled "summer"; months 1-3 and 10-12 "winter".
    season = case_when(
      month %in% 4:9 ~ "summer",
      month %in% c(10:12, 1:3) ~ "winter"
    ),
    # Hours 1-12 are labelled "morning"; hours 13-24 "not morning".
    day_hour = case_when(
      between(hour, 13, 24) ~ "not morning",
      between(hour, 1, 12) ~ "morning"
    )
  ) %>%
  select(arr_delay, dep_delay, season, day_hour, origin, carrier)

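The resulting `fli_small` sample keeps six variables:
- Two numeric variables - arr_delay, dep_delay
- Two categorical variables with two categories each
  - season ("winter", "summer"),
  - day_hour ("morning", "not morning")
- One categorical variable with three categories - origin ("EWR", "JFK", "LGA")
- One categorical variable with sixteen categories - carrier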

Hypothesis tests

One numerical variable (mean)

Observed stat

# Observed statistic: the sample mean of dep_delay.
x_bar <- fli_small %>%
  specify(response = dep_delay) %>%
  calculate(stat = "mean")
x_bar

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  11.5

# Null distribution: 1000 replicates of the mean under H0: mu = 10.
# `type` is left unset, so generate() picks it (see the message below).
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 10) %>%
  generate(reps = 1000) %>%
  calculate(stat = "mean")

## Setting `type = "bootstrap"` in `generate()`.

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = x_bar, direction = "two_sided")

# Two-sided p-value of the observed mean against the null distribution.
get_p_value(null_distn, obs_stat = x_bar, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.356

One numerical variable (standardized mean \(t\))

Observed stat

# Observed statistic: the one-sample t statistic of dep_delay against the
# hypothesized mean mu = 8. It is computed with the same
# specify() -> hypothesize() -> calculate() pipeline used for the null
# distribution below, rather than the deprecated t_stat() shortcut.
t_bar <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  calculate(stat = "t")

# Null distribution: 1000 replicates of the t statistic under H0: mu = 8.
# `type` is left unset, so generate() picks it (see the message below).
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", mu = 8) %>%
  generate(reps = 1000) %>%
  calculate(stat = "t")

## Setting `type = "bootstrap"` in `generate()`.

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = t_bar, direction = "two_sided")

# Two-sided p-value of the observed t statistic.
get_p_value(null_distn, obs_stat = t_bar, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.018

One numerical variable (median)

Observed stat

# Observed statistic: the sample median of dep_delay.
x_tilde <- fli_small %>%
  specify(response = dep_delay) %>%
  calculate(stat = "median")
x_tilde

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1    -2

# Null distribution: 1000 replicates of the median under H0: median = -1.
null_distn <- fli_small %>%
  specify(response = dep_delay) %>%
  hypothesize(null = "point", med = -1) %>%
  generate(reps = 1000) %>%
  calculate(stat = "median")

## Setting `type = "bootstrap"` in `generate()`.

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = x_tilde, direction = "two_sided")

# Two-sided p-value of the observed median.
get_p_value(null_distn, obs_stat = x_tilde, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.018

One categorical (one proportion)

Observed stat

# Observed statistic: the sample proportion of "morning" flights.
p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.452

# Null distribution: 1000 replicates of the proportion under H0: p = 0.5.
null_distn <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  hypothesize(null = "point", p = .5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")

## Setting `type = "simulate"` in `generate()`.

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = p_hat, direction = "two_sided")

# Two-sided p-value of the observed proportion.
get_p_value(null_distn, obs_stat = p_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.036

Logical variables will be coerced to factors:

# Same one-proportion null distribution, but with a logical response column;
# the success level is then given as the string "TRUE".
null_distn <- fli_small %>%
  mutate(day_hour_logical = day_hour == "morning") %>%
  specify(response = day_hour_logical, success = "TRUE") %>%
  hypothesize(null = "point", p = .5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")

## Setting `type = "simulate"` in `generate()`.

One categorical variable (standardized proportion \(z\))

Not yet implemented.

Two categorical (2 level) variables

Observed stat

# Observed statistic: difference in the proportion of "morning" flights
# between seasons, with the levels ordered as c("winter", "summer").
d_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
d_hat

## # A tibble: 1 x 1
##      stat
##     <dbl>
## 1 0.00438

# Null distribution: 1000 replicates under independence of day_hour and
# season.
null_distn <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))

## Setting `type = "permute"` in `generate()`.

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")

# Two-sided p-value of the observed difference in proportions.
get_p_value(null_distn, obs_stat = d_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.954

Two categorical (2 level) variables (z)

Standardized observed stat

# Standardized observed statistic: the z statistic for the difference in
# "morning" proportions, with the levels ordered as c("winter", "summer").
z_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("winter", "summer"))
z_hat

## # A tibble: 1 x 1
##     stat
##    <dbl>
## 1 0.0985

# Null distribution: 1000 replicates of z under independence.
null_distn <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "z", order = c("winter", "summer"))

## Setting `type = "permute"` in `generate()`.

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = z_hat, direction = "two_sided")

# Two-sided p-value of the observed z statistic.
get_p_value(null_distn, obs_stat = z_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1    0.95

Note the similarities between this plot and the previous one.

One categorical (>2 level) - GoF

Observed stat

Note the need to add in the hypothesized values here to compute the observed statistic.

# Observed statistic: chi-square goodness-of-fit statistic for origin.
# The hypothesized proportions must be supplied through hypothesize()
# before the statistic can be calculated.
Chisq_hat <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)
  ) %>%
  calculate(stat = "Chisq")
Chisq_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  7.01

# Null distribution: 1000 simulated replicates under the same hypothesized
# proportions.
null_distn <- fli_small %>%
  specify(response = origin) %>%
  hypothesize(
    null = "point",
    p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)
  ) %>%
  generate(reps = 1000, type = "simulate") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the upper-tail p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")

# Upper-tail p-value of the observed chi-square statistic.
get_p_value(null_distn, obs_stat = Chisq_hat, direction = "greater")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.037

Two categorical (>2 level) variables

Observed stat

# Observed statistic: chi-square statistic for day_hour versus origin.
Chisq_hat <- fli_small %>%
  specify(formula = day_hour ~ origin) %>%
  calculate(stat = "Chisq")
Chisq_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.528

# Null distribution: 1000 permutation replicates under independence.
null_distn <- fli_small %>%
  specify(day_hour ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")

# Plot the null distribution and shade the upper-tail p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = Chisq_hat, direction = "greater")

# Upper-tail p-value of the observed chi-square statistic.
get_p_value(null_distn, obs_stat = Chisq_hat, direction = "greater")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1    0.77

One numerical variable, one categorical (2 levels) (diff in means)

Observed stat

# Observed statistic: difference in mean dep_delay between seasons, with
# the levels ordered as c("summer", "winter").
d_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1     3

# Null distribution: 1000 permutation replicates under independence.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")

# Two-sided p-value of the observed difference in means.
get_p_value(null_distn, obs_stat = d_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.338

One numerical variable, one categorical (2 levels) (t)

Standardized observed stat

# Standardized observed statistic: two-sample t statistic for dep_delay by
# season, with the levels ordered as c("summer", "winter").
t_hat <- fli_small %>%
  specify(dep_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.891

# Null distribution: 1000 permutation replicates of t under independence.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = t_hat, direction = "two_sided")

# Two-sided p-value of the observed t statistic.
get_p_value(null_distn, obs_stat = t_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1     0.4

Note the similarities between this plot and the previous one.

One numerical variable, one categorical (2 levels) (diff in medians)

Observed stat

# Observed statistic: difference in median dep_delay between seasons, with
# the levels ordered as c("summer", "winter").
( d_hat <- fli_small %>% 
  specify(dep_delay ~ season) %>% 
  calculate(stat = "diff in medians", order = c("summer", "winter")) )

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1     1

# Null distribution: 1000 permutation replicates of the difference in
# medians under independence of dep_delay and season.
null_distn <- fli_small %>%
  specify(dep_delay ~ season) %>% # alt: response = dep_delay, 
  # explanatory = season
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))

# Plot the null distribution and shade the two-sided p-value region.
visualize(null_distn) +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")

# Two-sided p-value of the observed difference in medians.
null_distn %>%
  get_p_value(obs_stat = d_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1    0.64

One numerical, one categorical (>2 levels) - ANOVA

Observed stat

# Observed statistic: ANOVA F statistic for arr_delay across origin.
F_hat <- fli_small %>%
  specify(arr_delay ~ origin) %>%
  calculate(stat = "F")
F_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.686

# Null distribution: 1000 permutation replicates of F under independence.
null_distn <- fli_small %>%
  specify(arr_delay ~ origin) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")

# Plot the null distribution and shade the upper-tail p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = F_hat, direction = "greater")

# Upper-tail p-value of the observed F statistic.
get_p_value(null_distn, obs_stat = F_hat, direction = "greater")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1   0.529

Two numerical vars - SLR

Observed stat

# Observed statistic: slope for arr_delay as a function of dep_delay.
slope_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")
slope_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.992

# Null distribution: 1000 permutation replicates of the slope under
# independence.
null_distn <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = slope_hat, direction = "two_sided")

# Two-sided p-value of the observed slope.
get_p_value(null_distn, obs_stat = slope_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

Two numerical vars - correlation

Observed stat

# Observed statistic: correlation between arr_delay and dep_delay.
correlation_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.895

# Null distribution: 1000 permutation replicates of the correlation under
# independence.
null_distn <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")

# Plot the null distribution and shade the two-sided p-value region.
null_distn %>%
  visualize() +
  shade_p_value(obs_stat = correlation_hat, direction = "two_sided")

# Two-sided p-value of the observed correlation.
get_p_value(null_distn, obs_stat = correlation_hat, direction = "two_sided")

## # A tibble: 1 x 1
##   p_value
##     <dbl>
## 1       0

Two numerical vars - SLR (t)

Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.

Confidence intervals

One numerical (one mean)

Point estimate

# Point estimate: the sample mean of arr_delay.
x_bar <- fli_small %>%
  specify(response = arr_delay) %>%
  calculate(stat = "mean")
x_bar

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  6.15

# Bootstrap distribution of the mean from 1000 resamples.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1   2.61    9.60

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed mean as the
# point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = x_bar)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1  2.61  9.70

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One numerical (one mean - standardized)

Point estimate

# Point estimate: the one-sample t statistic of arr_delay.
t_hat <- fli_small %>%
  specify(response = arr_delay) %>%
  calculate(stat = "t")
t_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  3.30

# Bootstrap distribution of the t statistic from 1000 resamples.
boot <- fli_small %>%
  specify(response = arr_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1   1.62    4.88

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed t statistic as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = t_hat)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1  1.70  4.90

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One categorical (one proportion)

Point estimate

# Point estimate: the sample proportion of "morning" flights.
p_hat <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  calculate(stat = "prop")
p_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.452

# Bootstrap distribution of the proportion from 1000 resamples.
boot <- fli_small %>%
  specify(response = day_hour, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.406   0.496

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed proportion as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = p_hat)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.408 0.496

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One categorical variable (standardized proportion \(z\))

Not yet implemented.

One numerical variable, one categorical (2 levels) (diff in means)

Point estimate

# Point estimate: difference in mean arr_delay between seasons, with the
# levels ordered as c("summer", "winter").
d_hat <- fli_small %>%
  specify(arr_delay ~ season) %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
d_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  5.63

# Bootstrap distribution of the difference in means from 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  -2.03    12.5

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed difference as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = d_hat)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 -1.61  12.9

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

One numerical variable, one categorical (2 levels) (t)

Standardized point estimate

# Standardized point estimate: two-sample t statistic for arr_delay by
# season, with the levels ordered as c("summer", "winter").
t_hat <- fli_small %>%
  specify(arr_delay ~ season) %>%
  calculate(stat = "t", order = c("summer", "winter"))
t_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1  1.51

# Bootstrap distribution of the t statistic from 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ season) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t", order = c("summer", "winter"))

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1 -0.359    3.74

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed t statistic as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = t_hat)
standard_error_ci

## # A tibble: 1 x 2
##    lower upper
##    <dbl> <dbl>
## 1 -0.578  3.60

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two categorical variables (diff in proportions)

Point estimate

# Point estimate: difference in the proportion of "morning" flights
# between seasons, with the levels ordered as c("summer", "winter").
d_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
d_hat

## # A tibble: 1 x 1
##       stat
##      <dbl>
## 1 -0.00438

# Bootstrap distribution of the difference in proportions from 1000
# resamples.
boot <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##    `2.5%` `97.5%`
##     <dbl>   <dbl>
## 1 -0.0957  0.0818

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed difference as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = d_hat)
standard_error_ci

## # A tibble: 1 x 2
##     lower  upper
##     <dbl>  <dbl>
## 1 -0.0914 0.0826

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two categorical variables (z)

Standardized point estimate

# Standardized point estimate: z statistic for the difference in "morning"
# proportions, with the levels ordered as c("summer", "winter").
z_hat <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  calculate(stat = "z", order = c("summer", "winter"))
z_hat

## # A tibble: 1 x 1
##      stat
##     <dbl>
## 1 -0.0985

# Bootstrap distribution of the z statistic from 1000 resamples.
boot <- fli_small %>%
  specify(day_hour ~ season, success = "morning") %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "z", order = c("summer", "winter"))

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  -1.96    1.79

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed z statistic as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = z_hat)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 -2.04  1.85

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two numerical vars - SLR

Point estimate

# Point estimate: slope for arr_delay as a function of dep_delay.
slope_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "slope")
slope_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.992

# Bootstrap distribution of the slope from 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope")

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.946    1.03

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed slope as the
# point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = slope_hat)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.947  1.04

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two numerical vars - correlation

Point estimate

# Point estimate: correlation between arr_delay and dep_delay.
correlation_hat <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  calculate(stat = "correlation")
correlation_hat

## # A tibble: 1 x 1
##    stat
##   <dbl>
## 1 0.895

# Bootstrap distribution of the correlation from 1000 resamples.
boot <- fli_small %>%
  specify(arr_delay ~ dep_delay) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "correlation")

# Interval from get_ci() with its default settings (the 2.5% and 97.5%
# endpoints shown below).
percentile_ci <- boot %>%
  get_ci()
percentile_ci

## # A tibble: 1 x 2
##   `2.5%` `97.5%`
##    <dbl>   <dbl>
## 1  0.827   0.933

# Plot the bootstrap distribution with the percentile interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = percentile_ci)

# Interval requested with type = "se", using the observed correlation as
# the point estimate.
standard_error_ci <- boot %>%
  get_ci(type = "se", point_estimate = correlation_hat)
standard_error_ci

## # A tibble: 1 x 2
##   lower upper
##   <dbl> <dbl>
## 1 0.842 0.948

# Plot the bootstrap distribution with the standard-error interval shaded.
boot %>%
  visualize() +
  shade_confidence_interval(endpoints = standard_error_ci)

Two numerical vars - t

Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.

Full infer pipeline examples using `nycflights13` `flights` data

Chester Ismay

Updated on 2018-06-14

Data preparation

Hypothesis tests

One numerical variable (mean)

One numerical variable (standardized mean \(t\))

One numerical variable (median)

One categorical (one proportion)

One categorical variable (standardized proportion \(z\))

Two categorical (2 level) variables

Two categorical (2 level) variables (z)

One categorical (>2 level) - GoF

Two categorical (>2 level) variables

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

One numerical variable, one categorical (2 levels) (diff in medians)

One numerical, one categorical (>2 levels) - ANOVA

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - SLR (t)

Confidence intervals

One numerical (one mean)

One numerical (one mean - standardized)

One categorical (one proportion)

One categorical variable (standardized proportion \(z\))

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

Two categorical variables (diff in proportions)

Two categorical variables (z)

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - t

Contents

Full infer pipeline examples using nycflights13 flights data

Chester Ismay

Updated on 2018-06-14

Data preparation

Hypothesis tests

One numerical variable (mean)

One numerical variable (standardized mean \(t\))

One numerical variable (median)

One categorical (one proportion)

One categorical variable (standardized proportion \(z\))

Two categorical (2 level) variables

Two categorical (2 level) variables (z)

One categorical (>2 level) - GoF

Two categorical (>2 level) variables

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

One numerical variable, one categorical (2 levels) (diff in medians)

One numerical, one categorical (>2 levels) - ANOVA

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - SLR (t)

Confidence intervals

One numerical (one mean)

One numerical (one mean - standardized)

One categorical (one proportion)

One categorical variable (standardized proportion \(z\))

One numerical variable, one categorical (2 levels) (diff in means)

One numerical variable, one categorical (2 levels) (t)

Two categorical variables (diff in proportions)

Two categorical variables (z)

Two numerical vars - SLR

Two numerical vars - correlation

Two numerical vars - t

Contents

Full infer pipeline examples using `nycflights13` `flights` data