Model building

MACS 30500
University of Chicago

October 17, 2016

Statistical models

Statistical models

Linear models

\[y = \beta_0 + \beta_1 * x\]

Least squares

Using ggplot

lm()

sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)
## (Intercept)           x 
##    4.220822    2.051533

Drawing the line

grid <- sim1 %>% 
  data_grid(x) 
grid
## # A tibble: 10 × 1
##        x
##    <int>
## 1      1
## 2      2
## 3      3
## 4      4
## 5      5
## 6      6
## 7      7
## 8      8
## 9      9
## 10    10
grid <- grid %>% 
  add_predictions(sim1_mod) 
grid
## # A tibble: 10 × 2
##        x      pred
##    <int>     <dbl>
## 1      1  6.272355
## 2      2  8.323888
## 3      3 10.375421
## 4      4 12.426954
## 5      5 14.478487
## 6      6 16.530020
## 7      7 18.581553
## 8      8 20.633087
## 9      9 22.684620
## 10    10 24.736153
ggplot(sim1, aes(x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = pred), data = grid, color = "red", size = 1)

Residuals

sim1 <- sim1 %>% 
  add_residuals(sim1_mod)
sim1
## # A tibble: 30 × 3
##        x         y        resid
##    <int>     <dbl>        <dbl>
## 1      1  4.199913 -2.072442018
## 2      1  7.510634  1.238279125
## 3      1  2.125473 -4.146882207
## 4      2  8.988857  0.664969362
## 5      2 10.243105  1.919217378
## 6      2 11.296823  2.972935148
## 7      3  7.356365 -3.019056466
## 8      3 10.505349  0.129928252
## 9      3 10.511601  0.136179642
## 10     4 12.434589  0.007634878
## # ... with 20 more rows
ggplot(sim1, aes(resid)) + 
  geom_freqpoly(binwidth = 0.5)

Other families of models

  • Stepwise regression
  • Generalized linear models (GLM)
  • Generalized additive models (GAM)
  • Decision trees/random forests
  • Neural networks
  • Support vector machines

gapminder

library(gapminder)
gapminder
## # A tibble: 1,704 × 6
##        country continent  year lifeExp      pop gdpPercap
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414
## # ... with 1,694 more rows

Life expectancy over time

Life expectancy over time

gapminder_mod <- lm(lifeExp ~ year, data = gapminder)
summary(gapminder_mod)
## 
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.949  -9.651   1.697  10.335  22.158 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -585.65219   32.31396  -18.12   <2e-16 ***
## year           0.32590    0.01632   19.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared:  0.1898, Adjusted R-squared:  0.1893 
## F-statistic: 398.6 on 1 and 1702 DF,  p-value: < 2.2e-16
grid <- gapminder %>% 
  data_grid(year, country) 

grid <- grid %>% 
  add_predictions(gapminder_mod) 

ggplot(gapminder, aes(year, group = country)) +
  geom_line(aes(y = lifeExp), alpha = .2) +
  geom_line(aes(y = pred), data = grid, color = "red", size = 1)

Extracting model statistics

str(gapminder_mod)
## List of 12
##  $ coefficients : Named num [1:2] -585.652 0.326
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "year"
##  $ residuals    : Named num [1:1704] -21.7 -21.8 -21.8 -21.4 -20.9 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:1704] -2455.1 232.2 -20.8 -20.5 -20.2 ...
##   ..- attr(*, "names")= chr [1:1704] "(Intercept)" "year" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:1704] 50.5 52.1 53.8 55.4 57 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:1704, 1:2] -41.2795 0.0242 0.0242 0.0242 0.0242 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:1704] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "year"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.02 1.03
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 1702
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = lifeExp ~ year, data = gapminder)
##  $ terms        :Classes 'terms', 'formula'  language lifeExp ~ year
##   .. ..- attr(*, "variables")= language list(lifeExp, year)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "lifeExp" "year"
##   .. .. .. ..$ : chr "year"
##   .. ..- attr(*, "term.labels")= chr "year"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(lifeExp, year)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
##  $ model        :'data.frame':   1704 obs. of  2 variables:
##   ..$ lifeExp: num [1:1704] 28.8 30.3 32 34 36.1 ...
##   ..$ year   : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language lifeExp ~ year
##   .. .. ..- attr(*, "variables")= language list(lifeExp, year)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "lifeExp" "year"
##   .. .. .. .. ..$ : chr "year"
##   .. .. ..- attr(*, "term.labels")= chr "year"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(lifeExp, year)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
##  - attr(*, "class")= chr "lm"

tidy

tidy(gapminder_mod)
##          term     estimate   std.error statistic      p.value
## 1 (Intercept) -585.6521874 32.31396452 -18.12381 2.897807e-67
## 2        year    0.3259038  0.01632369  19.96509 7.546795e-80
tidy(gapminder_mod) %>%
  str()
## 'data.frame':    2 obs. of  5 variables:
##  $ term     : chr  "(Intercept)" "year"
##  $ estimate : num  -585.652 0.326
##  $ std.error: num  32.314 0.0163
##  $ statistic: num  -18.1 20
##  $ p.value  : num  2.90e-67 7.55e-80

augment

augment(gapminder_mod) %>%
  tbl_df()
## # A tibble: 1,704 × 9
##    lifeExp  year  .fitted   .se.fit    .resid         .hat   .sigma
##      <dbl> <int>    <dbl>     <dbl>     <dbl>        <dbl>    <dbl>
## 1   28.801  1952 50.51208 0.5299963 -21.71108 0.0020765619 11.62203
## 2   30.332  1957 52.14160 0.4629044 -21.80960 0.0015840967 11.62193
## 3   31.997  1962 53.77112 0.4012330 -21.77412 0.0011901244 11.62197
## 4   34.020  1967 55.40064 0.3478771 -21.38064 0.0008946453 11.62241
## 5   36.088  1972 57.03016 0.3072006 -20.94216 0.0006976591 11.62288
## 6   38.438  1977 58.65968 0.2846912 -20.22168 0.0005991661 11.62363
## 7   39.854  1982 60.28920 0.2846912 -20.43520 0.0005991661 11.62341
## 8   40.822  1987 61.91872 0.3072006 -21.09672 0.0006976591 11.62271
## 9   41.674  1992 63.54824 0.3478771 -21.87424 0.0008946453 11.62187
## 10  41.763  1997 65.17776 0.4012330 -23.41476 0.0011901244 11.62010
## # ... with 1,694 more rows, and 2 more variables: .cooksd <dbl>,
## #   .std.resid <dbl>

glance

glance(gapminder_mod)
##   r.squared adj.r.squared    sigma statistic      p.value df    logLik
## 1 0.1897571     0.1892811 11.63055  398.6047 7.546795e-80  2 -6597.866
##        AIC      BIC deviance df.residual
## 1 13201.73 13218.05 230229.2        1702

Separate model

Separate models for each country

by_country <- gapminder %>% 
  group_by(country, continent) %>% 
  nest()

by_country
## # A tibble: 142 × 3
##        country continent              data
##         <fctr>    <fctr>            <list>
## 1  Afghanistan      Asia <tibble [12 × 4]>
## 2      Albania    Europe <tibble [12 × 4]>
## 3      Algeria    Africa <tibble [12 × 4]>
## 4       Angola    Africa <tibble [12 × 4]>
## 5    Argentina  Americas <tibble [12 × 4]>
## 6    Australia   Oceania <tibble [12 × 4]>
## 7      Austria    Europe <tibble [12 × 4]>
## 8      Bahrain      Asia <tibble [12 × 4]>
## 9   Bangladesh      Asia <tibble [12 × 4]>
## 10     Belgium    Europe <tibble [12 × 4]>
## # ... with 132 more rows

Separate models for each country

by_country$data[[1]]
## # A tibble: 12 × 4
##     year lifeExp      pop gdpPercap
##    <int>   <dbl>    <int>     <dbl>
## 1   1952  28.801  8425333  779.4453
## 2   1957  30.332  9240934  820.8530
## 3   1962  31.997 10267083  853.1007
## 4   1967  34.020 11537966  836.1971
## 5   1972  36.088 13079460  739.9811
## 6   1977  38.438 14880372  786.1134
## 7   1982  39.854 12881816  978.0114
## 8   1987  40.822 13867957  852.3959
## 9   1992  41.674 16317921  649.3414
## 10  1997  41.763 22227415  635.3414
## 11  2002  42.129 25268405  726.7341
## 12  2007  43.828 31889923  974.5803

Separate models for each country

country_model <- function(df) {
  lm(lifeExp ~ year, data = df)
}

by_country <- by_country %>%
  mutate(model = map(data, country_model))
by_country
## # A tibble: 142 × 4
##        country continent              data    model
##         <fctr>    <fctr>            <list>   <list>
## 1  Afghanistan      Asia <tibble [12 × 4]> <S3: lm>
## 2      Albania    Europe <tibble [12 × 4]> <S3: lm>
## 3      Algeria    Africa <tibble [12 × 4]> <S3: lm>
## 4       Angola    Africa <tibble [12 × 4]> <S3: lm>
## 5    Argentina  Americas <tibble [12 × 4]> <S3: lm>
## 6    Australia   Oceania <tibble [12 × 4]> <S3: lm>
## 7      Austria    Europe <tibble [12 × 4]> <S3: lm>
## 8      Bahrain      Asia <tibble [12 × 4]> <S3: lm>
## 9   Bangladesh      Asia <tibble [12 × 4]> <S3: lm>
## 10     Belgium    Europe <tibble [12 × 4]> <S3: lm>
## # ... with 132 more rows

Separate models for each country

by_country %>% 
  filter(continent == "Europe")
## # A tibble: 30 × 4
##                   country continent              data    model
##                    <fctr>    <fctr>            <list>   <list>
## 1                 Albania    Europe <tibble [12 × 4]> <S3: lm>
## 2                 Austria    Europe <tibble [12 × 4]> <S3: lm>
## 3                 Belgium    Europe <tibble [12 × 4]> <S3: lm>
## 4  Bosnia and Herzegovina    Europe <tibble [12 × 4]> <S3: lm>
## 5                Bulgaria    Europe <tibble [12 × 4]> <S3: lm>
## 6                 Croatia    Europe <tibble [12 × 4]> <S3: lm>
## 7          Czech Republic    Europe <tibble [12 × 4]> <S3: lm>
## 8                 Denmark    Europe <tibble [12 × 4]> <S3: lm>
## 9                 Finland    Europe <tibble [12 × 4]> <S3: lm>
## 10                 France    Europe <tibble [12 × 4]> <S3: lm>
## # ... with 20 more rows

Unnesting

by_country <- by_country %>% 
  mutate(
    resids = map2(data, model, add_residuals)
  )
by_country
## # A tibble: 142 × 5
##        country continent              data    model            resids
##         <fctr>    <fctr>            <list>   <list>            <list>
## 1  Afghanistan      Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 2      Albania    Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 3      Algeria    Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 4       Angola    Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 5    Argentina  Americas <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 6    Australia   Oceania <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 7      Austria    Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 8      Bahrain      Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 9   Bangladesh      Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 10     Belgium    Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## # ... with 132 more rows

Residuals

resids <- unnest(by_country, resids)
resids
## # A tibble: 1,704 × 7
##        country continent  year lifeExp      pop gdpPercap       resid
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>       <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453 -1.10629487
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530 -0.95193823
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007 -0.66358159
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971 -0.01722494
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811  0.67413170
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134  1.64748834
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114  1.68684499
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959  1.27820163
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414  0.75355828
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414 -0.53408508
## # ... with 1,694 more rows
resids %>% 
  ggplot(aes(year, resid)) +
    geom_line(aes(group = country), alpha = 1 / 3) + 
    geom_smooth(se = FALSE)

Practice with scorecard

  • What is the relationship between admission rate and cost?
  • Estimate a linear regression model for admission rate and cost
  • Does the type of college make a difference?
  • Separate models for each type of college
  • Compare separate models’ residuals to original model