Model building

MACS 30500
University of Chicago

October 17, 2016

Statistical models

Linear models

\[y = \beta_0 + \beta_1 * x\]

Least squares

Using `ggplot`

`lm()`

sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)

## (Intercept)           x 
##    4.220822    2.051533

Drawing the line

grid <- sim1 %>% 
  data_grid(x) 
grid

## # A tibble: 10 × 1
##        x
##    <int>
## 1      1
## 2      2
## 3      3
## 4      4
## 5      5
## 6      6
## 7      7
## 8      8
## 9      9
## 10    10

grid <- grid %>% 
  add_predictions(sim1_mod) 
grid

## # A tibble: 10 × 2
##        x      pred
##    <int>     <dbl>
## 1      1  6.272355
## 2      2  8.323888
## 3      3 10.375421
## 4      4 12.426954
## 5      5 14.478487
## 6      6 16.530020
## 7      7 18.581553
## 8      8 20.633087
## 9      9 22.684620
## 10    10 24.736153

ggplot(sim1, aes(x)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = pred), data = grid, color = "red", size = 1)

Residuals

sim1 <- sim1 %>% 
  add_residuals(sim1_mod)
sim1

## # A tibble: 30 × 3
##        x         y        resid
##    <int>     <dbl>        <dbl>
## 1      1  4.199913 -2.072442018
## 2      1  7.510634  1.238279125
## 3      1  2.125473 -4.146882207
## 4      2  8.988857  0.664969362
## 5      2 10.243105  1.919217378
## 6      2 11.296823  2.972935148
## 7      3  7.356365 -3.019056466
## 8      3 10.505349  0.129928252
## 9      3 10.511601  0.136179642
## 10     4 12.434589  0.007634878
## # ... with 20 more rows

ggplot(sim1, aes(resid)) + 
  geom_freqpoly(binwidth = 0.5)

Other families of models

Stepwise regression
Generalized linear models (GLM)
Generalized additive models (GAM)
Decision trees/random forests
Neural networks
Support vector machines

`gapminder`

library(gapminder)
gapminder

## # A tibble: 1,704 × 6
##        country continent  year lifeExp      pop gdpPercap
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414
## # ... with 1,694 more rows

Life expectancy over time

gapminder_mod <- lm(lifeExp ~ year, data = gapminder)
summary(gapminder_mod)

## 
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.949  -9.651   1.697  10.335  22.158 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -585.65219   32.31396  -18.12   <2e-16 ***
## year           0.32590    0.01632   19.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared:  0.1898, Adjusted R-squared:  0.1893 
## F-statistic: 398.6 on 1 and 1702 DF,  p-value: < 2.2e-16

grid <- gapminder %>% 
  data_grid(year, country) 

grid <- grid %>% 
  add_predictions(gapminder_mod) 

ggplot(gapminder, aes(year, group = country)) +
  geom_line(aes(y = lifeExp), alpha = .2) +
  geom_line(aes(y = pred), data = grid, color = "red", size = 1)

Extracting model statistics

str(gapminder_mod)

## List of 12
##  $ coefficients : Named num [1:2] -585.652 0.326
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "year"
##  $ residuals    : Named num [1:1704] -21.7 -21.8 -21.8 -21.4 -20.9 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:1704] -2455.1 232.2 -20.8 -20.5 -20.2 ...
##   ..- attr(*, "names")= chr [1:1704] "(Intercept)" "year" "" "" ...
##  $ rank         : int 2
##  $ fitted.values: Named num [1:1704] 50.5 52.1 53.8 55.4 57 ...
##   ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
##  $ assign       : int [1:2] 0 1
##  $ qr           :List of 5
##   ..$ qr   : num [1:1704, 1:2] -41.2795 0.0242 0.0242 0.0242 0.0242 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:1704] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:2] "(Intercept)" "year"
##   .. ..- attr(*, "assign")= int [1:2] 0 1
##   ..$ qraux: num [1:2] 1.02 1.03
##   ..$ pivot: int [1:2] 1 2
##   ..$ tol  : num 1e-07
##   ..$ rank : int 2
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 1702
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = lifeExp ~ year, data = gapminder)
##  $ terms        :Classes 'terms', 'formula'  language lifeExp ~ year
##   .. ..- attr(*, "variables")= language list(lifeExp, year)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "lifeExp" "year"
##   .. .. .. ..$ : chr "year"
##   .. ..- attr(*, "term.labels")= chr "year"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(lifeExp, year)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
##  $ model        :'data.frame':   1704 obs. of  2 variables:
##   ..$ lifeExp: num [1:1704] 28.8 30.3 32 34 36.1 ...
##   ..$ year   : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language lifeExp ~ year
##   .. .. ..- attr(*, "variables")= language list(lifeExp, year)
##   .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:2] "lifeExp" "year"
##   .. .. .. .. ..$ : chr "year"
##   .. .. ..- attr(*, "term.labels")= chr "year"
##   .. .. ..- attr(*, "order")= int 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(lifeExp, year)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
##  - attr(*, "class")= chr "lm"

`tidy`

tidy(gapminder_mod)

##          term     estimate   std.error statistic      p.value
## 1 (Intercept) -585.6521874 32.31396452 -18.12381 2.897807e-67
## 2        year    0.3259038  0.01632369  19.96509 7.546795e-80

tidy(gapminder_mod) %>%
  str()

## 'data.frame':    2 obs. of  5 variables:
##  $ term     : chr  "(Intercept)" "year"
##  $ estimate : num  -585.652 0.326
##  $ std.error: num  32.314 0.0163
##  $ statistic: num  -18.1 20
##  $ p.value  : num  2.90e-67 7.55e-80

`augment`

augment(gapminder_mod) %>%
  tbl_df()

## # A tibble: 1,704 × 9
##    lifeExp  year  .fitted   .se.fit    .resid         .hat   .sigma
##      <dbl> <int>    <dbl>     <dbl>     <dbl>        <dbl>    <dbl>
## 1   28.801  1952 50.51208 0.5299963 -21.71108 0.0020765619 11.62203
## 2   30.332  1957 52.14160 0.4629044 -21.80960 0.0015840967 11.62193
## 3   31.997  1962 53.77112 0.4012330 -21.77412 0.0011901244 11.62197
## 4   34.020  1967 55.40064 0.3478771 -21.38064 0.0008946453 11.62241
## 5   36.088  1972 57.03016 0.3072006 -20.94216 0.0006976591 11.62288
## 6   38.438  1977 58.65968 0.2846912 -20.22168 0.0005991661 11.62363
## 7   39.854  1982 60.28920 0.2846912 -20.43520 0.0005991661 11.62341
## 8   40.822  1987 61.91872 0.3072006 -21.09672 0.0006976591 11.62271
## 9   41.674  1992 63.54824 0.3478771 -21.87424 0.0008946453 11.62187
## 10  41.763  1997 65.17776 0.4012330 -23.41476 0.0011901244 11.62010
## # ... with 1,694 more rows, and 2 more variables: .cooksd <dbl>,
## #   .std.resid <dbl>

`glance`

glance(gapminder_mod)

##   r.squared adj.r.squared    sigma statistic      p.value df    logLik
## 1 0.1897571     0.1892811 11.63055  398.6047 7.546795e-80  2 -6597.866
##        AIC      BIC deviance df.residual
## 1 13201.73 13218.05 230229.2        1702

Separate model

Separate models for each country

by_country <- gapminder %>% 
  group_by(country, continent) %>% 
  nest()

by_country

## # A tibble: 142 × 3
##        country continent              data
##         <fctr>    <fctr>            <list>
## 1  Afghanistan      Asia <tibble [12 × 4]>
## 2      Albania    Europe <tibble [12 × 4]>
## 3      Algeria    Africa <tibble [12 × 4]>
## 4       Angola    Africa <tibble [12 × 4]>
## 5    Argentina  Americas <tibble [12 × 4]>
## 6    Australia   Oceania <tibble [12 × 4]>
## 7      Austria    Europe <tibble [12 × 4]>
## 8      Bahrain      Asia <tibble [12 × 4]>
## 9   Bangladesh      Asia <tibble [12 × 4]>
## 10     Belgium    Europe <tibble [12 × 4]>
## # ... with 132 more rows

Separate models for each country

by_country$data[[1]]

## # A tibble: 12 × 4
##     year lifeExp      pop gdpPercap
##    <int>   <dbl>    <int>     <dbl>
## 1   1952  28.801  8425333  779.4453
## 2   1957  30.332  9240934  820.8530
## 3   1962  31.997 10267083  853.1007
## 4   1967  34.020 11537966  836.1971
## 5   1972  36.088 13079460  739.9811
## 6   1977  38.438 14880372  786.1134
## 7   1982  39.854 12881816  978.0114
## 8   1987  40.822 13867957  852.3959
## 9   1992  41.674 16317921  649.3414
## 10  1997  41.763 22227415  635.3414
## 11  2002  42.129 25268405  726.7341
## 12  2007  43.828 31889923  974.5803

Separate models for each country

country_model <- function(df) {
  lm(lifeExp ~ year, data = df)
}

by_country <- by_country %>%
  mutate(model = map(data, country_model))
by_country

## # A tibble: 142 × 4
##        country continent              data    model
##         <fctr>    <fctr>            <list>   <list>
## 1  Afghanistan      Asia <tibble [12 × 4]> <S3: lm>
## 2      Albania    Europe <tibble [12 × 4]> <S3: lm>
## 3      Algeria    Africa <tibble [12 × 4]> <S3: lm>
## 4       Angola    Africa <tibble [12 × 4]> <S3: lm>
## 5    Argentina  Americas <tibble [12 × 4]> <S3: lm>
## 6    Australia   Oceania <tibble [12 × 4]> <S3: lm>
## 7      Austria    Europe <tibble [12 × 4]> <S3: lm>
## 8      Bahrain      Asia <tibble [12 × 4]> <S3: lm>
## 9   Bangladesh      Asia <tibble [12 × 4]> <S3: lm>
## 10     Belgium    Europe <tibble [12 × 4]> <S3: lm>
## # ... with 132 more rows

Separate models for each country

by_country %>% 
  filter(continent == "Europe")

## # A tibble: 30 × 4
##                   country continent              data    model
##                    <fctr>    <fctr>            <list>   <list>
## 1                 Albania    Europe <tibble [12 × 4]> <S3: lm>
## 2                 Austria    Europe <tibble [12 × 4]> <S3: lm>
## 3                 Belgium    Europe <tibble [12 × 4]> <S3: lm>
## 4  Bosnia and Herzegovina    Europe <tibble [12 × 4]> <S3: lm>
## 5                Bulgaria    Europe <tibble [12 × 4]> <S3: lm>
## 6                 Croatia    Europe <tibble [12 × 4]> <S3: lm>
## 7          Czech Republic    Europe <tibble [12 × 4]> <S3: lm>
## 8                 Denmark    Europe <tibble [12 × 4]> <S3: lm>
## 9                 Finland    Europe <tibble [12 × 4]> <S3: lm>
## 10                 France    Europe <tibble [12 × 4]> <S3: lm>
## # ... with 20 more rows

Unnesting

by_country <- by_country %>% 
  mutate(
    resids = map2(data, model, add_residuals)
  )
by_country

## # A tibble: 142 × 5
##        country continent              data    model            resids
##         <fctr>    <fctr>            <list>   <list>            <list>
## 1  Afghanistan      Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 2      Albania    Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 3      Algeria    Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 4       Angola    Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 5    Argentina  Americas <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 6    Australia   Oceania <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 7      Austria    Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 8      Bahrain      Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 9   Bangladesh      Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 10     Belgium    Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## # ... with 132 more rows

Residuals

resids <- unnest(by_country, resids)
resids

## # A tibble: 1,704 × 7
##        country continent  year lifeExp      pop gdpPercap       resid
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>       <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453 -1.10629487
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530 -0.95193823
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007 -0.66358159
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971 -0.01722494
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811  0.67413170
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134  1.64748834
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114  1.68684499
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959  1.27820163
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414  0.75355828
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414 -0.53408508
## # ... with 1,694 more rows

resids %>% 
  ggplot(aes(year, resid)) +
    geom_line(aes(group = country), alpha = 1 / 3) + 
    geom_smooth(se = FALSE)

Practice with `scorecard`

What is the relationship between admission rate and cost?
Estimate a linear regression model for admission rate and cost
Does the type of college make a difference?
Separate models for each type of college
Compare separate models’ residuals to original model

Model building

MACS 30500 University of Chicago

October 17, 2016

Statistical models

Statistical models

Linear models

Least squares

Using ggplot

lm()

Drawing the line

Residuals

Other families of models

gapminder

Life expectancy over time

Life expectancy over time

Extracting model statistics

tidy

augment

glance

Separate model

Separate models for each country

Separate models for each country

Separate models for each country

Separate models for each country

Unnesting

Residuals

Practice with scorecard

MACS 30500
University of Chicago

Using `ggplot`

`lm()`

`gapminder`

`tidy`

`augment`

`glance`

Practice with `scorecard`