MACS 30500
University of Chicago
\[y = \beta_0 + \beta_1 * x\]
ggplot
lm()
sim1_mod <- lm(y ~ x, data = sim1)
coef(sim1_mod)
## (Intercept) x
## 4.220822 2.051533
grid <- sim1 %>%
data_grid(x)
grid
## # A tibble: 10 × 1
## x
## <int>
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9
## 10 10
grid <- grid %>%
add_predictions(sim1_mod)
grid
## # A tibble: 10 × 2
## x pred
## <int> <dbl>
## 1 1 6.272355
## 2 2 8.323888
## 3 3 10.375421
## 4 4 12.426954
## 5 5 14.478487
## 6 6 16.530020
## 7 7 18.581553
## 8 8 20.633087
## 9 9 22.684620
## 10 10 24.736153
ggplot(sim1, aes(x)) +
geom_point(aes(y = y)) +
geom_line(aes(y = pred), data = grid, color = "red", size = 1)
sim1 <- sim1 %>%
add_residuals(sim1_mod)
sim1
## # A tibble: 30 × 3
## x y resid
## <int> <dbl> <dbl>
## 1 1 4.199913 -2.072442018
## 2 1 7.510634 1.238279125
## 3 1 2.125473 -4.146882207
## 4 2 8.988857 0.664969362
## 5 2 10.243105 1.919217378
## 6 2 11.296823 2.972935148
## 7 3 7.356365 -3.019056466
## 8 3 10.505349 0.129928252
## 9 3 10.511601 0.136179642
## 10 4 12.434589 0.007634878
## # ... with 20 more rows
ggplot(sim1, aes(resid)) +
geom_freqpoly(binwidth = 0.5)
gapminder
library(gapminder)
gapminder
## # A tibble: 1,704 × 6
## country continent year lifeExp pop gdpPercap
## <fctr> <fctr> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414
## # ... with 1,694 more rows
gapminder_mod <- lm(lifeExp ~ year, data = gapminder)
summary(gapminder_mod)
##
## Call:
## lm(formula = lifeExp ~ year, data = gapminder)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.949 -9.651 1.697 10.335 22.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -585.65219 32.31396 -18.12 <2e-16 ***
## year 0.32590 0.01632 19.96 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.63 on 1702 degrees of freedom
## Multiple R-squared: 0.1898, Adjusted R-squared: 0.1893
## F-statistic: 398.6 on 1 and 1702 DF, p-value: < 2.2e-16
grid <- gapminder %>%
data_grid(year, country)
grid <- grid %>%
add_predictions(gapminder_mod)
ggplot(gapminder, aes(year, group = country)) +
geom_line(aes(y = lifeExp), alpha = .2) +
geom_line(aes(y = pred), data = grid, color = "red", size = 1)
str(gapminder_mod)
## List of 12
## $ coefficients : Named num [1:2] -585.652 0.326
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "year"
## $ residuals : Named num [1:1704] -21.7 -21.8 -21.8 -21.4 -20.9 ...
## ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
## $ effects : Named num [1:1704] -2455.1 232.2 -20.8 -20.5 -20.2 ...
## ..- attr(*, "names")= chr [1:1704] "(Intercept)" "year" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:1704] 50.5 52.1 53.8 55.4 57 ...
## ..- attr(*, "names")= chr [1:1704] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:1704, 1:2] -41.2795 0.0242 0.0242 0.0242 0.0242 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:1704] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "year"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.02 1.03
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 1702
## $ xlevels : Named list()
## $ call : language lm(formula = lifeExp ~ year, data = gapminder)
## $ terms :Classes 'terms', 'formula' language lifeExp ~ year
## .. ..- attr(*, "variables")= language list(lifeExp, year)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "lifeExp" "year"
## .. .. .. ..$ : chr "year"
## .. ..- attr(*, "term.labels")= chr "year"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(lifeExp, year)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
## $ model :'data.frame': 1704 obs. of 2 variables:
## ..$ lifeExp: num [1:1704] 28.8 30.3 32 34 36.1 ...
## ..$ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language lifeExp ~ year
## .. .. ..- attr(*, "variables")= language list(lifeExp, year)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "lifeExp" "year"
## .. .. .. .. ..$ : chr "year"
## .. .. ..- attr(*, "term.labels")= chr "year"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(lifeExp, year)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "lifeExp" "year"
## - attr(*, "class")= chr "lm"
tidy
tidy(gapminder_mod)
## term estimate std.error statistic p.value
## 1 (Intercept) -585.6521874 32.31396452 -18.12381 2.897807e-67
## 2 year 0.3259038 0.01632369 19.96509 7.546795e-80
tidy(gapminder_mod) %>%
str()
## 'data.frame': 2 obs. of 5 variables:
## $ term : chr "(Intercept)" "year"
## $ estimate : num -585.652 0.326
## $ std.error: num 32.314 0.0163
## $ statistic: num -18.1 20
## $ p.value : num 2.90e-67 7.55e-80
augment
augment(gapminder_mod) %>%
tbl_df()
## # A tibble: 1,704 × 9
## lifeExp year .fitted .se.fit .resid .hat .sigma
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 28.801 1952 50.51208 0.5299963 -21.71108 0.0020765619 11.62203
## 2 30.332 1957 52.14160 0.4629044 -21.80960 0.0015840967 11.62193
## 3 31.997 1962 53.77112 0.4012330 -21.77412 0.0011901244 11.62197
## 4 34.020 1967 55.40064 0.3478771 -21.38064 0.0008946453 11.62241
## 5 36.088 1972 57.03016 0.3072006 -20.94216 0.0006976591 11.62288
## 6 38.438 1977 58.65968 0.2846912 -20.22168 0.0005991661 11.62363
## 7 39.854 1982 60.28920 0.2846912 -20.43520 0.0005991661 11.62341
## 8 40.822 1987 61.91872 0.3072006 -21.09672 0.0006976591 11.62271
## 9 41.674 1992 63.54824 0.3478771 -21.87424 0.0008946453 11.62187
## 10 41.763 1997 65.17776 0.4012330 -23.41476 0.0011901244 11.62010
## # ... with 1,694 more rows, and 2 more variables: .cooksd <dbl>,
## # .std.resid <dbl>
glance
glance(gapminder_mod)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.1897571 0.1892811 11.63055 398.6047 7.546795e-80 2 -6597.866
## AIC BIC deviance df.residual
## 1 13201.73 13218.05 230229.2 1702
by_country <- gapminder %>%
group_by(country, continent) %>%
nest()
by_country
## # A tibble: 142 × 3
## country continent data
## <fctr> <fctr> <list>
## 1 Afghanistan Asia <tibble [12 × 4]>
## 2 Albania Europe <tibble [12 × 4]>
## 3 Algeria Africa <tibble [12 × 4]>
## 4 Angola Africa <tibble [12 × 4]>
## 5 Argentina Americas <tibble [12 × 4]>
## 6 Australia Oceania <tibble [12 × 4]>
## 7 Austria Europe <tibble [12 × 4]>
## 8 Bahrain Asia <tibble [12 × 4]>
## 9 Bangladesh Asia <tibble [12 × 4]>
## 10 Belgium Europe <tibble [12 × 4]>
## # ... with 132 more rows
by_country$data[[1]]
## # A tibble: 12 × 4
## year lifeExp pop gdpPercap
## <int> <dbl> <int> <dbl>
## 1 1952 28.801 8425333 779.4453
## 2 1957 30.332 9240934 820.8530
## 3 1962 31.997 10267083 853.1007
## 4 1967 34.020 11537966 836.1971
## 5 1972 36.088 13079460 739.9811
## 6 1977 38.438 14880372 786.1134
## 7 1982 39.854 12881816 978.0114
## 8 1987 40.822 13867957 852.3959
## 9 1992 41.674 16317921 649.3414
## 10 1997 41.763 22227415 635.3414
## 11 2002 42.129 25268405 726.7341
## 12 2007 43.828 31889923 974.5803
country_model <- function(df) {
lm(lifeExp ~ year, data = df)
}
by_country <- by_country %>%
mutate(model = map(data, country_model))
by_country
## # A tibble: 142 × 4
## country continent data model
## <fctr> <fctr> <list> <list>
## 1 Afghanistan Asia <tibble [12 × 4]> <S3: lm>
## 2 Albania Europe <tibble [12 × 4]> <S3: lm>
## 3 Algeria Africa <tibble [12 × 4]> <S3: lm>
## 4 Angola Africa <tibble [12 × 4]> <S3: lm>
## 5 Argentina Americas <tibble [12 × 4]> <S3: lm>
## 6 Australia Oceania <tibble [12 × 4]> <S3: lm>
## 7 Austria Europe <tibble [12 × 4]> <S3: lm>
## 8 Bahrain Asia <tibble [12 × 4]> <S3: lm>
## 9 Bangladesh Asia <tibble [12 × 4]> <S3: lm>
## 10 Belgium Europe <tibble [12 × 4]> <S3: lm>
## # ... with 132 more rows
by_country %>%
filter(continent == "Europe")
## # A tibble: 30 × 4
## country continent data model
## <fctr> <fctr> <list> <list>
## 1 Albania Europe <tibble [12 × 4]> <S3: lm>
## 2 Austria Europe <tibble [12 × 4]> <S3: lm>
## 3 Belgium Europe <tibble [12 × 4]> <S3: lm>
## 4 Bosnia and Herzegovina Europe <tibble [12 × 4]> <S3: lm>
## 5 Bulgaria Europe <tibble [12 × 4]> <S3: lm>
## 6 Croatia Europe <tibble [12 × 4]> <S3: lm>
## 7 Czech Republic Europe <tibble [12 × 4]> <S3: lm>
## 8 Denmark Europe <tibble [12 × 4]> <S3: lm>
## 9 Finland Europe <tibble [12 × 4]> <S3: lm>
## 10 France Europe <tibble [12 × 4]> <S3: lm>
## # ... with 20 more rows
by_country <- by_country %>%
mutate(
resids = map2(data, model, add_residuals)
)
by_country
## # A tibble: 142 × 5
## country continent data model resids
## <fctr> <fctr> <list> <list> <list>
## 1 Afghanistan Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 2 Albania Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 3 Algeria Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 4 Angola Africa <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 5 Argentina Americas <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 6 Australia Oceania <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 7 Austria Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 8 Bahrain Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 9 Bangladesh Asia <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## 10 Belgium Europe <tibble [12 × 4]> <S3: lm> <tibble [12 × 5]>
## # ... with 132 more rows
resids <- unnest(by_country, resids)
resids
## # A tibble: 1,704 × 7
## country continent year lifeExp pop gdpPercap resid
## <fctr> <fctr> <int> <dbl> <int> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453 -1.10629487
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530 -0.95193823
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007 -0.66358159
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971 -0.01722494
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811 0.67413170
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134 1.64748834
## 7 Afghanistan Asia 1982 39.854 12881816 978.0114 1.68684499
## 8 Afghanistan Asia 1987 40.822 13867957 852.3959 1.27820163
## 9 Afghanistan Asia 1992 41.674 16317921 649.3414 0.75355828
## 10 Afghanistan Asia 1997 41.763 22227415 635.3414 -0.53408508
## # ... with 1,694 more rows
resids %>%
ggplot(aes(year, resid)) +
geom_line(aes(group = country), alpha = 1 / 3) +
geom_smooth(se = FALSE)
scorecard