library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(modelr)
library(broom)
## 
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
## 
##     bootstrap
library(rcfss)
options(na.action = na.warn)

set.seed(11091987)

Practice modeling with scorecard

What is the relationship between admission rate and cost?

With a graph

ggplot(scorecard, aes(admrate, cost)) +
  geom_point()
## Warning: Removed 21 rows containing missing values (geom_point).

Add a linear regression

ggplot(scorecard, aes(admrate, cost)) +
  geom_point() +
  geom_smooth(method = "lm")
## Warning: Removed 21 rows containing non-finite values (stat_smooth).
## Warning: Removed 21 rows containing missing values (geom_point).

Estimate a linear regression

scorecard_mod <- lm(cost ~ admrate, data = scorecard)
## Warning: Dropping 21 rows with missing values
tidy(scorecard_mod)
##          term  estimate std.error statistic       p.value
## 1 (Intercept)  43607.18  1001.390  43.54666 1.036472e-284
## 2     admrate -18169.23  1438.405 -12.63151  3.980318e-35

Is there a difference for type of college?

scorecard_resid <- scorecard %>% 
  add_residuals(scorecard_mod)
scorecard_resid
## # A tibble: 1,849 × 13
##    unitid                                 name state                type
##     <int>                                <chr> <chr>               <chr>
## 1  450234      ITT Technical Institute-Wichita    KS Private, for-profit
## 2  448479 ITT Technical Institute-Swartz Creek    MI Private, for-profit
## 3  456427      ITT Technical Institute-Concord    CA Private, for-profit
## 4  459596  ITT Technical Institute-Tallahassee    FL Private, for-profit
## 5  459851        Herzing University-Brookfield    WI Private, for-profit
## 6  482477            DeVry University-Illinois    IL Private, for-profit
## 7  482547              DeVry University-Nevada    NV Private, for-profit
## 8  482592              DeVry University-Oregon    OR Private, for-profit
## 9  482617           DeVry University-Tennessee    TN Private, for-profit
## 10 482662          DeVry University-Washington    WA Private, for-profit
## # ... with 1,839 more rows, and 9 more variables: cost <int>,
## #   admrate <dbl>, satavg <dbl>, avgfacsal <dbl>, pctpell <dbl>,
## #   comprate <dbl>, firstgen <dbl>, debt <dbl>, resid <dbl>
ggplot(scorecard_resid, aes(resid)) + 
  geom_freqpoly(aes(color = type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 21 rows containing non-finite values (stat_bin).

Separate models for each type of college

type_model <- function(df) {
  lm(cost ~ admrate, data = df)
}

by_type <- scorecard %>%
  group_by(type) %>%
  nest()
by_type
## # A tibble: 3 × 2
##                  type                  data
##                 <chr>                <list>
## 1 Private, for-profit   <tibble [216 × 11]>
## 2  Private, nonprofit <tibble [1,092 × 11]>
## 3              Public   <tibble [541 × 11]>
by_type <- by_type %>%
  mutate(model = map(data, type_model),
         resids = map2(data, model, add_residuals))
## Warning: Dropping 8 rows with missing values
## Warning: Dropping 12 rows with missing values
## Warning: Dropping 1 rows with missing values
by_type
## # A tibble: 3 × 4
##                  type                  data    model                resids
##                 <chr>                <list>   <list>                <list>
## 1 Private, for-profit   <tibble [216 × 11]> <S3: lm>   <tibble [216 × 12]>
## 2  Private, nonprofit <tibble [1,092 × 11]> <S3: lm> <tibble [1,092 × 12]>
## 3              Public   <tibble [541 × 11]> <S3: lm>   <tibble [541 × 12]>

Compare to original model

glance(scorecard_mod)
##    r.squared adj.r.squared    sigma statistic      p.value df    logLik
## 1 0.08035796    0.07985432 12146.48  159.5552 3.980318e-35  2 -19784.78
##        AIC     BIC     deviance df.residual
## 1 39575.57 39592.1 269402514670        1826
by_type %>%
  mutate(glance = map(model, glance)) %>%
  unnest(glance, .drop = TRUE)
## # A tibble: 3 × 12
##                  type    r.squared adj.r.squared     sigma   statistic
##                 <chr>        <dbl>         <dbl>     <dbl>       <dbl>
## 1 Private, for-profit 0.0488797330  0.0442626443  5189.999  10.5867001
## 2  Private, nonprofit 0.1181801128  0.1173620981 11200.479 144.4718626
## 3              Public 0.0009124439 -0.0009445961  4123.599   0.4913432
## # ... with 7 more variables: p.value <dbl>, df <int>, logLik <dbl>,
## #   AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>
resids <- unnest(by_type, resids)

ggplot(resids, aes(resid)) + 
  geom_freqpoly(aes(color = type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 21 rows containing non-finite values (stat_bin).

Our residuals are now centered around 0 for each type of college. Obviously we could go much deeper here, but this is a good starting point.

Session Info

devtools::session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.3.1 (2016-06-21)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2016-10-18
## Packages ------------------------------------------------------------------
##  package    * version    date       source                            
##  assertthat   0.1        2013-12-06 CRAN (R 3.3.0)                    
##  broom      * 0.4.1      2016-06-24 CRAN (R 3.3.0)                    
##  codetools    0.2-14     2015-07-15 CRAN (R 3.3.1)                    
##  colorspace   1.2-6      2015-03-11 CRAN (R 3.3.0)                    
##  DBI          0.5-1      2016-09-10 CRAN (R 3.3.0)                    
##  devtools     1.12.0     2016-06-24 CRAN (R 3.3.0)                    
##  digest       0.6.10     2016-08-02 CRAN (R 3.3.0)                    
##  dplyr      * 0.5.0      2016-06-24 CRAN (R 3.3.0)                    
##  evaluate     0.9        2016-04-29 CRAN (R 3.3.0)                    
##  foreign      0.8-67     2016-09-13 CRAN (R 3.3.0)                    
##  formatR      1.4        2016-05-09 CRAN (R 3.3.0)                    
##  ggplot2    * 2.1.0.9001 2016-10-01 Github (hadley/ggplot2@feb3ffd)   
##  gtable       0.2.0      2016-02-26 CRAN (R 3.3.0)                    
##  htmltools    0.3.5      2016-03-21 CRAN (R 3.3.0)                    
##  knitr        1.14       2016-08-13 CRAN (R 3.3.0)                    
##  labeling     0.3        2014-08-23 CRAN (R 3.3.0)                    
##  lattice      0.20-34    2016-09-06 CRAN (R 3.3.0)                    
##  lazyeval     0.2.0      2016-06-12 CRAN (R 3.3.0)                    
##  magrittr     1.5        2014-11-22 CRAN (R 3.3.0)                    
##  memoise      1.0.0      2016-01-29 CRAN (R 3.3.0)                    
##  mnormt       1.5-4      2016-03-09 CRAN (R 3.3.0)                    
##  modelr     * 0.1.0      2016-08-31 CRAN (R 3.3.0)                    
##  munsell      0.4.3      2016-02-13 CRAN (R 3.3.0)                    
##  nlme         3.1-128    2016-05-10 CRAN (R 3.3.1)                    
##  plyr         1.8.4      2016-06-08 CRAN (R 3.3.0)                    
##  psych        1.6.9      2016-09-17 cran (@1.6.9)                     
##  purrr      * 0.2.2      2016-06-18 CRAN (R 3.3.0)                    
##  R6           2.1.3      2016-08-19 CRAN (R 3.3.0)                    
##  rcfss      * 0.1.0      2016-10-06 local                             
##  Rcpp         0.12.7     2016-09-05 cran (@0.12.7)                    
##  readr      * 1.0.0      2016-08-03 CRAN (R 3.3.0)                    
##  reshape2     1.4.1      2014-12-06 CRAN (R 3.3.0)                    
##  rmarkdown    1.0.9016   2016-10-14 Github (rstudio/rmarkdown@33aef17)
##  scales       0.4.0      2016-02-26 CRAN (R 3.3.0)                    
##  stringi      1.1.1      2016-05-27 CRAN (R 3.3.0)                    
##  stringr      1.1.0      2016-08-19 cran (@1.1.0)                     
##  tibble     * 1.2        2016-08-26 cran (@1.2)                       
##  tidyr      * 0.6.0      2016-08-12 CRAN (R 3.3.0)                    
##  tidyverse  * 1.0.0      2016-09-09 CRAN (R 3.3.0)                    
##  withr        1.0.2      2016-06-20 CRAN (R 3.3.0)                    
##  yaml         2.1.13     2014-06-12 CRAN (R 3.3.0)

References