scorecard
Benjamin Soltoff
MACS 30500 - Computing for the Social Sciences
University of Chicago
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(modelr)
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
library(rcfss)
options(na.action = na.warn)
set.seed(11091987)
scorecard
ggplot(scorecard, aes(admrate, cost)) +
geom_point()
## Warning: Removed 21 rows containing missing values (geom_point).
ggplot(scorecard, aes(admrate, cost)) +
geom_point() +
geom_smooth(method = "lm")
## Warning: Removed 21 rows containing non-finite values (stat_smooth).
## Warning: Removed 21 rows containing missing values (geom_point).
scorecard_mod <- lm(cost ~ admrate, data = scorecard)
## Warning: Dropping 21 rows with missing values
tidy(scorecard_mod)
## term estimate std.error statistic p.value
## 1 (Intercept) 43607.18 1001.390 43.54666 1.036472e-284
## 2 admrate -18169.23 1438.405 -12.63151 3.980318e-35
scorecard_resid <- scorecard %>%
add_residuals(scorecard_mod)
scorecard_resid
## # A tibble: 1,849 × 13
## unitid name state type
## <int> <chr> <chr> <chr>
## 1 450234 ITT Technical Institute-Wichita KS Private, for-profit
## 2 448479 ITT Technical Institute-Swartz Creek MI Private, for-profit
## 3 456427 ITT Technical Institute-Concord CA Private, for-profit
## 4 459596 ITT Technical Institute-Tallahassee FL Private, for-profit
## 5 459851 Herzing University-Brookfield WI Private, for-profit
## 6 482477 DeVry University-Illinois IL Private, for-profit
## 7 482547 DeVry University-Nevada NV Private, for-profit
## 8 482592 DeVry University-Oregon OR Private, for-profit
## 9 482617 DeVry University-Tennessee TN Private, for-profit
## 10 482662 DeVry University-Washington WA Private, for-profit
## # ... with 1,839 more rows, and 9 more variables: cost <int>,
## # admrate <dbl>, satavg <dbl>, avgfacsal <dbl>, pctpell <dbl>,
## # comprate <dbl>, firstgen <dbl>, debt <dbl>, resid <dbl>
ggplot(scorecard_resid, aes(resid)) +
geom_freqpoly(aes(color = type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 21 rows containing non-finite values (stat_bin).
type_model <- function(df) {
lm(cost ~ admrate, data = df)
}
by_type <- scorecard %>%
group_by(type) %>%
nest()
by_type
## # A tibble: 3 × 2
## type data
## <chr> <list>
## 1 Private, for-profit <tibble [216 × 11]>
## 2 Private, nonprofit <tibble [1,092 × 11]>
## 3 Public <tibble [541 × 11]>
by_type <- by_type %>%
mutate(model = map(data, type_model),
resids = map2(data, model, add_residuals))
## Warning: Dropping 8 rows with missing values
## Warning: Dropping 12 rows with missing values
## Warning: Dropping 1 rows with missing values
by_type
## # A tibble: 3 × 4
## type data model resids
## <chr> <list> <list> <list>
## 1 Private, for-profit <tibble [216 × 11]> <S3: lm> <tibble [216 × 12]>
## 2 Private, nonprofit <tibble [1,092 × 11]> <S3: lm> <tibble [1,092 × 12]>
## 3 Public <tibble [541 × 11]> <S3: lm> <tibble [541 × 12]>
glance(scorecard_mod)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.08035796 0.07985432 12146.48 159.5552 3.980318e-35 2 -19784.78
## AIC BIC deviance df.residual
## 1 39575.57 39592.1 269402514670 1826
by_type %>%
mutate(glance = map(model, glance)) %>%
unnest(glance, .drop = TRUE)
## # A tibble: 3 × 12
## type r.squared adj.r.squared sigma statistic
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Private, for-profit 0.0488797330 0.0442626443 5189.999 10.5867001
## 2 Private, nonprofit 0.1181801128 0.1173620981 11200.479 144.4718626
## 3 Public 0.0009124439 -0.0009445961 4123.599 0.4913432
## # ... with 7 more variables: p.value <dbl>, df <int>, logLik <dbl>,
## # AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>
resids <- unnest(by_type, resids)
ggplot(resids, aes(resid)) +
geom_freqpoly(aes(color = type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 21 rows containing non-finite values (stat_bin).
Our residuals are now centered around 0 for each type of college. Obviously we could go much deeper here, but this is a good starting point.
devtools::session_info()
## Session info --------------------------------------------------------------
## setting value
## version R version 3.3.1 (2016-06-21)
## system x86_64, darwin13.4.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/Chicago
## date 2016-10-18
## Packages ------------------------------------------------------------------
## package * version date source
## assertthat 0.1 2013-12-06 CRAN (R 3.3.0)
## broom * 0.4.1 2016-06-24 CRAN (R 3.3.0)
## codetools 0.2-14 2015-07-15 CRAN (R 3.3.1)
## colorspace 1.2-6 2015-03-11 CRAN (R 3.3.0)
## DBI 0.5-1 2016-09-10 CRAN (R 3.3.0)
## devtools 1.12.0 2016-06-24 CRAN (R 3.3.0)
## digest 0.6.10 2016-08-02 CRAN (R 3.3.0)
## dplyr * 0.5.0 2016-06-24 CRAN (R 3.3.0)
## evaluate 0.9 2016-04-29 CRAN (R 3.3.0)
## foreign 0.8-67 2016-09-13 CRAN (R 3.3.0)
## formatR 1.4 2016-05-09 CRAN (R 3.3.0)
## ggplot2 * 2.1.0.9001 2016-10-01 Github (hadley/ggplot2@feb3ffd)
## gtable 0.2.0 2016-02-26 CRAN (R 3.3.0)
## htmltools 0.3.5 2016-03-21 CRAN (R 3.3.0)
## knitr 1.14 2016-08-13 CRAN (R 3.3.0)
## labeling 0.3 2014-08-23 CRAN (R 3.3.0)
## lattice 0.20-34 2016-09-06 CRAN (R 3.3.0)
## lazyeval 0.2.0 2016-06-12 CRAN (R 3.3.0)
## magrittr 1.5 2014-11-22 CRAN (R 3.3.0)
## memoise 1.0.0 2016-01-29 CRAN (R 3.3.0)
## mnormt 1.5-4 2016-03-09 CRAN (R 3.3.0)
## modelr * 0.1.0 2016-08-31 CRAN (R 3.3.0)
## munsell 0.4.3 2016-02-13 CRAN (R 3.3.0)
## nlme 3.1-128 2016-05-10 CRAN (R 3.3.1)
## plyr 1.8.4 2016-06-08 CRAN (R 3.3.0)
## psych 1.6.9 2016-09-17 cran (@1.6.9)
## purrr * 0.2.2 2016-06-18 CRAN (R 3.3.0)
## R6 2.1.3 2016-08-19 CRAN (R 3.3.0)
## rcfss * 0.1.0 2016-10-06 local
## Rcpp 0.12.7 2016-09-05 cran (@0.12.7)
## readr * 1.0.0 2016-08-03 CRAN (R 3.3.0)
## reshape2 1.4.1 2014-12-06 CRAN (R 3.3.0)
## rmarkdown 1.0.9016 2016-10-14 Github (rstudio/rmarkdown@33aef17)
## scales 0.4.0 2016-02-26 CRAN (R 3.3.0)
## stringi 1.1.1 2016-05-27 CRAN (R 3.3.0)
## stringr 1.1.0 2016-08-19 cran (@1.1.0)
## tibble * 1.2 2016-08-26 cran (@1.2)
## tidyr * 0.6.0 2016-08-12 CRAN (R 3.3.0)
## tidyverse * 1.0.0 2016-09-09 CRAN (R 3.3.0)
## withr 1.0.2 2016-06-20 CRAN (R 3.3.0)
## yaml 2.1.13 2014-06-12 CRAN (R 3.3.0)