Benjamin Soltoff
MACS 30500 - Computing for the Social Sciences
University of Chicago
# load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# get dataset from rcfss library
# install.packages("devtools") # install devtools, which is needed to
# install packages hosted on GitHub
# library(devtools)
# install_github("uc-cfss/rcfss") # install package from Github
library(rcfss)
# load the scorecard dataset into the workspace, then print it
# (a tibble prints only its first rows and the columns that fit)
data(scorecard)
scorecard
## # A tibble: 1,849 × 12
## unitid name state type
## <int> <chr> <chr> <chr>
## 1 450234 ITT Technical Institute-Wichita KS Private, for-profit
## 2 448479 ITT Technical Institute-Swartz Creek MI Private, for-profit
## 3 456427 ITT Technical Institute-Concord CA Private, for-profit
## 4 459596 ITT Technical Institute-Tallahassee FL Private, for-profit
## 5 459851 Herzing University-Brookfield WI Private, for-profit
## 6 482477 DeVry University-Illinois IL Private, for-profit
## 7 482547 DeVry University-Nevada NV Private, for-profit
## 8 482592 DeVry University-Oregon OR Private, for-profit
## 9 482617 DeVry University-Tennessee TN Private, for-profit
## 10 482662 DeVry University-Washington WA Private, for-profit
## # ... with 1,839 more rows, and 8 more variables: cost <int>,
## # admrate <dbl>, satavg <dbl>, avgfacsal <dbl>, pctpell <dbl>,
## # comprate <dbl>, firstgen <dbl>, debt <dbl>
# sort the rows by cost in descending order with arrange() and desc()
arrange(scorecard, desc(cost))
## # A tibble: 1,849 × 12
## unitid name state
## <int> <chr> <chr>
## 1 195304 Sarah Lawrence College NY
## 2 179867 Washington University in St Louis MO
## 3 144050 University of Chicago IL
## 4 190150 Columbia University in the City of New York NY
## 5 182670 Dartmouth College NH
## 6 130697 Wesleyan University CT
## 7 147767 Northwestern University IL
## 8 120254 Occidental College CA
## 9 115409 Harvey Mudd College CA
## 10 230816 Bennington College VT
## # ... with 1,839 more rows, and 9 more variables: type <chr>, cost <int>,
## # admrate <dbl>, satavg <dbl>, avgfacsal <dbl>, pctpell <dbl>,
## # comprate <dbl>, firstgen <dbl>, debt <dbl>
# restrict to private nonprofit schools, rank each one's cost with
# percent_rank() (a rank rescaled to [0, 1]), then keep only the
# University of Chicago row
scorecard %>%
  filter(type == "Private, nonprofit") %>%
  select(name, type, cost) %>%
  mutate(cost_rank = percent_rank(cost)) %>%
  filter(name == "University of Chicago")
## # A tibble: 1 × 4
## name type cost cost_rank
## <chr> <chr> <int> <dbl>
## 1 University of Chicago Private, nonprofit 62425 0.9981464
# group_by() + summarise(): mean of satavg within each college type,
# dropping missing values before averaging
scorecard %>%
  group_by(type) %>%
  summarise(mean_sat = mean(satavg, na.rm = TRUE))
## # A tibble: 3 × 2
## type mean_sat
## <chr> <dbl>
## 1 Private, for-profit 1002.500
## 2 Private, nonprofit 1075.287
## 3 Public 1037.410
# boxplot of satavg, one box per college type
ggplot(data = scorecard, mapping = aes(x = type, y = satavg)) +
  geom_boxplot()
## Warning: Removed 471 rows containing non-finite values (stat_boxplot).
# frequency polygons of satavg, one colored line per college type;
# y = ..density.. plots each line as a density rather than a raw count
ggplot(data = scorecard,
       mapping = aes(x = satavg, y = ..density.., color = type)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 471 rows containing non-finite values (stat_bin).
# histogram of satavg, drawn in a separate panel for each college type
ggplot(data = scorecard, mapping = aes(x = satavg)) +
  geom_histogram() +
  facet_wrap(~ type)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 471 rows containing non-finite values (stat_bin).
# scatterplot of avgfacsal against cost with a smoothing line on top
ggplot(data = scorecard, mapping = aes(x = cost, y = avgfacsal)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).
# same plot as before, but map college type to the color aesthetic so each
# type gets its own points and smoothing line; alpha makes the points
# semi-transparent
ggplot(data = scorecard,
       mapping = aes(x = cost, y = avgfacsal, color = type)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).
# scatterplot of debt against pctpell with a smoothing line on top
ggplot(data = scorecard, mapping = aes(x = pctpell, y = debt)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 75 rows containing non-finite values (stat_smooth).
## Warning: Removed 75 rows containing missing values (geom_point).
# record the R version, platform settings, and package versions used to
# produce the results above, for reproducibility
devtools::session_info()
## Session info --------------------------------------------------------------
## setting value
## version R version 3.3.1 (2016-06-21)
## system x86_64, darwin13.4.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz America/Chicago
## date 2016-10-04
## Packages ------------------------------------------------------------------
## package * version date source
## assertthat 0.1 2013-12-06 CRAN (R 3.3.0)
## codetools 0.2-14 2015-07-15 CRAN (R 3.3.1)
## colorspace 1.2-6 2015-03-11 CRAN (R 3.3.0)
## DBI 0.5-1 2016-09-10 CRAN (R 3.3.0)
## devtools 1.12.0 2016-06-24 CRAN (R 3.3.0)
## digest 0.6.10 2016-08-02 CRAN (R 3.3.0)
## dplyr * 0.5.0 2016-06-24 CRAN (R 3.3.0)
## evaluate 0.9 2016-04-29 CRAN (R 3.3.0)
## formatR 1.4 2016-05-09 CRAN (R 3.3.0)
## ggplot2 * 2.1.0.9001 2016-10-01 Github (hadley/ggplot2@feb3ffd)
## gtable 0.2.0 2016-02-26 CRAN (R 3.3.0)
## htmltools 0.3.5 2016-03-21 CRAN (R 3.3.0)
## knitr 1.14 2016-08-13 CRAN (R 3.3.0)
## labeling 0.3 2014-08-23 CRAN (R 3.3.0)
## lattice 0.20-34 2016-09-06 CRAN (R 3.3.0)
## lazyeval 0.2.0 2016-06-12 CRAN (R 3.3.0)
## magrittr 1.5 2014-11-22 CRAN (R 3.3.0)
## Matrix 1.2-7.1 2016-09-01 CRAN (R 3.3.0)
## memoise 1.0.0 2016-01-29 CRAN (R 3.3.0)
## mgcv 1.8-15 2016-09-14 CRAN (R 3.3.0)
## munsell 0.4.3 2016-02-13 CRAN (R 3.3.0)
## nlme 3.1-128 2016-05-10 CRAN (R 3.3.1)
## plyr 1.8.4 2016-06-08 CRAN (R 3.3.0)
## R6 2.1.3 2016-08-19 CRAN (R 3.3.0)
## rcfss * 0.1.0 2016-10-04 local
## Rcpp 0.12.7 2016-09-05 cran (@0.12.7)
## rmarkdown 1.0.9016 2016-10-02 Github (rstudio/rmarkdown@fe693c3)
## scales 0.4.0 2016-02-26 CRAN (R 3.3.0)
## stringi 1.1.1 2016-05-27 CRAN (R 3.3.0)
## stringr 1.1.0 2016-08-19 cran (@1.1.0)
## tibble 1.2 2016-08-26 cran (@1.2)
## withr 1.0.2 2016-06-20 CRAN (R 3.3.0)
## yaml 2.1.13 2014-06-12 CRAN (R 3.3.0)