Practice transforming and exploring data

# load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

# Load the `scorecard` dataset that ships with the rcfss package.
# rcfss can be installed from GitHub; to do so, uncomment and run the
# three lines below once:
# install.packages("devtools")      # devtools provides install_github()
# library(devtools)
# install_github("uc-cfss/rcfss")   # install rcfss from GitHub
library(rcfss)
data("scorecard")
# print the tibble to preview its rows and columns
scorecard
## # A tibble: 1,849 × 12
##    unitid                                 name state                type
##     <int>                                <chr> <chr>               <chr>
## 1  450234      ITT Technical Institute-Wichita    KS Private, for-profit
## 2  448479 ITT Technical Institute-Swartz Creek    MI Private, for-profit
## 3  456427      ITT Technical Institute-Concord    CA Private, for-profit
## 4  459596  ITT Technical Institute-Tallahassee    FL Private, for-profit
## 5  459851        Herzing University-Brookfield    WI Private, for-profit
## 6  482477            DeVry University-Illinois    IL Private, for-profit
## 7  482547              DeVry University-Nevada    NV Private, for-profit
## 8  482592              DeVry University-Oregon    OR Private, for-profit
## 9  482617           DeVry University-Tennessee    TN Private, for-profit
## 10 482662          DeVry University-Washington    WA Private, for-profit
## # ... with 1,839 more rows, and 8 more variables: cost <int>,
## #   admrate <dbl>, satavg <dbl>, avgfacsal <dbl>, pctpell <dbl>,
## #   comprate <dbl>, firstgen <dbl>, debt <dbl>

Which were the 10 most expensive colleges in 2013?

# sort colleges by cost, highest first, with arrange() and desc();
# a tibble prints only its first 10 rows, so the rows shown are the
# 10 most expensive colleges
arrange(scorecard, desc(cost))
## # A tibble: 1,849 × 12
##    unitid                                        name state
##     <int>                                       <chr> <chr>
## 1  195304                      Sarah Lawrence College    NY
## 2  179867           Washington University in St Louis    MO
## 3  144050                       University of Chicago    IL
## 4  190150 Columbia University in the City of New York    NY
## 5  182670                           Dartmouth College    NH
## 6  130697                         Wesleyan University    CT
## 7  147767                     Northwestern University    IL
## 8  120254                          Occidental College    CA
## 9  115409                         Harvey Mudd College    CA
## 10 230816                          Bennington College    VT
## # ... with 1,839 more rows, and 9 more variables: type <chr>, cost <int>,
## #   admrate <dbl>, satavg <dbl>, avgfacsal <dbl>, pctpell <dbl>,
## #   comprate <dbl>, firstgen <dbl>, debt <dbl>

What percent of private, nonprofit schools are cheaper than the University of Chicago?

# keep only private, nonprofit schools, rank each school's cost within
# that group using percent_rank() (a value between 0 and 1), and then
# pull out the row for the University of Chicago
scorecard %>%
  filter(type == "Private, nonprofit") %>%
  select(name, type, cost) %>%
  mutate(cost_rank = percent_rank(cost)) %>%
  filter(name == "University of Chicago")
## # A tibble: 1 × 4
##                    name               type  cost cost_rank
##                   <chr>              <chr> <int>     <dbl>
## 1 University of Chicago Private, nonprofit 62425 0.9981464

Which type of college has the highest average SAT score?

# group colleges by type, then average the SAT score within each group;
# na.rm = TRUE drops colleges with a missing SAT average
summarize(
  group_by(scorecard, type),
  mean_sat = mean(satavg, na.rm = TRUE)
)
## # A tibble: 3 × 2
##                  type mean_sat
##                 <chr>    <dbl>
## 1 Private, for-profit 1002.500
## 2  Private, nonprofit 1075.287
## 3              Public 1037.410
# boxplot of average SAT score, one box per college type
ggplot(data = scorecard, mapping = aes(x = type, y = satavg)) +
  geom_boxplot()
## Warning: Removed 471 rows containing non-finite values (stat_boxplot).

# frequency polygon of average SAT score, one colored line per college
# type; y is scaled to density rather than raw counts
ggplot(data = scorecard,
       mapping = aes(x = satavg, y = ..density.., color = type)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 471 rows containing non-finite values (stat_bin).

# histogram of average SAT score, drawn in a separate panel per type
ggplot(data = scorecard, mapping = aes(x = satavg)) +
  geom_histogram() +
  facet_wrap(facets = ~ type)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 471 rows containing non-finite values (stat_bin).

What is the relationship between cost and faculty salaries?

# scatterplot of average faculty salary against cost, with a smoothing
# line layered on top of the points
ggplot(data = scorecard, mapping = aes(x = cost, y = avgfacsal)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).
## Warning: Removed 42 rows containing missing values (geom_point).

# same plot as before, but map college type to the color aesthetic so the
# points and smoothing lines are drawn separately for each type; alpha
# makes the points semi-transparent
ggplot(data = scorecard,
       mapping = aes(x = cost, y = avgfacsal, color = type)) +
  geom_point(alpha = 0.2) +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 42 rows containing non-finite values (stat_smooth).

## Warning: Removed 42 rows containing missing values (geom_point).

How does the percentage of Pell Grant recipients at a college affect the average student’s education debt?

# scatterplot of debt against the percentage of Pell Grant recipients,
# with a smoothing line layered on top of the points
ggplot(data = scorecard, mapping = aes(x = pctpell, y = debt)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 75 rows containing non-finite values (stat_smooth).
## Warning: Removed 75 rows containing missing values (geom_point).

Session Info

# print the R version, platform settings, and package versions used for
# this session so the analysis can be reproduced
devtools::session_info()
## Session info --------------------------------------------------------------
##  setting  value                       
##  version  R version 3.3.1 (2016-06-21)
##  system   x86_64, darwin13.4.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2016-10-04
## Packages ------------------------------------------------------------------
##  package    * version    date       source                            
##  assertthat   0.1        2013-12-06 CRAN (R 3.3.0)                    
##  codetools    0.2-14     2015-07-15 CRAN (R 3.3.1)                    
##  colorspace   1.2-6      2015-03-11 CRAN (R 3.3.0)                    
##  DBI          0.5-1      2016-09-10 CRAN (R 3.3.0)                    
##  devtools     1.12.0     2016-06-24 CRAN (R 3.3.0)                    
##  digest       0.6.10     2016-08-02 CRAN (R 3.3.0)                    
##  dplyr      * 0.5.0      2016-06-24 CRAN (R 3.3.0)                    
##  evaluate     0.9        2016-04-29 CRAN (R 3.3.0)                    
##  formatR      1.4        2016-05-09 CRAN (R 3.3.0)                    
##  ggplot2    * 2.1.0.9001 2016-10-01 Github (hadley/ggplot2@feb3ffd)   
##  gtable       0.2.0      2016-02-26 CRAN (R 3.3.0)                    
##  htmltools    0.3.5      2016-03-21 CRAN (R 3.3.0)                    
##  knitr        1.14       2016-08-13 CRAN (R 3.3.0)                    
##  labeling     0.3        2014-08-23 CRAN (R 3.3.0)                    
##  lattice      0.20-34    2016-09-06 CRAN (R 3.3.0)                    
##  lazyeval     0.2.0      2016-06-12 CRAN (R 3.3.0)                    
##  magrittr     1.5        2014-11-22 CRAN (R 3.3.0)                    
##  Matrix       1.2-7.1    2016-09-01 CRAN (R 3.3.0)                    
##  memoise      1.0.0      2016-01-29 CRAN (R 3.3.0)                    
##  mgcv         1.8-15     2016-09-14 CRAN (R 3.3.0)                    
##  munsell      0.4.3      2016-02-13 CRAN (R 3.3.0)                    
##  nlme         3.1-128    2016-05-10 CRAN (R 3.3.1)                    
##  plyr         1.8.4      2016-06-08 CRAN (R 3.3.0)                    
##  R6           2.1.3      2016-08-19 CRAN (R 3.3.0)                    
##  rcfss      * 0.1.0      2016-10-04 local                             
##  Rcpp         0.12.7     2016-09-05 cran (@0.12.7)                    
##  rmarkdown    1.0.9016   2016-10-02 Github (rstudio/rmarkdown@fe693c3)
##  scales       0.4.0      2016-02-26 CRAN (R 3.3.0)                    
##  stringi      1.1.1      2016-05-27 CRAN (R 3.3.0)                    
##  stringr      1.1.0      2016-08-19 cran (@1.1.0)                     
##  tibble       1.2        2016-08-26 cran (@1.2)                       
##  withr        1.0.2      2016-06-20 CRAN (R 3.3.0)                    
##  yaml         2.1.13     2014-06-12 CRAN (R 3.3.0)