MACS 30500
University of Chicago
diamonds
## # A tibble: 53,940 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## # ... with 53,930 more rows
data("diamonds")
diamonds_ideal <- filter(diamonds, cut == "Ideal")
summarize(diamonds_ideal, avg_price = mean(price))
## # A tibble: 1 × 1
## avg_price
## <dbl>
## 1 3457.542
cut
data("diamonds")
diamonds_cut <- group_by(diamonds, cut)
summarize(diamonds_cut, avg_price = mean(price))
## # A tibble: 5 × 2
## cut avg_price
## <ord> <dbl>
## 1 Fair 4358.758
## 2 Good 3928.864
## 3 Very Good 3981.760
## 4 Premium 4584.258
## 5 Ideal 3457.542
diamonds
as the inputdiamonds
to only keep observations where the color is rated as “I”diamonds
data frame by cutdiamonds
data frame by calculating the average carat size and pricedata("diamonds")
diamonds_i <- filter(diamonds, color == "I")
diamonds_i_group <- group_by(diamonds_i, cut)
summarize(
diamonds_i_group,
carat = mean(carat),
price = mean(price)
)
## # A tibble: 5 × 3
## cut carat price
## <ord> <dbl> <dbl>
## 1 Fair 1.1980571 4685.446
## 2 Good 1.0572222 5078.533
## 3 Very Good 1.0469518 5255.880
## 4 Premium 1.1449370 5946.181
## 5 Ideal 0.9130291 4451.970
dplyr
dplyr
function() |
Action performed |
---|---|
filter() |
Subsets observations based on their values |
arrange() |
Changes the order of observations based on their values |
select() |
Selects a subset of columns from the data frame |
rename() |
Changes the name of columns in the data frame |
mutate() |
Creates new columns (or variables) |
group_by() |
Changes the unit of analysis from the complete dataset to individual groups |
summarize() |
Collapses the data frame to a smaller number of rows which summarize the larger data |
The holy grail: “For consistency, aim to use British (rather than American) spelling.” #rstats http://t.co/7qQSWIowcl. Colour is right!
— Hadley Wickham (@hadleywickham) November 27, 2013
summarize()
= summarise()
color()
= colour()
<-
)# printed, but not saved
filter(diamonds, cut == "Ideal")
## # A tibble: 21,551 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.30 Ideal I SI2 61.0 59 405 4.30 4.33 2.63
## # ... with 21,541 more rows
# saved, but not printed
diamonds_ideal <- filter(diamonds, cut == "Ideal")
# saved and printed
(diamonds_ideal <- filter(diamonds, cut == "Ideal"))
## # A tibble: 21,551 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 3 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 4 0.30 Ideal I SI2 62.0 54 348 4.31 4.34 2.68
## 5 0.33 Ideal I SI2 61.8 55 403 4.49 4.51 2.78
## 6 0.33 Ideal I SI2 61.2 56 403 4.49 4.50 2.75
## 7 0.33 Ideal J SI1 61.1 56 403 4.49 4.55 2.76
## 8 0.23 Ideal G VS1 61.9 54 404 3.93 3.95 2.44
## 9 0.32 Ideal I SI1 60.9 55 404 4.45 4.48 2.72
## 10 0.30 Ideal I SI2 61.0 59 405 4.30 4.33 2.63
## # ... with 21,541 more rows
Symbol | Action |
---|---|
> |
greater than |
>= |
greater than or equal to |
< |
less than |
<= |
less than or equal to |
!= |
not equal |
== |
equal |
NA > 5
## [1] NA
10 == NA
## [1] NA
NA + 10
## [1] NA
na.rm
argumentdf <- tibble(x = c(1, NA, 3))
filter(df, x > 1)
## # A tibble: 1 × 1
## x
## <dbl>
## 1 3
filter(df, is.na(x) | x > 1)
## # A tibble: 2 × 1
## x
## <dbl>
## 1 NA
## 2 3
df <- tibble(
x = c(1, 2, 3, 5, NA)
)
summarize(df, meanx = mean(x))
## # A tibble: 1 × 1
## meanx
## <dbl>
## 1 NA
summarize(df, meanx = mean(x, na.rm = TRUE))
## # A tibble: 1 × 1
## meanx
## <dbl>
## 1 2.75
+
, -
, *
, /
, ^
log()
, log2()
, log10()
summarize(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.63907
by_day <- group_by(flights, year, month, day)
summarize(by_day, delay = mean(dep_delay, na.rm = TRUE))
## Source: local data frame [365 x 4]
## Groups: year, month [?]
##
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.548926
## 2 2013 1 2 13.858824
## 3 2013 1 3 10.987832
## 4 2013 1 4 8.951595
## 5 2013 1 5 5.732218
## 6 2013 1 6 7.148014
## 7 2013 1 7 5.417204
## 8 2013 1 8 2.553073
## 9 2013 1 9 2.276477
## 10 2013 1 10 2.844995
## # ... with 355 more rows
mean()
median()
sd()
IQR()
mad()
min()
max()
quantile()
n()
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
%>%
)# No pipes - confusing
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
# Pipes - clear
delays <- flights %>%
group_by(dest) %>%
summarize(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))
diamonds %>%
count(cut)
## # A tibble: 5 × 2
## cut n
## <ord> <int>
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = carat))
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = carat), binwidth = 0.5)
ggplot(data = diamonds) +
geom_histogram(mapping = aes(x = carat), binwidth = 0.1)
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5)
ggplot(diamonds) +
geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
coord_cartesian(ylim = c(0, 50))
diamonds %>%
filter(y < 3 | y > 20) %>%
arrange(y)
## # A tibble: 9 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 1.00 Very Good H VS2 63.3 53 5139 0.00 0.0 0.00
## 2 1.14 Fair G VS1 57.5 67 6381 0.00 0.0 0.00
## 3 1.56 Ideal G VS2 62.2 54 12800 0.00 0.0 0.00
## 4 1.20 Premium D VVS1 62.1 59 15686 0.00 0.0 0.00
## 5 2.25 Premium H SI2 62.8 59 18034 0.00 0.0 0.00
## 6 0.71 Good F SI2 64.1 60 2130 0.00 0.0 0.00
## 7 0.71 Good F SI2 64.1 60 2130 0.00 0.0 0.00
## 8 0.51 Ideal E VS1 61.8 55 2075 5.15 31.8 5.12
## 9 2.00 Premium H SI2 58.9 57 12210 8.09 58.9 8.06
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()
ggplot(data = diamonds) +
geom_count(mapping = aes(x = cut, y = color))
diamonds %>%
count(color, cut) %>%
ggplot(mapping = aes(x = color, y = cut)) +
geom_tile(mapping = aes(fill = n))
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = .2)
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
geom_point() +
geom_smooth()
ggplot(data = diamonds) +
geom_bin2d(mapping = aes(x = carat, y = price))