MACS 30500
University of Chicago
airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
airports
## # A tibble: 1,396 × 7
## faa name lat lon alt tz
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 04G Lansdowne Airport 41.13047 -80.61958 1044 -5
## 2 06A Moton Field Municipal Airport 32.46057 -85.68003 264 -5
## 3 06C Schaumburg Regional 41.98934 -88.10124 801 -6
## 4 06N Randall Airport 41.43191 -74.39156 523 -5
## 5 09J Jekyll Island Airport 31.07447 -81.42778 11 -4
## 6 0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -4
## 7 0G6 Williams County Airport 41.46731 -84.50678 730 -5
## 8 0G7 Finger Lakes Regional Airport 42.88356 -76.78123 492 -5
## 9 0P2 Shoestring Aviation Airfield 39.79482 -76.64719 1000 -5
## 10 0S9 Jefferson County Intl 48.05381 -122.81064 108 -8
## # ... with 1,386 more rows, and 1 more variables: dst <chr>
planes
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model
## <chr> <int> <chr> <chr> <chr>
## 1 N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR
## 2 N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 3 N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 4 N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 5 N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR
## 6 N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 7 N107US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 8 N108UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 9 N109UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## 10 N110UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214
## # ... with 3,312 more rows, and 4 more variables: engines <int>,
## # seats <int>, speed <int>, engine <chr>
weather
## # A tibble: 26,130 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <dbl> <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 0 37.04 21.92 53.97 230 10.35702
## 2 EWR 2013 1 1 1 37.04 21.92 53.97 230 13.80936
## 3 EWR 2013 1 1 2 37.94 21.92 52.09 230 12.65858
## 4 EWR 2013 1 1 3 37.94 23.00 54.51 230 13.80936
## 5 EWR 2013 1 1 4 37.94 24.08 57.04 240 14.96014
## 6 EWR 2013 1 1 6 39.02 26.06 59.37 270 10.35702
## 7 EWR 2013 1 1 7 39.02 26.96 61.63 250 8.05546
## 8 EWR 2013 1 1 8 39.02 28.04 64.43 240 11.50780
## 9 EWR 2013 1 1 9 39.92 28.04 62.21 250 12.65858
## 10 EWR 2013 1 1 10 39.02 28.04 64.43 260 12.65858
## # ... with 26,120 more rows, and 5 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
planes %>%
count(tailnum) %>%
filter(n > 1)
## # A tibble: 0 × 2
## # ... with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
## Source: local data frame [0 x 6]
## Groups: year, month, day, hour [0]
##
## # ... with 6 variables: year <dbl>, month <dbl>, day <int>, hour <int>,
## # origin <chr>, n <int>
flights %>%
count(year, month, day, flight) %>%
filter(n > 1)
## Source: local data frame [29,768 x 5]
## Groups: year, month, day [365]
##
## year month day flight n
## <int> <int> <int> <int> <int>
## 1 2013 1 1 1 2
## 2 2013 1 1 3 2
## 3 2013 1 1 4 2
## 4 2013 1 1 11 3
## 5 2013 1 1 15 2
## 6 2013 1 1 21 2
## 7 2013 1 1 27 4
## 8 2013 1 1 31 2
## 9 2013 1 1 32 2
## 10 2013 1 1 35 2
## # ... with 29,758 more rows
flights %>%
count(year, month, day, tailnum) %>%
filter(n > 1)
## Source: local data frame [64,928 x 5]
## Groups: year, month, day [365]
##
## year month day tailnum n
## <int> <int> <int> <chr> <int>
## 1 2013 1 1 N0EGMQ 2
## 2 2013 1 1 N11189 2
## 3 2013 1 1 N11536 2
## 4 2013 1 1 N11544 3
## 5 2013 1 1 N11551 2
## 6 2013 1 1 N12540 2
## 7 2013 1 1 N12567 2
## 8 2013 1 1 N13123 2
## 9 2013 1 1 N13538 3
## 10 2013 1 1 N13566 3
## # ... with 64,918 more rows
flights %>%
mutate(id = row_number()) %>%
count(row_number()) %>%
filter(n > 1)
## # A tibble: 0 × 2
## # ... with 2 variables: row_number() <int>, n <int>
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 × 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # ... with 336,766 more rows
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
## # A tibble: 336,776 × 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # ... with 336,766 more rows
x %>%
inner_join(y, by = "key")
## # A tibble: 2 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
## # A tibble: 4 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
flights2 %>%
left_join(weather)
## # A tibble: 336,776 × 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <dbl> <dbl> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA NA NA NA
## 2 2013 1 1 5 LGA IAH N24211 UA NA NA NA
## 3 2013 1 1 5 JFK MIA N619AA AA NA NA NA
## 4 2013 1 1 5 JFK BQN N804JB B6 NA NA NA
## 5 2013 1 1 6 LGA ATL N668DN DL 39.92 26.06 57.33
## 6 2013 1 1 5 EWR ORD N39463 UA NA NA NA
## 7 2013 1 1 6 EWR FLL N516JB B6 39.02 26.06 59.37
## 8 2013 1 1 6 LGA IAD N829AS EV 39.92 26.06 57.33
## 9 2013 1 1 6 JFK MCO N593JB B6 39.02 26.06 59.37
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.92 26.06 57.33
## # ... with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
flights2 %>%
left_join(planes, by = "tailnum")
## # A tibble: 336,776 × 16
## year.x month day hour origin dest tailnum carrier year.y
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999
## 2 2013 1 1 5 LGA IAH N24211 UA 1998
## 3 2013 1 1 5 JFK MIA N619AA AA 1990
## 4 2013 1 1 5 JFK BQN N804JB B6 2012
## 5 2013 1 1 6 LGA ATL N668DN DL 1991
## 6 2013 1 1 5 EWR ORD N39463 UA 2012
## 7 2013 1 1 6 EWR FLL N516JB B6 2000
## 8 2013 1 1 6 LGA IAD N829AS EV 1998
## 9 2013 1 1 6 JFK MCO N593JB B6 2004
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA
## # ... with 336,766 more rows, and 7 more variables: type <chr>,
## # manufacturer <chr>, model <chr>, engines <int>, seats <int>,
## # speed <int>, engine <chr>
flights2 %>%
left_join(airports, c("dest" = "faa"))
## # A tibble: 336,776 × 14
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # ... with 336,766 more rows, and 6 more variables: name <chr>, lat <dbl>,
## # lon <dbl>, alt <int>, tz <dbl>, dst <chr>
flights2 %>%
left_join(airports, c("origin" = "faa"))
## # A tibble: 336,776 × 14
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # ... with 336,766 more rows, and 6 more variables: name <chr>, lat <dbl>,
## # lon <dbl>, alt <int>, tz <dbl>, dst <chr>
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
## # A tibble: 10 × 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
flights %>%
semi_join(top_dest)
## # A tibble: 141,145 × 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 554 558 -4 740
## 2 2013 1 1 558 600 -2 753
## 3 2013 1 1 608 600 8 807
## 4 2013 1 1 629 630 -1 824
## 5 2013 1 1 656 700 -4 854
## 6 2013 1 1 709 700 9 852
## 7 2013 1 1 715 713 2 911
## 8 2013 1 1 739 745 -6 918
## 9 2013 1 1 749 710 39 939
## 10 2013 1 1 828 830 -2 1027
## # ... with 141,135 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 × 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # ... with 712 more rows
superheroes_kable <- knitr::kable(superheroes)
publishers_kable <- knitr::kable(publishers)
superheroes_kable
name | alignment | gender | publisher |
---|---|---|---|
Magneto | bad | male | Marvel |
Storm | good | female | Marvel |
Mystique | bad | female | Marvel |
Batman | good | male | DC |
Joker | bad | male | DC |
Catwoman | bad | female | DC |
Hellboy | good | male | Dark Horse Comics |
publishers_kable
publisher | yr_founded |
---|---|
DC | 1934 |
Marvel | 1939 |
Image | 1992 |
(ijsp <- inner_join(superheroes, publishers))
## # A tibble: 6 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
(sjsp <- semi_join(superheroes, publishers))
## # A tibble: 6 × 4
## name alignment gender publisher
## <chr> <chr> <chr> <chr>
## 1 Batman good male DC
## 2 Joker bad male DC
## 3 Catwoman bad female DC
## 4 Magneto bad male Marvel
## 5 Storm good female Marvel
## 6 Mystique bad female Marvel
(ljsp <- left_join(superheroes, publishers))
## # A tibble: 7 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
(ajsp <- anti_join(superheroes, publishers))
## # A tibble: 1 × 4
## name alignment gender publisher
## <chr> <chr> <chr> <chr>
## 1 Hellboy good male Dark Horse Comics
(fjsp <- full_join(superheroes, publishers))
## # A tibble: 8 × 5
## name alignment gender publisher yr_founded
## <chr> <chr> <chr> <chr> <int>
## 1 Magneto bad male Marvel 1939
## 2 Storm good female Marvel 1939
## 3 Mystique bad female Marvel 1939
## 4 Batman good male DC 1934
## 5 Joker bad male DC 1934
## 6 Catwoman bad female DC 1934
## 7 Hellboy good male Dark Horse Comics NA
## 8 <NA> <NA> <NA> Image 1992
library(RSQLite)
my_db <- src_sqlite("my_db.sqlite3", create = T)
flights_sqlite <- copy_to(my_db, flights, temporary = FALSE, indexes = list(
c("year", "month", "day"), "carrier", "tailnum"))
flights_sqlite
## Source: query [?? x 19]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dbl>
select(flights_sqlite, year:day, dep_delay, arr_delay)
## Source: query [?? x 5]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # ... with more rows
filter(flights_sqlite, dep_delay > 240)
## Source: query [?? x 19]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 848 1835 853 1001
## 2 2013 1 1 1815 1325 290 2120
## 3 2013 1 1 1842 1422 260 1958
## 4 2013 1 1 2115 1700 255 2330
## 5 2013 1 1 2205 1720 285 46
## 6 2013 1 1 2343 1724 379 314
## 7 2013 1 2 1332 904 268 1616
## 8 2013 1 2 1412 838 334 1710
## 9 2013 1 2 1607 1030 337 2003
## 10 2013 1 2 2131 1512 379 2340
## # ... with more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dbl>
arrange(flights_sqlite, year, month, day)
## Source: query [?? x 19]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dbl>
mutate(flights_sqlite, speed = air_time / distance)
## Source: query [?? x 20]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## # ... with more rows, and 13 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dbl>, speed <dbl>
summarise(flights_sqlite, delay = mean(dep_time))
## Source: query [?? x 1]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## delay
## <dbl>
## 1 1349.11
select(flights_sqlite, year:day, dep_delay, arr_delay) %>%
explain()
SELECT `year` AS `year`, `month` AS `month`, `day` AS `day`, `dep_delay` AS `dep_delay`, `arr_delay` AS `arr_delay`
FROM `flights`
c1 <- filter(flights_sqlite, year == 2013, month == 1, day == 1)
c2 <- select(c1, year, month, day, carrier, dep_delay, air_time, distance)
c3 <- mutate(c2, speed = distance / air_time * 60)
c4 <- arrange(c3, year, month, day, carrier)
c4
## Source: query [?? x 8]
## Database: sqlite 3.8.6 [my_db.sqlite3]
##
## year month day carrier dep_delay air_time distance speed
## <int> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 9E 0 189 1029 326.6667
## 2 2013 1 1 9E -9 57 228 240.0000
## 3 2013 1 1 9E -3 68 301 265.5882
## 4 2013 1 1 9E -6 57 209 220.0000
## 5 2013 1 1 9E -8 66 264 240.0000
## 6 2013 1 1 9E 0 40 184 276.0000
## 7 2013 1 1 9E 6 146 740 304.1096
## 8 2013 1 1 9E 0 139 665 287.0504
## 9 2013 1 1 9E -8 150 765 306.0000
## 10 2013 1 1 9E -6 41 187 273.6585
## # ... with more rows
collect(c4)
## # A tibble: 842 × 8
## year month day carrier dep_delay air_time distance speed
## <int> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 9E 0 189 1029 326.6667
## 2 2013 1 1 9E -9 57 228 240.0000
## 3 2013 1 1 9E -3 68 301 265.5882
## 4 2013 1 1 9E -6 57 209 220.0000
## 5 2013 1 1 9E -8 66 264 240.0000
## 6 2013 1 1 9E 0 40 184 276.0000
## 7 2013 1 1 9E 6 146 740 304.1096
## 8 2013 1 1 9E 0 139 665 287.0504
## 9 2013 1 1 9E -8 150 765 306.0000
## 10 2013 1 1 9E -6 41 187 273.6585
## # ... with 832 more rows
dplyr
library(bigrquery)
taxi <- src_bigquery(project = "nyc-tlc",
dataset = "yellow",
billing = getOption("bigquery_id"))
taxi
## src: bigquery [nyc-tlc:yellow]
## tbls: trips, trips_2015_07, trips_2015_08, trips_2015_09, trips_2015_10,
## trips_2015_11, trips_2015_12, Trips_ByMonth_ByMedallion
How many trips per taken each month in yellow cabs?
SELECT
LEFT(STRING(pickup_datetime), 7) month,
COUNT(*) trips
FROM
[nyc-tlc:yellow.trips]
WHERE
YEAR(pickup_datetime) = 2014
GROUP BY
1
ORDER BY
1
system.time({
trips_by_month <- taxi %>%
tbl("trips") %>%
filter(year(pickup_datetime) == 2014) %>%
mutate(month = month(pickup_datetime)) %>%
count(month) %>%
arrange(month) %>%
collect()
})
## user system elapsed
## 0.053 0.001 1.352
trips_by_month
## # A tibble: 12 × 2
## month n
## * <int> <int>
## 1 1 13782492
## 2 2 13063791
## 3 3 15428127
## 4 4 14618759
## 5 5 14774041
## 6 6 13813029
## 7 7 13106365
## 8 8 12688877
## 9 9 13374016
## 10 10 14232487
## 11 11 13218216
## 12 12 13014161
What about the average speed per hour of day in yellow cabs?
system.time({
speed_per_hour <- taxi %>%
tbl("trips") %>%
mutate(hour = hour(pickup_datetime),
trip_duration = (dropoff_datetime - pickup_datetime) / 3600000000) %>%
mutate(speed = trip_distance / trip_duration) %>%
filter(fare_amount / trip_distance >= 2,
fare_amount / trip_distance <= 10) %>%
group_by(hour) %>%
summarize(speed = mean(speed)) %>%
arrange(hour) %>%
collect()
})
## user system elapsed
## 0.054 0.001 2.387
ggplot(speed_per_hour, aes(hour, speed)) +
geom_line() +
labs(title = "Average Speed of NYC Yellow Taxis",
x = "Hour of day",
y = "Average speed, in MPH")
Finally, what is the average speed by day of the week?
system.time({
speed_per_day <- taxi %>%
tbl("trips") %>%
mutate(hour = hour(pickup_datetime),
day = dayofweek(pickup_datetime),
trip_duration = (dropoff_datetime - pickup_datetime) / 3600000000) %>%
mutate(speed = trip_distance / trip_duration) %>%
filter(fare_amount / trip_distance >= 2,
fare_amount / trip_distance <= 10,
hour >= 8,
hour <= 18) %>%
group_by(day) %>%
summarize(speed = mean(speed)) %>%
arrange(day) %>%
collect()
})
## user system elapsed
## 0.056 0.002 2.320
speed_per_day
## # A tibble: 7 × 2
## day speed
## * <int> <dbl>
## 1 1 14.29703
## 2 2 12.21557
## 3 3 11.11933
## 4 4 10.93281
## 5 5 10.97011
## 6 6 11.24917
## 7 7 13.09473