Getting data from the web: scraping

MACS 30500
University of Chicago

November 2, 2016

OMDB

http://www.omdbapi.com/?t=Interstellar&y=2014&plot=short&r=xml

Better way

#' Build an OMDB API request URL
#'
#' Pairs each query parameter name (`t`, `y`, `plot`, `r`) with its value,
#' percent-encodes the values, and appends the resulting query string to the
#' OMDB base URL.
#'
#' @param Title Value sent as the `t` parameter (the movie title).
#' @param Year Value sent as the `y` parameter (the release year).
#' @param Plot Value sent as the `plot` parameter, e.g. `"short"`.
#' @param Format Value sent as the `r` parameter, e.g. `"xml"` or `"json"`.
#' @return A length-one character vector holding the request URL, or `NA` if
#'   any argument is `NA`.
#' @examples
#' omdb("Interstellar", "2014", "short", "xml")
omdb <- function(Title, Year, Plot, Format){
  baseurl <- "http://www.omdbapi.com/?"
  params <- c("t=", "y=", "plot=", "r=")
  values <- c(Title, Year, Plot, Format)
  # Each argument must supply exactly one value; otherwise names and values
  # would be paired up incorrectly.
  stopifnot(length(values) == length(params))
  # A missing value cannot form a usable request, so propagate NA.
  if (anyNA(values)) {
    return(NA_character_)
  }
  # Percent-encode every value (including reserved characters such as spaces,
  # "&" and "=") so that titles like "The Martian" yield a valid query string.
  encoded <- vapply(
    enc2utf8(as.character(values)),
    utils::URLencode,
    character(1),
    reserved = TRUE,
    USE.NAMES = FALSE
  )
  args <- paste0(params, encoded, collapse = "&")
  paste0(baseurl, args)
}

# Example: build the request URL for the XML record of "Interstellar" (2014).
omdb(Title = "Interstellar", Year = "2014", Plot = "short", Format = "xml")
## [1] "http://www.omdbapi.com/?t=Interstellar&y=2014&plot=short&r=xml"

Obtain OMDB data

# Build the request URL for the XML representation of the record.
request_interstellar <- omdb("Interstellar", "2014", "short", "xml")
con <- curl(request_interstellar)
# Read the response body line by line. `finally` closes the connection even
# when the read fails, so an error cannot leave it open.
answer_xml <- tryCatch(readLines(con), finally = close(con))
answer_xml
## [1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root response=\"True\"><movie title=\"Interstellar\" year=\"2014\" rated=\"PG-13\" released=\"07 Nov 2014\" runtime=\"169 min\" genre=\"Adventure, Drama, Sci-Fi\" director=\"Christopher Nolan\" writer=\"Jonathan Nolan, Christopher Nolan\" actors=\"Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow\" plot=\"A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.\" language=\"English\" country=\"USA, UK\" awards=\"Won 1 Oscar. Another 39 wins &amp; 134 nominations.\" poster=\"https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg\" metascore=\"74\" imdbRating=\"8.6\" imdbVotes=\"950,670\" imdbID=\"tt0816692\" type=\"movie\"/></root>"

Obtain OMDB data

# Build the request URL for the JSON representation of the record.
request_interstellar <- omdb("Interstellar", "2014", "short", "json")
con <- curl(request_interstellar)
# Read the response body line by line. `finally` closes the connection even
# when the read fails, so an error cannot leave it open.
answer_json <- tryCatch(readLines(con), finally = close(con))
# Re-indent the raw JSON text so it is readable when printed.
answer_json %>%
  prettify()
## {
##     "Title": "Interstellar",
##     "Year": "2014",
##     "Rated": "PG-13",
##     "Released": "07 Nov 2014",
##     "Runtime": "169 min",
##     "Genre": "Adventure, Drama, Sci-Fi",
##     "Director": "Christopher Nolan",
##     "Writer": "Jonathan Nolan, Christopher Nolan",
##     "Actors": "Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow",
##     "Plot": "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
##     "Language": "English",
##     "Country": "USA, UK",
##     "Awards": "Won 1 Oscar. Another 39 wins & 134 nominations.",
##     "Poster": "https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg",
##     "Metascore": "74",
##     "imdbRating": "8.6",
##     "imdbVotes": "950,670",
##     "imdbID": "tt0816692",
##     "Type": "movie",
##     "Response": "True"
## }
## 

JavaScript Object Notation

{
  "crust": "original",
  "toppings": ["cheese", "pepperoni", "garlic"],
  "status": "cooking",
  "customer": {
    "name": "Brian",
    "phone": "573-111-1111"
  }
}

eXtensible Markup Language

<order>
    <crust>original</crust>
    <toppings>
        <topping>cheese</topping>
        <topping>pepperoni</topping>
        <topping>garlic</topping>
    </toppings>
    <status>cooking</status>
</order>

Parsing JSON

# Convert the raw JSON text into a named R list (one element per field).
fromJSON(answer_json)
## $Title
## [1] "Interstellar"
## 
## $Year
## [1] "2014"
## 
## $Rated
## [1] "PG-13"
## 
## $Released
## [1] "07 Nov 2014"
## 
## $Runtime
## [1] "169 min"
## 
## $Genre
## [1] "Adventure, Drama, Sci-Fi"
## 
## $Director
## [1] "Christopher Nolan"
## 
## $Writer
## [1] "Jonathan Nolan, Christopher Nolan"
## 
## $Actors
## [1] "Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow"
## 
## $Plot
## [1] "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival."
## 
## $Language
## [1] "English"
## 
## $Country
## [1] "USA, UK"
## 
## $Awards
## [1] "Won 1 Oscar. Another 39 wins & 134 nominations."
## 
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg"
## 
## $Metascore
## [1] "74"
## 
## $imdbRating
## [1] "8.6"
## 
## $imdbVotes
## [1] "950,670"
## 
## $imdbID
## [1] "tt0816692"
## 
## $Type
## [1] "movie"
## 
## $Response
## [1] "True"
# Parse the JSON text into a named list, then render it as a one-row table.
# as_tibble() replaces tbl_df(), which dplyr has deprecated.
answer_json %>%
  fromJSON() %>%
  as_tibble() %>%
  kable()
Title Year Rated Released Runtime Genre Director Writer Actors Plot Language Country Awards Poster Metascore imdbRating imdbVotes imdbID Type Response
Interstellar 2014 PG-13 07 Nov 2014 169 min Adventure, Drama, Sci-Fi Christopher Nolan Jonathan Nolan, Christopher Nolan Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow A team of explorers travel through a wormhole in space in an attempt to ensure humanity’s survival. English USA, UK Won 1 Oscar. Another 39 wins & 134 nominations. https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg 74 8.6 950,670 tt0816692 movie True

Parsing XML

# Parse the downloaded XML text into a document object. asText = TRUE states
# outright that the input is XML content rather than a file name to open,
# instead of leaving that decision to xmlParse()'s content sniffing.
ans_xml_parsed <- xmlParse(answer_xml, asText = TRUE)
ans_xml_parsed
## <?xml version="1.0" encoding="UTF-8"?>
## <root response="True">
##   <movie title="Interstellar" year="2014" rated="PG-13" released="07 Nov 2014" runtime="169 min" genre="Adventure, Drama, Sci-Fi" director="Christopher Nolan" writer="Jonathan Nolan, Christopher Nolan" actors="Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow" plot="A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival." language="English" country="USA, UK" awards="Won 1 Oscar. Another 39 wins &amp; 134 nominations." poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg" metascore="74" imdbRating="8.6" imdbVotes="950,670" imdbID="tt0816692" type="movie"/>
## </root>
## 

Parsing XML

field method
name xmlName()
attributes xmlAttrs()
children xmlChildren()
value xmlValue()

Parsing XML

# Start at the document root and pull out its <movie> child by name
# (selecting by position with [[1]] works as well).
ans_xml_parsed_root <- ans_xml_parsed %>%
  xmlRoot() %>%
  .[["movie"]]
ans_xml_parsed_root
## <movie title="Interstellar" year="2014" rated="PG-13" released="07 Nov 2014" runtime="169 min" genre="Adventure, Drama, Sci-Fi" director="Christopher Nolan" writer="Jonathan Nolan, Christopher Nolan" actors="Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow" plot="A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival." language="English" country="USA, UK" awards="Won 1 Oscar. Another 39 wins &amp; 134 nominations." poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg" metascore="74" imdbRating="8.6" imdbVotes="950,670" imdbID="tt0816692" type="movie"/>
# Collect the attributes of the <movie> node as a named character vector.
ans_xml_attrs <- ans_xml_parsed_root %>%
  xmlAttrs()
ans_xml_attrs
##                                                                                                             title 
##                                                                                                    "Interstellar" 
##                                                                                                              year 
##                                                                                                            "2014" 
##                                                                                                             rated 
##                                                                                                           "PG-13" 
##                                                                                                          released 
##                                                                                                     "07 Nov 2014" 
##                                                                                                           runtime 
##                                                                                                         "169 min" 
##                                                                                                             genre 
##                                                                                        "Adventure, Drama, Sci-Fi" 
##                                                                                                          director 
##                                                                                               "Christopher Nolan" 
##                                                                                                            writer 
##                                                                               "Jonathan Nolan, Christopher Nolan" 
##                                                                                                            actors 
##                                                 "Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow" 
##                                                                                                              plot 
##             "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival." 
##                                                                                                          language 
##                                                                                                         "English" 
##                                                                                                           country 
##                                                                                                         "USA, UK" 
##                                                                                                            awards 
##                                                                 "Won 1 Oscar. Another 39 wins & 134 nominations." 
##                                                                                                            poster 
## "https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg" 
##                                                                                                         metascore 
##                                                                                                              "74" 
##                                                                                                        imdbRating 
##                                                                                                             "8.6" 
##                                                                                                         imdbVotes 
##                                                                                                         "950,670" 
##                                                                                                            imdbID 
##                                                                                                       "tt0816692" 
##                                                                                                              type 
##                                                                                                           "movie"
# Transpose the named attribute vector into a one-row matrix so that each
# attribute becomes a column, then render it as a table.
# as_tibble() replaces tbl_df(), which dplyr has deprecated.
ans_xml_attrs %>%
  t() %>%
  as_tibble() %>%
  kable()
title year rated released runtime genre director writer actors plot language country awards poster metascore imdbRating imdbVotes imdbID type
Interstellar 2014 PG-13 07 Nov 2014 169 min Adventure, Drama, Sci-Fi Christopher Nolan Jonathan Nolan, Christopher Nolan Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow A team of explorers travel through a wormhole in space in an attempt to ensure humanity’s survival. English USA, UK Won 1 Oscar. Another 39 wins & 134 nominations. https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg 74 8.6 950,670 tt0816692 movie

Introducing the easy way: httr

  • GET: fetch an existing resource
  • POST: create a new resource
  • PUT: update an existing resource
  • DELETE: delete an existing resource

Redo the OMDB example

# Same JSON request as before, this time sent with httr's GET().
interstellar_json <- omdb(
  Title = "Interstellar",
  Year = "2014",
  Plot = "short",
  Format = "json"
)
response_json <- GET(url = interstellar_json)
# Parse the response body, explicitly treating it as JSON.
response_json %>%
  content(as = "parsed", type = "application/json")
## $Title
## [1] "Interstellar"
## 
## $Year
## [1] "2014"
## 
## $Rated
## [1] "PG-13"
## 
## $Released
## [1] "07 Nov 2014"
## 
## $Runtime
## [1] "169 min"
## 
## $Genre
## [1] "Adventure, Drama, Sci-Fi"
## 
## $Director
## [1] "Christopher Nolan"
## 
## $Writer
## [1] "Jonathan Nolan, Christopher Nolan"
## 
## $Actors
## [1] "Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow"
## 
## $Plot
## [1] "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival."
## 
## $Language
## [1] "English"
## 
## $Country
## [1] "USA, UK"
## 
## $Awards
## [1] "Won 1 Oscar. Another 39 wins & 134 nominations."
## 
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg"
## 
## $Metascore
## [1] "74"
## 
## $imdbRating
## [1] "8.6"
## 
## $imdbVotes
## [1] "950,670"
## 
## $imdbID
## [1] "tt0816692"
## 
## $Type
## [1] "movie"
## 
## $Response
## [1] "True"

Redo the OMDB example

# Same XML request as before, this time sent with httr's GET().
interstellar_xml <- omdb(
  Title = "Interstellar",
  Year = "2014",
  Plot = "short",
  Format = "xml"
)
response_xml <- GET(url = interstellar_xml)
# Parse the response body into an XML document.
response_xml %>%
  content(as = "parsed")
## {xml_document}
## <root response="True">
## [1] <movie title="Interstellar" year="2014" rated="PG-13" released="07 N ...

Headers

# Show the HTTP headers that came back with the XML response.
headers(response_xml)
## $date
## [1] "Wed, 02 Nov 2016 17:54:22 GMT"
## 
## $`content-type`
## [1] "text/xml; charset=utf-8"
## 
## $`transfer-encoding`
## [1] "chunked"
## 
## $connection
## [1] "keep-alive"
## 
## $`cache-control`
## [1] "public, max-age=86400"
## 
## $expires
## [1] "Thu, 03 Nov 2016 17:54:22 GMT"
## 
## $`last-modified`
## [1] "Wed, 02 Nov 2016 17:54:21 GMT"
## 
## $vary
## [1] "Accept-Encoding"
## 
## $`x-aspnet-version`
## [1] "4.0.30319"
## 
## $`x-powered-by`
## [1] "ASP.NET"
## 
## $`access-control-allow-origin`
## [1] "*"
## 
## $`cf-cache-status`
## [1] "HIT"
## 
## $server
## [1] "cloudflare-nginx"
## 
## $`cf-ray`
## [1] "2fb97388d6d52579-ORD"
## 
## $`content-encoding`
## [1] "gzip"
## 
## attr(,"class")
## [1] "insensitive" "list"

Status codes

# Extract the numeric HTTP status code of the XML response.
status_code(response_xml)
## [1] 200

Status codes

Code Status
1xx Informational
2xx Success
3xx Redirection
4xx Client error (you did something wrong)
5xx Server error (server did something wrong)

Don’t need omdb()

# Hand the query parameters to GET() as a named list instead of pasting the
# URL together with omdb().
the_martian <- GET(
  url = "http://www.omdbapi.com/?",
  query = list(
    t = "The Martian",
    y = 2015,
    plot = "short",
    r = "json"
  )
)

# Parse the response body with httr's default settings.
the_martian %>%
  content()
## $Title
## [1] "The Martian"
## 
## $Year
## [1] "2015"
## 
## $Rated
## [1] "PG-13"
## 
## $Released
## [1] "02 Oct 2015"
## 
## $Runtime
## [1] "144 min"
## 
## $Genre
## [1] "Adventure, Drama, Sci-Fi"
## 
## $Director
## [1] "Ridley Scott"
## 
## $Writer
## [1] "Drew Goddard (screenplay), Andy Weir (book)"
## 
## $Actors
## [1] "Matt Damon, Jessica Chastain, Kristen Wiig, Jeff Daniels"
## 
## $Plot
## [1] "An astronaut becomes stranded on Mars after his team assume him dead, and must rely on his ingenuity to find a way to signal to Earth that he is alive."
## 
## $Language
## [1] "English, Mandarin"
## 
## $Country
## [1] "USA, UK"
## 
## $Awards
## [1] "Nominated for 7 Oscars. Another 34 wins & 171 nominations."
## 
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BMTc2MTQ3MDA1Nl5BMl5BanBnXkFtZTgwODA3OTI4NjE@._V1_SX300.jpg"
## 
## $Metascore
## [1] "80"
## 
## $imdbRating
## [1] "8.0"
## 
## $imdbVotes
## [1] "495,332"
## 
## $imdbID
## [1] "tt3659388"
## 
## $Type
## [1] "movie"
## 
## $Response
## [1] "True"

Scraping

<HTML>
<HEAD>
  <TITLE>Your Title Here</TITLE>
</HEAD>

<BODY BGCOLOR="FFFFFF">
<CENTER><IMG SRC="clouds.jpg" ALIGN="BOTTOM"> </CENTER>
<HR>
<a href="http://somegreatsite.com">Link Name</a> is a link to another nifty site
<H1>This is a Header</H1>
<H2>This is a Medium Header</H2>
Send me mail at <a href="mailto:support@yourcompany.com"> support@yourcompany.com</a>.
<P> This is a new paragraph!
<P> <B>This is a new paragraph!</B>
<BR> <B><I>This is a new sentence without a paragraph break, in bold italics.</I></B>
<HR>
</BODY>
</HTML>

Rendered HTML

Install your equipment

Practice CSS selectors

Let’s play a game together!

Obtain a table

gm

Read in table

# Parse the saved HTML page and extract its table(s); the result is a list.
html_table(read_html("GapminderHead.html"))
## [[1]]
##       country continent year lifeExp      pop gdpPercap
## 1 Afghanistan      Asia 1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia 1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia 1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia 1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia 1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia 1977  38.438 14880372  786.1134

Random observations on scraping

  • Make sure you’ve obtained ONLY what you want! Scroll over the whole page to ensure that selectorgadget hasn’t found too many things
  • If you are having trouble parsing, try selecting a smaller subset of the thing you are seeking (i.e., be more precise)
  • MOST IMPORTANT - confirm that there is NO rOpenSci package and NO API before you start; otherwise you may spend hours scraping only to discover that an API was right there all along