Getting data from the web: scraping
MACS 30500
University of Chicago
November 2, 2016
OMDB
data:image/s3,"s3://crabby-images/5d7f4/5d7f4871b3d0eecf4471d0d97b6e4bfc469c328e" alt=""
http://www.omdbapi.com/?t=Interstellar&y=2014&plot=short&r=xml
Creating link in R
# Assemble the OMDB request URL by hand, one query parameter per line.
request <- str_c(
  "http://www.omdbapi.com/?t=", "Interstellar", "&",
  "y=", "2014", "&",
  "plot=", "short", "&",
  "r=", "xml"
)
# Show the assembled URL.
request
## [1] "http://www.omdbapi.com/?t=Interstellar&y=2014&plot=short&r=xml"
Better way
#' Build an OMDB API request URL
#'
#' Pairs each query parameter name (`t`, `y`, `plot`, `r`) with the supplied
#' value, percent-encodes the values, and appends the resulting query string
#' to the OMDB base URL.
#'
#' @param Title Movie title to look up (sent as `t`).
#' @param Year Release year (sent as `y`).
#' @param Plot Requested plot length (sent as `plot`), e.g. "short".
#' @param Format Requested response format (sent as `r`), e.g. "xml" or "json".
#' @return A length-one character vector containing the full request URL.
omdb <- function(Title, Year, Plot, Format){
  baseurl <- "http://www.omdbapi.com/?"
  params <- c("t=", "y=", "plot=", "r=")
  inputs <- list(Title, Year, Plot, Format)

  # Each argument must supply exactly one usable value; otherwise parameter
  # names and values would be misaligned or an NA would leak into the URL.
  if (any(lengths(inputs) != 1L) || anyNA(unlist(inputs))) {
    stop("Title, Year, Plot and Format must each be a single non-missing value",
         call. = FALSE)
  }
  values <- vapply(inputs, as.character, character(1))

  # Percent-encode the values so spaces, "&", "=" and similar characters
  # cannot break the query string. URLencode's default `repeated = FALSE`
  # leaves values that already look percent-encoded untouched.
  values <- vapply(values, utils::URLencode, character(1),
                   reserved = TRUE, USE.NAMES = FALSE)

  # paste0() is vectorised, so names and values are paired element-wise.
  args <- paste0(params, values, collapse = "&")
  paste0(baseurl, args)
}
omdb("Interstellar", "2014", "short", "xml")
## [1] "http://www.omdbapi.com/?t=Interstellar&y=2014&plot=short&r=xml"
Obtain OMDB data
# Build the request URL asking for the XML representation of the record.
request_interstellar <- omdb("Interstellar", "2014", "short", "xml")
# Create a connection object for that URL.
con <- curl(request_interstellar)
# Read the response body; `finally` guarantees the connection is closed
# even when the read fails (e.g. network error), so it is never left dangling.
answer_xml <- tryCatch(readLines(con), finally = close(con))
# Show the raw XML text.
answer_xml
## [1] "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root response=\"True\"><movie title=\"Interstellar\" year=\"2014\" rated=\"PG-13\" released=\"07 Nov 2014\" runtime=\"169 min\" genre=\"Adventure, Drama, Sci-Fi\" director=\"Christopher Nolan\" writer=\"Jonathan Nolan, Christopher Nolan\" actors=\"Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow\" plot=\"A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.\" language=\"English\" country=\"USA, UK\" awards=\"Won 1 Oscar. Another 39 wins & 134 nominations.\" poster=\"https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg\" metascore=\"74\" imdbRating=\"8.6\" imdbVotes=\"950,670\" imdbID=\"tt0816692\" type=\"movie\"/></root>"
Obtain OMDB data
# Build the request URL asking for the JSON representation of the record.
request_interstellar <- omdb("Interstellar", "2014", "short", "json")
# Create a connection object for that URL.
con <- curl(request_interstellar)
# Read the response body; `finally` guarantees the connection is closed
# even when the read fails (e.g. network error), so it is never left dangling.
answer_json <- tryCatch(readLines(con), finally = close(con))
# Pretty-print the raw JSON text for readability.
answer_json %>%
  prettify()
## {
## "Title": "Interstellar",
## "Year": "2014",
## "Rated": "PG-13",
## "Released": "07 Nov 2014",
## "Runtime": "169 min",
## "Genre": "Adventure, Drama, Sci-Fi",
## "Director": "Christopher Nolan",
## "Writer": "Jonathan Nolan, Christopher Nolan",
## "Actors": "Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow",
## "Plot": "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival.",
## "Language": "English",
## "Country": "USA, UK",
## "Awards": "Won 1 Oscar. Another 39 wins & 134 nominations.",
## "Poster": "https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg",
## "Metascore": "74",
## "imdbRating": "8.6",
## "imdbVotes": "950,670",
## "imdbID": "tt0816692",
## "Type": "movie",
## "Response": "True"
## }
##
JavaScript Object Notation
{
"crust": "original",
"toppings": ["cheese", "pepperoni", "garlic"],
"status": "cooking",
"customer": {
"name": "Brian",
"phone": "573-111-1111"
}
}
eXtensible Markup Language
<order>
<crust>original</crust>
<toppings>
<topping>cheese</topping>
<topping>pepperoni</topping>
<topping>garlic</topping>
</toppings>
<status>cooking</status>
</order>
Parsing XML
# Turn the raw XML text fetched earlier into a parsed XML document.
ans_xml_parsed <- answer_xml %>%
  xmlParse()
# Show the parsed document.
ans_xml_parsed
## <?xml version="1.0" encoding="UTF-8"?>
## <root response="True">
## <movie title="Interstellar" year="2014" rated="PG-13" released="07 Nov 2014" runtime="169 min" genre="Adventure, Drama, Sci-Fi" director="Christopher Nolan" writer="Jonathan Nolan, Christopher Nolan" actors="Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow" plot="A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival." language="English" country="USA, UK" awards="Won 1 Oscar. Another 39 wins & 134 nominations." poster="https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg" metascore="74" imdbRating="8.6" imdbVotes="950,670" imdbID="tt0816692" type="movie"/>
## </root>
##
Parsing XML
| Node property | Accessor function |
|---------------|-------------------|
| name | xmlName() |
| attributes | xmlAttrs() |
| children | xmlChildren() |
| value | xmlValue() |
Introducing the easy way: httr
- GET: fetch an existing resource
- POST: create a new resource
- PUT: update an existing resource
- DELETE: delete an existing resource
Redo the OMDB example
# Same request as before, this time issued through httr.
interstellar_json <- omdb("Interstellar", "2014", "short", "json")
# Send an HTTP GET request and keep the full response object.
response_json <- GET(interstellar_json)
# Extract the body, parsing it as JSON.
response_json %>%
  content(as = "parsed", type = "application/json")
## $Title
## [1] "Interstellar"
##
## $Year
## [1] "2014"
##
## $Rated
## [1] "PG-13"
##
## $Released
## [1] "07 Nov 2014"
##
## $Runtime
## [1] "169 min"
##
## $Genre
## [1] "Adventure, Drama, Sci-Fi"
##
## $Director
## [1] "Christopher Nolan"
##
## $Writer
## [1] "Jonathan Nolan, Christopher Nolan"
##
## $Actors
## [1] "Ellen Burstyn, Matthew McConaughey, Mackenzie Foy, John Lithgow"
##
## $Plot
## [1] "A team of explorers travel through a wormhole in space in an attempt to ensure humanity's survival."
##
## $Language
## [1] "English"
##
## $Country
## [1] "USA, UK"
##
## $Awards
## [1] "Won 1 Oscar. Another 39 wins & 134 nominations."
##
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BMjIxNTU4MzY4MF5BMl5BanBnXkFtZTgwMzM4ODI3MjE@._V1_SX300.jpg"
##
## $Metascore
## [1] "74"
##
## $imdbRating
## [1] "8.6"
##
## $imdbVotes
## [1] "950,670"
##
## $imdbID
## [1] "tt0816692"
##
## $Type
## [1] "movie"
##
## $Response
## [1] "True"
Redo the OMDB example
# Request the XML representation of the same record through httr.
interstellar_xml <- omdb("Interstellar", "2014", "short", "xml")
# Send an HTTP GET request and keep the full response object.
response_xml <- GET(interstellar_xml)
# Extract the body in parsed form.
response_xml %>%
  content(as = "parsed")
## {xml_document}
## <root response="True">
## [1] <movie title="Interstellar" year="2014" rated="PG-13" released="07 N ...
Status codes
# Inspect the HTTP status code attached to the response.
response_xml %>%
  status_code()
## [1] 200
Status codes
| Code | Meaning |
|------|---------|
| 1xx | Informational |
| 2xx | Success |
| 3xx | Redirection |
| 4xx | Client error (you did something wrong) |
| 5xx | Server error (server did something wrong) |
Don’t need omdb()
# Pass the query parameters to GET() as a named list instead of
# building the URL with a helper function.
the_martian <- GET(
  "http://www.omdbapi.com/?",
  query = list(
    t = "The Martian",
    y = 2015,
    plot = "short",
    r = "json"
  )
)
# Extract the response body.
the_martian %>%
  content()
## $Title
## [1] "The Martian"
##
## $Year
## [1] "2015"
##
## $Rated
## [1] "PG-13"
##
## $Released
## [1] "02 Oct 2015"
##
## $Runtime
## [1] "144 min"
##
## $Genre
## [1] "Adventure, Drama, Sci-Fi"
##
## $Director
## [1] "Ridley Scott"
##
## $Writer
## [1] "Drew Goddard (screenplay), Andy Weir (book)"
##
## $Actors
## [1] "Matt Damon, Jessica Chastain, Kristen Wiig, Jeff Daniels"
##
## $Plot
## [1] "An astronaut becomes stranded on Mars after his team assume him dead, and must rely on his ingenuity to find a way to signal to Earth that he is alive."
##
## $Language
## [1] "English, Mandarin"
##
## $Country
## [1] "USA, UK"
##
## $Awards
## [1] "Nominated for 7 Oscars. Another 34 wins & 171 nominations."
##
## $Poster
## [1] "https://images-na.ssl-images-amazon.com/images/M/MV5BMTc2MTQ3MDA1Nl5BMl5BanBnXkFtZTgwODA3OTI4NjE@._V1_SX300.jpg"
##
## $Metascore
## [1] "80"
##
## $imdbRating
## [1] "8.0"
##
## $imdbVotes
## [1] "495,332"
##
## $imdbID
## [1] "tt3659388"
##
## $Type
## [1] "movie"
##
## $Response
## [1] "True"
Scraping
<HTML>
<HEAD>
<TITLE>Your Title Here</TITLE>
</HEAD>
<BODY BGCOLOR="FFFFFF">
<CENTER><IMG SRC="clouds.jpg" ALIGN="BOTTOM"> </CENTER>
<HR>
<a href="http://somegreatsite.com">Link Name</a> is a link to another nifty site
<H1>This is a Header</H1>
<H2>This is a Medium Header</H2>
Send me mail at <a href="mailto:support@yourcompany.com"> support@yourcompany.com</a>.
<P> This is a new paragraph!
<P> <B>This is a new paragraph!</B>
<BR> <B><I>This is a new sentence without a paragraph break, in bold italics.</I></B>
<HR>
</BODY>
</HTML>
Rendered HTML
data:image/s3,"s3://crabby-images/64fdc/64fdc168ad0e47a0968cf679cacc26f85146b255" alt=""
Obtain a table
data:image/s3,"s3://crabby-images/2cd57/2cd57c8a756cd1bcd8eeac8f16d28e7ed1cf5195" alt="gm"
Read in table
# Read the saved HTML page and extract the table(s) it contains.
html_table(read_html("GapminderHead.html"))
## [[1]]
## country continent year lifeExp pop gdpPercap
## 1 Afghanistan Asia 1952 28.801 8425333 779.4453
## 2 Afghanistan Asia 1957 30.332 9240934 820.8530
## 3 Afghanistan Asia 1962 31.997 10267083 853.1007
## 4 Afghanistan Asia 1967 34.020 11537966 836.1971
## 5 Afghanistan Asia 1972 36.088 13079460 739.9811
## 6 Afghanistan Asia 1977 38.438 14880372 786.1134
Random observations on scraping
- Make sure you’ve obtained ONLY what you want! Scroll through the whole page to check that SelectorGadget hasn’t selected more elements than you intended
- If you are having trouble parsing, try selecting a smaller subset of the thing you are seeking (i.e. being more precise)
- MOST IMPORTANT - confirm that there is NO rOpenSci package and NO API before you spend hours scraping, only to discover that an API was right there all along