MACS 30500
University of Chicago
Sentiment analysis identifies the emotional content of text at the word level. In the sentence "I am happy", the word "happy" carries a positive sentiment. The tidytext package provides access to several sentiment lexicons through get_sentiments():
get_sentiments("bing")
## # A tibble: 6,788 × 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
## 7 abominably negative
## 8 abominate negative
## 9 abomination negative
## 10 abort negative
## # ... with 6,778 more rows
get_sentiments("afinn")
## # A tibble: 2,476 × 2
## word score
## <chr> <int>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,466 more rows
get_sentiments("nrc")
## # A tibble: 13,901 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
get_sentiments("nrc") %>%
count(sentiment)
## # A tibble: 10 × 2
## sentiment n
## <chr> <int>
## 1 anger 1247
## 2 anticipation 839
## 3 disgust 1058
## 4 fear 1476
## 5 joy 689
## 6 negative 3324
## 7 positive 2312
## 8 sadness 1191
## 9 surprise 534
## 10 trust 1231
To see a lexicon in action, tokenize Jane Austen's novels from the janeaustenr package, tracking the line number and chapter of each word:

library(janeaustenr)

tidy_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text,
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

Join the tokens to the Bing lexicon, count positive and negative words within 80-line sections of each novel, and compute the net sentiment (positive minus negative):

janeaustensentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

Plotting the net sentiment of each section traces the emotional arc of each novel:

ggplot(janeaustensentiment, aes(index, sentiment, fill = book)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
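The Bing lexicon only labels words as positive or negative. As a rough sketch (not part of the original example), the AFINN lexicon could be substituted to sum its numeric scores within each 80-line section, assuming the same tidy_books data frame:

# hypothetical variant using AFINN's numeric scores instead of Bing's binary labels
# (newer tidytext releases name the score column "value" rather than "score")
tidy_books %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(book, index = linenumber %/% 80) %>%
  summarize(sentiment = sum(score)) %>%
  ungroup()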
The same tidytext workflow applies to social media data. The rest of this section works through David Robinson's analysis of Donald Trump's Twitter timeline, prompted by the observation below:
Every non-hyperbolic tweet is from iPhone (his staff).
Every hyperbolic tweet is from Android (from him). pic.twitter.com/GWr6D8h5ed
— Todd Vaziri (@tvaziri) August 6, 2016
library(twitteR)
# You'd need to set global options with an authenticated app
setup_twitter_oauth(getOption("twitter_api_key"),
getOption("twitter_api_token"))
# the API returns at most the 3200 most recent tweets,
# and may return fewer
trump_tweets <- userTimeline("realDonaldTrump", n = 3200)
trump_tweets_df <- trump_tweets %>%
map_df(as.data.frame) %>%
tbl_df()
# if you want to follow along without setting up Twitter authentication,
# just use this dataset:
load(url("http://varianceexplained.org/files/trump_tweets_df.rda"))
tweets <- trump_tweets_df %>%
  select(id, statusSource, text, created) %>%
  # statusSource is an HTML link such as "...>Twitter for iPhone</a>";
  # extract just the client name
  extract(statusSource, "source", "Twitter for (.*?)<") %>%
  filter(source %in% c("iPhone", "Android"))
tweets %>%
head() %>%
knitr::kable(caption = "Example of Donald Trump tweets")
id | source | text | created |
---|---|---|---|
762669882571980801 | Android | My economic policy speech will be carried live at 12:15 P.M. Enjoy! | 2016-08-08 15:20:44 |
762641595439190016 | iPhone | Join me in Fayetteville, North Carolina tomorrow evening at 6pm. Tickets now available at: https://t.co/Z80d4MYIg8 | 2016-08-08 13:28:20 |
762439658911338496 | iPhone | #ICYMI: “Will Media Apologize to Trump?” https://t.co/ia7rKBmioA | 2016-08-08 00:05:54 |
762425371874557952 | Android | Michael Morell, the lightweight former Acting Director of C.I.A., and a man who has made serious bad calls, is a total Clinton flunky! | 2016-08-07 23:09:08 |
762400869858115588 | Android | The media is going crazy. They totally distort so many things on purpose. Crimea, nuclear, “the baby” and so much more. Very dishonest! | 2016-08-07 21:31:46 |
762284533341417472 | Android | I see where Mayor Stephanie Rawlings-Blake of Baltimore is pushing Crooked hard. Look at the job she has done in Baltimore. She is a joke! | 2016-08-07 13:49:29 |
library(lubridate)
library(scales)

tweets %>%
  count(source, hour = hour(with_tz(created, "EST"))) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(hour, percent, color = source)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "Hour of day (EST)",
       y = "% of tweets",
       color = "")
“@trumplican2016: @realDonaldTrump @DavidWohl stay the course mr trump your message is resonating with the PEOPLE”
— Donald J. Trump (@realDonaldTrump) July 28, 2016
tweets %>%
count(source,
quoted = ifelse(str_detect(text, '^"'), "Quoted", "Not quoted")) %>%
ggplot(aes(source, n, fill = quoted)) +
geom_bar(stat = "identity", position = "dodge") +
labs(x = "", y = "Number of tweets", fill = "") +
ggtitle('Whether tweets start with a quotation mark (")')
library(tidytext)
# function to neatly print the first six rows using kable
print_neat <- function(df){
  df %>%
    head() %>%
    knitr::kable()
}
# tweets data frame
tweets %>%
print_neat()
id | source | text | created |
---|---|---|---|
762669882571980801 | Android | My economic policy speech will be carried live at 12:15 P.M. Enjoy! | 2016-08-08 15:20:44 |
762641595439190016 | iPhone | Join me in Fayetteville, North Carolina tomorrow evening at 6pm. Tickets now available at: https://t.co/Z80d4MYIg8 | 2016-08-08 13:28:20 |
762439658911338496 | iPhone | #ICYMI: “Will Media Apologize to Trump?” https://t.co/ia7rKBmioA | 2016-08-08 00:05:54 |
762425371874557952 | Android | Michael Morell, the lightweight former Acting Director of C.I.A., and a man who has made serious bad calls, is a total Clinton flunky! | 2016-08-07 23:09:08 |
762400869858115588 | Android | The media is going crazy. They totally distort so many things on purpose. Crimea, nuclear, “the baby” and so much more. Very dishonest! | 2016-08-07 21:31:46 |
762284533341417472 | Android | I see where Mayor Stephanie Rawlings-Blake of Baltimore is pushing Crooked hard. Look at the job she has done in Baltimore. She is a joke! | 2016-08-07 13:49:29 |
# remove manual retweets (tweets that begin with a quotation mark)
tweets %>%
filter(!str_detect(text, '^"')) %>%
print_neat()
id | source | text | created |
---|---|---|---|
762669882571980801 | Android | My economic policy speech will be carried live at 12:15 P.M. Enjoy! | 2016-08-08 15:20:44 |
762641595439190016 | iPhone | Join me in Fayetteville, North Carolina tomorrow evening at 6pm. Tickets now available at: https://t.co/Z80d4MYIg8 | 2016-08-08 13:28:20 |
762439658911338496 | iPhone | #ICYMI: “Will Media Apologize to Trump?” https://t.co/ia7rKBmioA | 2016-08-08 00:05:54 |
762425371874557952 | Android | Michael Morell, the lightweight former Acting Director of C.I.A., and a man who has made serious bad calls, is a total Clinton flunky! | 2016-08-07 23:09:08 |
762400869858115588 | Android | The media is going crazy. They totally distort so many things on purpose. Crimea, nuclear, “the baby” and so much more. Very dishonest! | 2016-08-07 21:31:46 |
762284533341417472 | Android | I see where Mayor Stephanie Rawlings-Blake of Baltimore is pushing Crooked hard. Look at the job she has done in Baltimore. She is a joke! | 2016-08-07 13:49:29 |
# custom regular expression to tokenize tweets: split on any character that is
# not a letter, digit, #, @, or an apostrophe inside a word, so that
# #hashtags and @mentions survive as single tokens
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
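As a quick sanity check (on a made-up string, not the tweet data), splitting on this pattern keeps @mentions, #hashtags, and internal apostrophes intact:

# hypothetical example string
str_split("Thank you @FoxNews #MAGA don't miss it", reg)
# -> "Thank" "you" "@FoxNews" "#MAGA" "don't" "miss" "it"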
# unnest into tokens - tidytext format
tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  print_neat()
id | source | created | word |
---|---|---|---|
676494179216805888 | iPhone | 2015-12-14 20:09:15 | record |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | of |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | health |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | #makeamericagreatagain |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | #trump2016 |
676509769562251264 | iPhone | 2015-12-14 21:11:12 | another |
# remove stop words
tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]")) %>%
  print_neat()
id | source | created | word |
---|---|---|---|
676494179216805888 | iPhone | 2015-12-14 20:09:15 | record |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | health |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | #makeamericagreatagain |
676494179216805888 | iPhone | 2015-12-14 20:09:15 | #trump2016 |
676509769562251264 | iPhone | 2015-12-14 21:11:12 | accolade |
676509769562251264 | iPhone | 2015-12-14 21:11:12 | @trumpgolf |
# store for future use
tweet_words <- tweets %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&amp;", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]"))
Term frequency-inverse document frequency (tf-idf)

tf-idf measures how important a word is to one document within a collection: a word's frequency in a document (tf) is weighted by its inverse document frequency (idf), which shrinks the weight of words that appear in many documents:

\[idf(\text{term}) = \ln{\left(\frac{n_{\text{documents}}}{n_{\text{documents containing term}}}\right)}\]
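Here there are only two "documents" (the Android and iPhone tweet collections), so the idf can take only two values, which is worth checking against the output further below:

# a word used by only one of the two sources
log(2 / 1)   # 0.6931472
# a word used by both sources
log(2 / 2)   # 0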
tweet_words_count <- tweet_words %>%
count(source, word, sort = TRUE) %>%
ungroup()
tweet_words_count
## # A tibble: 3,235 × 3
## source word n
## <chr> <chr> <int>
## 1 iPhone #trump2016 171
## 2 Android hillary 124
## 3 iPhone #makeamericagreatagain 95
## 4 Android crooked 93
## 5 Android clinton 66
## 6 Android people 64
## 7 iPhone hillary 52
## 8 Android cruz 50
## 9 Android bad 43
## 10 iPhone america 43
## # ... with 3,225 more rows
total_words <- tweet_words_count %>%
group_by(source) %>%
summarize(total = sum(n))
total_words
## # A tibble: 2 × 2
## source total
## <chr> <int>
## 1 Android 4901
## 2 iPhone 3852
tweet_words_count <- left_join(tweet_words_count, total_words)
tweet_words_count
## # A tibble: 3,235 × 4
## source word n total
## <chr> <chr> <int> <int>
## 1 iPhone #trump2016 171 3852
## 2 Android hillary 124 4901
## 3 iPhone #makeamericagreatagain 95 3852
## 4 Android crooked 93 4901
## 5 Android clinton 66 4901
## 6 Android people 64 4901
## 7 iPhone hillary 52 3852
## 8 Android cruz 50 4901
## 9 Android bad 43 4901
## 10 iPhone america 43 3852
## # ... with 3,225 more rows
tweet_words_count <- tweet_words_count %>%
bind_tf_idf(word, source, n)
tweet_words_count
## # A tibble: 3,235 × 7
## source word n total tf idf
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 iPhone #trump2016 171 3852 0.04439252 0.0000000
## 2 Android hillary 124 4901 0.02530096 0.0000000
## 3 iPhone #makeamericagreatagain 95 3852 0.02466251 0.6931472
## 4 Android crooked 93 4901 0.01897572 0.0000000
## 5 Android clinton 66 4901 0.01346664 0.0000000
## 6 Android people 64 4901 0.01305856 0.0000000
## 7 iPhone hillary 52 3852 0.01349948 0.0000000
## 8 Android cruz 50 4901 0.01020200 0.0000000
## 9 Android bad 43 4901 0.00877372 0.0000000
## 10 iPhone america 43 3852 0.01116303 0.0000000
## # ... with 3,225 more rows, and 1 more variables: tf_idf <dbl>
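As an optional check, tf and idf can be recomputed by hand from the counts and compared with what bind_tf_idf() returned; the names tf_manual and idf_manual below are just illustrative:

tweet_words_count %>%
  group_by(word) %>%
  mutate(tf_manual = n / total,
         # 2 sources in total; n() is the number of sources using this word
         idf_manual = log(2 / n())) %>%
  ungroup() %>%
  select(source, word, tf, tf_manual, idf, idf_manual)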
tweet_words_count %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 3,235 × 6
## source word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 iPhone #makeamericagreatagain 95 0.024662513 0.6931472 0.017094751
## 2 iPhone join 42 0.010903427 0.6931472 0.007557680
## 3 iPhone #americafirst 27 0.007009346 0.6931472 0.004858508
## 4 iPhone #votetrump 23 0.005970924 0.6931472 0.004138729
## 5 iPhone #imwithyou 20 0.005192108 0.6931472 0.003598895
## 6 iPhone #crookedhillary 17 0.004413292 0.6931472 0.003059061
## 7 iPhone #trumppence16 14 0.003634476 0.6931472 0.002519227
## 8 iPhone 7pm 11 0.002855659 0.6931472 0.001979392
## 9 iPhone video 11 0.002855659 0.6931472 0.001979392
## 10 Android badly 13 0.002652520 0.6931472 0.001838587
## # ... with 3,225 more rows
tweet_important <- tweet_words_count %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word))))
tweet_important %>%
group_by(source) %>%
slice(1:15) %>%
ggplot(aes(word, tf_idf, fill = source)) +
geom_bar(alpha = 0.8, stat = "identity") +
labs(title = "Highest tf-idf words",
subtitle = "Top 15 for Android and iPhone",
x = NULL, y = "tf-idf") +
coord_flip()
# NRC lexicon, as before
nrc <- get_sentiments("nrc")
nrc
## # A tibble: 13,901 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
sources <- tweet_words %>%
group_by(source) %>%
mutate(total_words = n()) %>%
ungroup() %>%
distinct(id, source, total_words)
sources
## # A tibble: 1,172 × 3
## id source total_words
## <chr> <chr> <int>
## 1 676494179216805888 iPhone 3852
## 2 676509769562251264 iPhone 3852
## 3 680496083072593920 Android 4901
## 4 680503951440121856 Android 4901
## 5 680505672476262400 Android 4901
## 6 680734915718176768 Android 4901
## 7 682764544402440192 iPhone 3852
## 8 682792967736848385 iPhone 3852
## 9 682805320217980929 iPhone 3852
## 10 685490467329425408 Android 4901
## # ... with 1,162 more rows
by_source_sentiment <- tweet_words %>%
inner_join(nrc, by = "word") %>%
count(sentiment, id) %>%
ungroup() %>%
complete(sentiment, id, fill = list(n = 0)) %>%
inner_join(sources) %>%
group_by(source, sentiment, total_words) %>%
summarize(words = sum(n)) %>%
ungroup()
head(by_source_sentiment)
## # A tibble: 6 × 4
## source sentiment total_words words
## <chr> <chr> <int> <dbl>
## 1 Android anger 4901 321
## 2 Android anticipation 4901 256
## 3 Android disgust 4901 207
## 4 Android fear 4901 268
## 5 Android joy 4901 199
## 6 Android negative 4901 560
# function to calculate poisson.test for a given sentiment:
# df has one row per source, so df$words and df$total_words each hold
# the Android and iPhone values for that sentiment
poisson_test <- function(df){
  poisson.test(df$words, df$total_words)
}
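For reference, when poisson.test() receives two counts and two exposure totals it estimates the ratio of the two underlying rates; a minimal illustration with made-up numbers (not from the tweet data):

# hypothetical: 30 sentiment words out of 1000 vs. 20 out of 1000
poisson.test(c(30, 20), c(1000, 1000))
# estimated rate ratio = 1.5, with a confidence interval and p-value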
# use nest() and map() to apply poisson_test() to each sentiment,
# then extract the results with broom::tidy()
library(broom)

sentiment_differences <- by_source_sentiment %>%
  group_by(sentiment) %>%
  nest() %>%
  mutate(poisson = map(data, poisson_test),
         poisson_tidy = map(poisson, tidy)) %>%
  unnest(poisson_tidy, .drop = TRUE)
sentiment_differences
## # A tibble: 10 × 9
## sentiment estimate statistic p.value parameter conf.low
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 anger 1.492863 321 2.193242e-05 274.3619 1.2353162
## 2 anticipation 1.169804 256 1.191668e-01 239.6467 0.9604950
## 3 disgust 1.677259 207 1.777434e-05 170.2164 1.3116238
## 4 fear 1.560280 268 1.886129e-05 225.6487 1.2640494
## 5 joy 1.002605 199 1.000000e+00 198.7724 0.8089357
## 6 negative 1.692841 560 7.094486e-13 459.1363 1.4586926
## 7 positive 1.058760 555 3.820571e-01 541.4449 0.9303732
## 8 sadness 1.620044 303 1.150493e-06 251.9650 1.3260252
## 9 surprise 1.167925 159 2.174483e-01 148.9393 0.9083517
## 10 trust 1.128482 369 1.471929e-01 350.5114 0.9597478
## # ... with 3 more variables: conf.high <dbl>, method <fctr>,
## # alternative <fctr>
sentiment_differences %>%
  ungroup() %>%
  mutate(sentiment = reorder(sentiment, estimate)) %>%
  # convert rate ratios into percent increases
  mutate(across(c(estimate, conf.low, conf.high), ~ . - 1)) %>%
  ggplot(aes(estimate, sentiment)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
  scale_x_continuous(labels = percent_format()) +
  labs(x = "% increase in Android relative to iPhone",
       y = "Sentiment")
tweet_important %>%
inner_join(nrc, by = "word") %>%
filter(!sentiment %in% c("positive", "negative")) %>%
mutate(sentiment = reorder(sentiment, -tf_idf),
word = reorder(word, -tf_idf)) %>%
group_by(sentiment) %>%
top_n(10, tf_idf) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = source)) +
facet_wrap(~ sentiment, scales = "free", nrow = 4) +
geom_bar(stat = "identity") +
theme_bw(base_size = 16) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x = "",
y = "tf-idf") +
scale_fill_manual(name = "", labels = c("Android", "iPhone"),
values = c("red", "lightblue"))