#Import NLTK and Texts
import nltk
from nltk import *
from nltk.book import *
from nltk.corpus import stopwords
#Import Web Scraping Modules
from urllib import request
from bs4 import BeautifulSoup
#Command All Matplotlib Graphs to Appear Inline in the Notebook
%matplotlib inline
There are multiple ways of approaching this problem. One method is to follow the example shown in the NLTK Book, Chapter 3. This method, however, does not fully utilize BeautifulSoup, and as a result the output is not exactly the desired content.
HTML is a complex format containing not only the written text of paragraphs but also menu items, drop-down fields, and links, among other elements. If we want to read the content of a given page, we are generally interested in the text itself rather than the headers, metadata, navigation, and so forth.
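To see why, here is a rough sketch of that Chapter 3 approach (the same steps the function below wraps): strip every tag with get_text() and tokenize whatever remains.
#Sketch of the NLTK Book, Chapter 3 Approach
from urllib import request
from bs4 import BeautifulSoup
from nltk import word_tokenize
url = "http://www.nltk.org"
html = request.urlopen(url).read().decode('utf-8')
#get_text() strips ALL tags, so menu items and sidebar links survive as text
raw = BeautifulSoup(html, "html5lib").get_text()
tokens = word_tokenize(raw)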
def nltk_web_read(url, _raw=0, words=1):
    """
    -----------------------------------------------------
    This function prints the text of a website for a
    given URL.
    -----------------------------------------------------
    OPTIONS
    -----------------------------------------------------
    - _raw = option to print raw text from the HTML
        - 0 = no (default)
        - 1 = yes, print raw text
    -----------------------------------------------------
    - words = option to print word tokens from the HTML
        - 1 = print all words (default)
        - 2 = print only alphanumeric words
    -----------------------------------------------------
    """
    #Import Modules
    from urllib import request
    from bs4 import BeautifulSoup
    from nltk import word_tokenize
    #Get HTML from the URL
    response = request.urlopen(url)
    html = response.read().decode('utf-8')
    #Strip the HTML Tags to Get the Raw Text
    raw = BeautifulSoup(html, "html5lib").get_text()
    #Options
    #Raw Text Option
    if _raw == 0:
        pass
    else:
        print(raw[:200])
        #return raw
    #Get Tokens
    tokens = word_tokenize(raw)
    #Word Options
    #All Words
    if words == 1:
        print(tokens[:200])
        #return tokens
    #Alphanumeric Words Only
    elif words == 2:
        words = [w for w in tokens if w.isalnum()]
        print(words[:200])
        #return words
#Print the Raw Text and All Word Tokens
url = "http://www.nltk.org"
nltk_web_read(url, 1)
#Print Only the Alphanumeric Word Tokens
nltk_web_read(url, 0, 2)
Beautiful Soup offers a better option: we can specify that we only want the text of a page located within a particular HTML tag. While all pages differ, a typical setup places the written text within a paragraph <p> ... </p> set of tags. These typically appear in the "body" of the HTML, not the head, and are usually nested under a hierarchy of <div> tags.
Example 1: NLTK Website
<div class="section" id="natural-language-toolkit">
<h1>Natural Language Toolkit<a class="headerlink" href="#natural-language-toolkit" title="Permalink to this headline">¶</a></h1>
<p>NLTK is a leading platform for building Python programs to work with human language data.
It provides easy-to-use interfaces to <a class="reference external" href="http://nltk.org/nltk_data/">over 50 corpora and lexical
resources</a> such as WordNet,
along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning,
wrappers for industrial-strength NLP libraries,
and an active <a class="reference external" href="http://groups.google.com/group/nltk-users">discussion forum</a>.</p>
<p>Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation,
NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike.</p></div>
Example 2: The Atlantic Online
http://www.theatlantic.com/politics/archive/2016/02/berniebro-revisited/460212/
<div class="article-body" itemprop="articleBody">
<section id="article-section-1"><p>O reader, hear my plea: I am the victim of semantic drift.</p><p>Four months ago, I <a href="http://www.theatlantic.com/politics/archive/2015/10/here-comes-the-berniebro-bernie-sanders/411070/" data-omni-click="r'article',r'link',r'0',r'460212'">coined the term “Berniebro”</a> to describe a phenomenon I saw on Facebook: Men, mostly my age, mostly of my background, mostly with my political beliefs, were hectoring their friends about how great Bernie was even when their friends wanted to do something else, like talk about the NBA.</p> </section>
</div>
While the NLTK site wraps its content in a div class="section" tag, each website varies in terms of the classes provided; class names are typically unique to the website's design and CSS. If we generalize to finding all text in paragraphs <p> subsumed under a <div>, we can get the full text printed for most websites.
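As a minimal sketch of that pattern (illustrative only, assuming the page keeps its text in <p> tags under a <div>), we can grab a <div> and collect the text of each paragraph inside it:
#Sketch: Collect the Text of Every <p> Nested Under a <div>
from urllib import request
from bs4 import BeautifulSoup
html = request.urlopen("http://www.nltk.org").read().decode('utf-8')
soup = BeautifulSoup(html, "html5lib")
div = soup.find("div")
paragraphs = [p.get_text() for p in div.find_all("p")]
print(paragraphs[:2])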
Below I display this function followed by several examples.
def get_website_text(url, div_class=0, _return=0):
    """
    -----------------------------------------------------
    This function returns the text of a website for a
    given URL using Beautiful Soup.

    The URL must be specified.

    If you do not know the HTML format but would like to
    try parsing the URL, run as is. The parser looks for
    the "div" tag. However, depending on the webpage,
    you may need to first inspect the HTML and specify
    a "div class=<input>", where "<input>" could
    be any number of unique strings specific to the
    website.

    After finding the content tag, this function returns
    the text in the paragraph <p> tags.
    -----------------------------------------------------
    OPTIONS
    -----------------------------------------------------
    - div_class = a specified class of the <div> tag
        - 0 (default)
            - looks for any div tag. Works on some
              but not all websites.
        - Any string
            - looks for that string as a div class
              Example:
                  get_website_text(url, "content-wrapper")
              This input looks for the tag
              <div class="content-wrapper">.
    -----------------------------------------------------
    - _return = option to return text for use in another
      function.
        - 0 = do not return, print instead (default)
        - 1 = return text
    -----------------------------------------------------
    """
    #Import Modules
    from urllib import request
    from bs4 import BeautifulSoup
    #Get HTML from the URL
    response = request.urlopen(url)
    html = response.read().decode('utf-8')
    #Get Soup for Beautiful Soup
    soup = BeautifulSoup(html, "html5lib")
    #Class Option (Default=0)
    #Look for Any Div Tag
    if div_class == 0:
        content = soup.find("div")
        #Parser Content Warning Message
        if len(str(content)) < 1000:
            print("Your request may not be returning the desired results.", '\n'
                  "Consider inspecting the webpage and trying a different div tag", '\n')
            print("CURRENT RESULTS:", '\n', content)
    #Look for a Specific Div Class
    else:
        try:
            content = soup.find("div", {"class": str(div_class)})
            #Parser Content Warning Message
            if len(str(content)) < 1000:
                print("Your request may not be returning the desired results.", '\n'
                      "Consider inspecting the webpage and trying a different div tag", '\n')
                print("CURRENT RESULTS:", '\n', content)
        #Print an Error Message on Failure
        except:
            print("Error: Please check your div class='input'.",
                  "A valid 'input' must be specified")
            return
    #Stop If No Matching <div> Was Found
    if content is None:
        print("No matching <div> tag was found.")
        return
    #Get the Paragraph Body
    paragraph = ["".join(x.findAll(text=True)) for x in content.findAll("p")]
    paragraph_body = "\n\n%s" % ("\n\n".join(paragraph))
    #Return Function Option
    if _return == 1:
        return paragraph_body
    else:
        print(paragraph_body)
#Example NLTK Website
url = "http://www.nltk.org"
get_website_text(url)
#get_website_text(url, "content-wrapper")
#The Atlantic Online
url = "http://www.theatlantic.com/politics/archive/2016/02/berniebro-revisited/460212/"
text = get_website_text(url, 0, 1)
#Print a Subset of the Text
print(text[60:1000])
#The White House
url = "https://www.whitehouse.gov/the-press-office/2016/01/27/remarks-president-righteous-among-nations-award-ceremony"
text = get_website_text(url, 0, 1)
#Print a Subset of the Text
print(text[0:1500])
#To Print All of It
#get_website_text(url)
raw = get_website_text(url, 0, 1)
tokens = word_tokenize(raw)
print (tokens[:100])
# %load word_freq_nltk.py
def words(text, k=10, r=0, sw=0):
    """This function returns all alphabetic words of
    a specified length for a given text.
    Defaults: k=10, r=0, sw=0.
    -------------------------------------------------
    - k = the length of the word.
    -------------------------------------------------
    - r = the evaluation option.
      It takes values 0 (the default), 1, or 2.
        0. "equals"       | len(word) == k
        1. "less than"    | len(word) <  k
        2. "greater than" | len(word) >  k
    -------------------------------------------------
    - sw = stop words (English)
      Stop words are high-frequency words like
      the, to, also, and is, among others.
      In this function, sw takes values
      0 (the default) or 1.
      The function prints an exception
      statement if other values are entered.
    -------------------------------------------------
    """
    #Not Accounting for Stopwords
    if sw == 0:
        #Option to Return Words == k
        if r == 0:
            return [w.lower() for w in text if w.isalpha() and len(w) == k]
        #Option to Return Words < k
        elif r == 1:
            return [w.lower() for w in text if w.isalpha() and len(w) < k]
        #Option to Return Words > k
        elif r == 2:
            return [w.lower() for w in text if w.isalpha() and len(w) > k]
        else:
            pass
    #Excluding Stopwords
    elif sw == 1:
        #Build the Stopword Set Once for Speed
        stops = set(stopwords.words('english'))
        #Option to Return Words == k
        if r == 0:
            return [w.lower() for w in text if w.lower() not in stops
                    and w.isalpha() and len(w) == k]
        #Option to Return Words < k
        elif r == 1:
            return [w.lower() for w in text if w.lower() not in stops
                    and w.isalpha() and len(w) < k]
        #Option to Return Words > k
        elif r == 2:
            return [w.lower() for w in text if w.lower() not in stops
                    and w.isalpha() and len(w) > k]
        else:
            pass
    else:
        print("Please input a valid stopwords option: 0 = no, 1 = yes")
def freq_words(text, k=10, r=0, n=20, sw=0):
    """This function uses the words function to
    generate a frequency distribution of the most
    frequent words and a related graph.
    You can specify the word length, an equality option
    (to look for words ==, <, or > a given length),
    how many words to return, and whether to exclude
    stopwords.
    Defaults: k=10, r=0, n=20, sw=0.
    -------------------------------------------------
    - k = the length of the word.
    -------------------------------------------------
    - r = the evaluation option.
      It takes values 0 (the default), 1, or 2.
        0. "equals"       | len(word) == k
        1. "less than"    | len(word) <  k
        2. "greater than" | len(word) >  k
    -------------------------------------------------
    - n = the number of most common results.
      The default value is 20. For example, if you
      want to see the top 100 results, input 100.
    -------------------------------------------------
    - sw = stop words (English)
      Stop words are high-frequency words like
      the, to, also, and is, among others.
      In this function, sw takes values
      0 (the default) or 1.
      The function prints an exception
      statement if other values are entered.
    -------------------------------------------------
    """
    #Generate the Frequency Distribution for the Specified text, k, and r
    fdist = FreqDist(words(text, k, r, sw))
    #Clean Up the Title of the Text
    clean_title0 = str(text).replace("<Text: ", "").replace(">", "").replace('[', '').replace(']', '')
    clean_title1 = clean_title0.replace("'", '').replace('"', '').replace(',', '')[0:10]+"..."
    try:
        c2 = clean_title1.split(" by ")[0].title()
    except:
        c2 = clean_title0.title()
    #Create the Possible Titles
    figtitle1 = "Most Frequent "+str(k)+"-Letter Words in "+c2
    figtitle2 = "Most Frequent Words Less Than "+str(k)+"-Letters in "+c2
    figtitle3 = "Most Frequent Words Greater Than "+str(k)+"-Letters in "+c2
    figtitle4 = "Most Frequent Words of Any Length in "+c2
    figelse = "Most Frequent Words in "+c2
    #Set the Title Based on the Inputs
    if r == 0:
        figtitle = figtitle1
    elif r == 1:
        figtitle = figtitle2
    elif r == 2 and k != 0:
        figtitle = figtitle3
    elif r == 2 and k == 0:
        figtitle = figtitle4
    else:
        figtitle = figelse
    #Plot and Print the Most Common Words
    fdist.plot(n, title=figtitle, cumulative=True)
    print(figtitle+":", '\n', fdist.most_common(n))
    if sw == 1:
        print("*NOTE: Excluding English Stopwords")
#Get the Top 30 Words > 7 Letters in President Obama's Embassy Speech
freq_words(tokens, 7, 2, 30, 1)
#Get the Top 50 Words of Any Length in the NLTK Chat Corpus (text5)
freq_words(text5, 0, 2, 50, 1)
We can also specify the exact class of the <div> tag for the NLTK website and get the results, as shown below:
#Example NLTK Website, Specify the <div> Class
url = "http://www.nltk.org"
get_website_text(url, "content-wrapper")