18 Web Scraping
18.1 requests
18.1.1 Creating A Session
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random
# Retry policy shared by both adapters: retry up to 10 times on connection
# errors and up to 10 times on read errors. backoff_factor=1 makes urllib3
# sleep between attempts, with the delay growing after each failed retry.
_retries = Retry(connect=10, read=10, backoff_factor=1)

# (connect timeout, read timeout) in seconds; pass as `timeout=` on each request.
_timeout = (10, 10)

# A single Session reuses connections; mounting an adapter per URL scheme
# applies the retry policy to every request made through the session.
rqs = requests.Session()
rqs.mount('http://', HTTPAdapter(max_retries=_retries))
rqs.mount('https://', HTTPAdapter(max_retries=_retries))

# Sample targets used by the (commented-out) requests below.
link1 = 'https://www.yahoo.com'
link2 = 'http://mamamia777.com.au'
#user_agent = {'User-Agent': random.choice(_USER_AGENTS)}
#response1 = rqs.get(link1, timeout=_timeout)
#response2 = rqs.get(link2, timeout=_timeout)
#print (response1.status_code)
18.1.2 Rotating Browser
# Pool of desktop browser User-Agent strings to pick from when rotating the
# User-Agent header between requests.
_USER_AGENTS = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer (MSIE / Trident tokens)
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
]
18.2 BeautifulSoup
18.2.2 HTML Tag Parsing
18.2.2.1 Sample Data
my_html = '''
<div id="my-id1" class='title'>
<p>This Is My Title</p>
<div id="my-id2" class='subtitle' custom_attr='funny'>
<p>This is Subtitle</p>
</div>
<div id="my-id3" class='title', custom_attr='funny'>
<p>This is paragraph1</p>
<p>This is paragraph2</p>
<h3>This is paragraph3</h3>
</div>
</div>
'''
soup = BeautifulSoup(my_html)
18.2.2.2 First Match
ID Selector
Everything under the selected tag will be returned.
soup.find(id='my-id1')#:> <div class="title" id="my-id1">
#:> <p>This Is My Title</p>
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
#:> </div>
Class Selector
soup.find(class_='subtitle')#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
Attribute Selector
soup.find(custom_attr='funny')#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
soup.find( custom_attr='funny')#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
soup.find('div', custom_attr='funny')#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
18.2.2.3 Find All Matches
find_all
soup = BeautifulSoup(my_html)
multiple_result = soup.find_all(class_='title')
print( 'Item 0: \n', multiple_result[0],
'\n\nItem 1: \n', multiple_result[1])#:> Item 0:
#:> <div class="title" id="my-id1">
#:> <p>This Is My Title</p>
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
#:> </div>
#:>
#:> Item 1:
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
CSS Selector using select()
The above can also be achieved using a CSS selector. It returns a list of results (multiple matches).
multiple_result = soup.select('.title')
print( 'Item 0: \n', multiple_result[0],
'\n\nItem 1: \n', multiple_result[1])#:> Item 0:
#:> <div class="title" id="my-id1">
#:> <p>This Is My Title</p>
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
#:> </div>
#:>
#:> Item 1:
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
More granular example of css selector.
soup.select('#my-id1 div.subtitle')#:> [<div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>]
Using contains()
soup.select("p:contains('This is paragraph')")#:> [<p>This is paragraph1</p>, <p>This is paragraph2</p>]
Combining ID, Class and Custom Attribute in the selector
soup.select("div#my-id3.title[custom_attr='funny']:contains('This is paragraph')")#:> [<div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>]
18.2.3 Meta Parsing
my_meta = '''
<meta property="description" content="KUALA LUMPUR: blah blah" category="Malaysia">
<meta property="publish-date" content="2012-01-03">
'''
soup = BeautifulSoup(my_meta)
soup.find('meta', property='description')['content']#:> 'KUALA LUMPUR: blah blah'
soup.find('meta', property='description')['category']#:> 'Malaysia'
soup.find('meta', property='publish-date')['content']#:> '2012-01-03'
soup.find('meta', category='Malaysia')['property']#:> 'description'
18.2.4 Getting Content
18.2.4.1 Get Content get_text(strip=, separator=)
- Use `strip=True` to strip whitespace from the beginning and end of each bit of text
- Use `separator='\n'` to specify a string to be used to join the bits of text together
- It is recommended to use `strip=True, separator='\n'` so that results from different operating systems will be consistent
soup = BeautifulSoup(my_html)
elem = soup.find(id = "my-id3")
elem.get_text(strip=False)#:> '\nThis is paragraph1\nThis is paragraph2\nThis is paragraph3\n'
- strip=True combined with separator will retain only the user-readable text portion of each tag, with the separator separating them
elem.get_text(strip=True, separator='\n')#:> 'This is paragraph1\nThis is paragraph2\nThis is paragraph3'
18.2.5 Traversing
18.2.5.1 Get The Element
elems = soup.select("div#my-id3.title[custom_attr='funny']:contains('This is paragraph')")
elem = elems[0]
elem#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
18.2.5.2 Traversing Children
All Children In List findChildren()
elem.findChildren()#:> [<p>This is paragraph1</p>, <p>This is paragraph2</p>, <h3>This is paragraph3</h3>]
Next Children findNext()
- If the element has children, this will get the immediate child
- If the element has no children, this will find the next element in the hierarchy
# findNext() on an element that has children returns its first child;
# calling findNext() again on that child moves on to the next element.
first_child = elem.findNext()
print(
    first_child.get_text(strip=True), '\n',
    first_child.findNext().get_text(strip=True), '\n')
#:> This is paragraph1
#:> This is paragraph2