18 Web Scraping
18.1 requests
18.1.1 Creating A Session
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random
_retries = Retry(connect=10, read=10, backoff_factor=1)  # backoff_factor is the incremental interval in seconds between retries
_timeout = (10, 10)                                       ## connect, read timeout in seconds

rqs = requests.Session()
rqs.mount('http://',  HTTPAdapter(max_retries=_retries))
rqs.mount('https://', HTTPAdapter(max_retries=_retries))

link1 = 'https://www.yahoo.com'
link2 = 'http://mamamia777.com.au'

#user_agent = {'User-Agent': random.choice(_USER_AGENTS)}
#response1 = rqs.get(link1, timeout=_timeout)
#response2 = rqs.get(link2, timeout=_timeout)
#print(response1.status_code)
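Since link2 points to a host that does not resolve, a request to it will exhaust its retries and raise an exception. A minimal sketch (reusing the session and timeout defined above) of guarding against that:

try:
    response1 = rqs.get(link1, timeout=_timeout)
    print(response1.status_code)                        # e.g. 200 when the request succeeds
except requests.exceptions.RequestException as err:     # base class covering ConnectionError, Timeout and retry failures
    print('Request failed:', err)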
18.1.2 Rotating Browser
_USER_AGENTS = [
#Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
#Internet Explorer
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)']
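The commented user_agent line in 18.1.1 is where this list plugs in. A minimal sketch (reusing rqs, link1 and _timeout from above) that sends a randomly chosen user agent with each request:

headers  = {'User-Agent': random.choice(_USER_AGENTS)}   # pick a different browser identity per request
response = rqs.get(link1, headers=headers, timeout=_timeout)
print(response.status_code)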
18.2 BeautifulSoup
18.2.2 HTML Tag Parsing
18.2.2.1 Sample Data
from bs4 import BeautifulSoup

my_html = '''
<div id="my-id1" class='title'>
  <p>This Is My Title</p>
  <div id="my-id2" class='subtitle' custom_attr='funny'>
    <p>This is Subtitle</p>
  </div>
  <div id="my-id3" class='title' custom_attr='funny'>
    <p>This is paragraph1</p>
    <p>This is paragraph2</p>
    <h3>This is paragraph3</h3>
  </div>
</div>
'''
soup = BeautifulSoup(my_html, 'html.parser')
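To inspect what the parser actually built, prettify() renders the parse tree as indented HTML, one tag per line (output omitted here):

print(soup.prettify())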
18.2.2.2 First Match
ID Selector
Everything under the selected tag will be returned.
soup.find(id='my-id1')
#:> <div class="title" id="my-id1">
#:> <p>This Is My Title</p>
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
#:> </div>
Class Selector
soup.find(class_='subtitle')
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
Attribute Selector
soup.find(custom_attr='funny')
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
soup.find('div', custom_attr='funny')
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
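Keyword filtering only works when the attribute name is a valid Python identifier; for names such as data-category the attrs dictionary form is needed. A sketch of the same query in that form (it returns the same my-id2 div as above):

soup.find('div', attrs={'custom_attr': 'funny'})   # dictionary form; required for attribute names like 'data-*'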
18.2.2.3 Find All Matches
find_all
soup = BeautifulSoup(my_html, 'html.parser')
multiple_result = soup.find_all(class_='title')
print('Item 0: \n', multiple_result[0],
      '\n\nItem 1: \n', multiple_result[1])
#:> Item 0:
#:> <div class="title" id="my-id1">
#:> <p>This Is My Title</p>
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
#:> </div>
#:>
#:> Item 1:
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
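find_all() returns an ordinary list, so the matches can be looped over and interrogated individually. A small sketch pulling just the id attribute of each matched div:

for div in soup.find_all(class_='title'):
    print(div.get('id'))       # .get() returns None instead of raising if the attribute is missing
#:> my-id1
#:> my-id3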
CSS Selector using select()
The above can be achieved with a CSS selector. select() returns a list of results (multiple matches).
multiple_result = soup.select('.title')
print('Item 0: \n', multiple_result[0],
      '\n\nItem 1: \n', multiple_result[1])
#:> Item 0:
#:> <div class="title" id="my-id1">
#:> <p>This Is My Title</p>
#:> <div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
#:> </div>
#:>
#:> Item 1:
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
A more granular example of a CSS selector:
soup.select('#my-id1 div.subtitle')
#:> [<div class="subtitle" custom_attr="funny" id="my-id2">
#:> <p>This is Subtitle</p>
#:> </div>]
Using contains()
"p:contains('This is paragraph')") soup.select(
#:> [<p>This is paragraph1</p>, <p>This is paragraph2</p>]
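Note that newer versions of soupsieve (the library behind select()) deprecate :contains() in favour of :-soup-contains(); depending on the installed version, the same query may need to be written as below (a sketch with the same expected result):

soup.select("p:-soup-contains('This is paragraph')")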
Combining ID, Class and Custom Attribute in the selector
"div#my-id3.title[custom_attr='funny']:contains('This is paragraph')") soup.select(
#:> [<div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>]
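When only the first match is wanted, select_one() accepts the same CSS syntax but returns a single element (or None) instead of a list, much like find(). A small sketch:

soup.select_one('#my-id1 div.subtitle')   # first match only, or None if nothing matches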
18.2.3 Meta Parsing
my_meta = '''
<meta property="description" content="KUALA LUMPUR: blah blah" category="Malaysia">
<meta property="publish-date" content="2012-01-03">
'''
soup = BeautifulSoup(my_meta, 'html.parser')
soup.find('meta', property='description')['content']
#:> 'KUALA LUMPUR: blah blah'
soup.find('meta', property='description')['category']
#:> 'Malaysia'
soup.find('meta', property='publish-date')['content']
#:> '2012-01-03'
soup.find('meta', category='Malaysia')['property']
#:> 'description'
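find() returns None when no tag matches, so indexing ['content'] on the result raises a TypeError for pages that lack the tag. A defensive sketch (the 'keywords' property here is hypothetical and absent from my_meta):

tag = soup.find('meta', property='keywords')   # not present in my_meta, so tag is None
keywords = tag['content'] if tag else None
print(keywords)
#:> None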
18.2.4 Getting Content
18.2.4.1 Get Content get_text(strip=, separator=)
- Use strip=True to strip whitespace from the beginning and end of each bit of text
- Use separator='\n' to specify a string used to join the bits of text together
- It is recommended to use strip=True, separator='\n' so that results are consistent across operating systems
soup = BeautifulSoup(my_html, 'html.parser')
elem = soup.find(id="my-id3")
elem.get_text(strip=False)
#:> '\nThis is paragraph1\nThis is paragraph2\nThis is paragraph3\n'
- strip=True combined with separator retains only the human-readable text of each tag, with the separator between them
elem.get_text(strip=True, separator='\n')
#:> 'This is paragraph1\nThis is paragraph2\nThis is paragraph3'
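Because the separator is a known character, the result can be split straight into a list of text fragments, one per tag. A small sketch:

lines = elem.get_text(strip=True, separator='\n').split('\n')
print(lines)
#:> ['This is paragraph1', 'This is paragraph2', 'This is paragraph3']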
18.2.5 Traversing
18.2.5.1 Get The Element
elems = soup.select("div#my-id3.title[custom_attr='funny']:contains('This is paragraph')")
elem = elems[0]
elem
#:> <div class="title" custom_attr="funny" id="my-id3">
#:> <p>This is paragraph1</p>
#:> <p>This is paragraph2</p>
#:> <h3>This is paragraph3</h3>
#:> </div>
18.2.5.2 Traversing Children
All Children In List findChildren()
elem.findChildren()
#:> [<p>This is paragraph1</p>, <p>This is paragraph2</p>, <h3>This is paragraph3</h3>]
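findChildren() also returns an ordinary list, so it can be iterated to process each child tag in turn. A small sketch printing each child's tag name together with its text:

for child in elem.findChildren():
    print(child.name, ':', child.get_text(strip=True))
#:> p : This is paragraph1
#:> p : This is paragraph2
#:> h3 : This is paragraph3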
Next Children findNext()
- If the element has children, this will get the immediate child
- If the element has no children, this will find the next element in the hierarchy
first_child = elem.findNext()
print(first_child.get_text(strip=True), '\n',
      elem.findNext().findNext().get_text(strip=True), '\n')
#:> This is paragraph1
#:> This is paragraph2