crawler

Use requests to fetch pages over HTTP GET, and BeautifulSoup to parse the returned HTML. Alternatively, urllib from the standard library covers the basic fetching, and lxml is a great tool for parsing XML.
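
For example, a minimal fetch-and-parse sketch (the URL here is only a placeholder):

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://example.com')        # placeholder URL
soup = BeautifulSoup(resp.text, 'html.parser')    # parse the HTML body
print(soup.title.get_text())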

1 Basic

1.1 packages

  • urllib is Python's official module, but it is not as convenient or efficient as the third-party downloader requests.
  • We may need cookie, proxy, HTTPS, and redirect handlers to download the target webpage (see the sketch after this list).
  • beautifulsoup is the ideal parser for HTML, and it can use lxml as its parsing backend.
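
A minimal sketch of wiring those handlers together with urllib; the proxy address is only a placeholder assumption:

import http.cookiejar
import urllib.request

# cookie, proxy, HTTPS and redirect handlers combined into one opener
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(
    urllib.request.HTTPCookieProcessor(cookie_jar),
    urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8080'}),  # placeholder proxy
    urllib.request.HTTPSHandler(),
    urllib.request.HTTPRedirectHandler(),
)
urllib.request.install_opener(opener)
html = urllib.request.urlopen('https://example.com').read().decode('utf-8')
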
import requests
from bs4 import BeautifulSoup
import lxml
from urllib.parse import quote
import string
import random

1.2 requests

regular request

import urllib.request

res = urllib.request.urlopen(url)
print(res.getcode() == 200)
cont = res.read().decode('utf-8')
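
The equivalent with requests, for comparison:

import requests

res = requests.get(url)
print(res.status_code == 200)
cont = res.text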

request with data and header

req = urllib.request.Request(url)
req.data = b'demo'  # the request body must be bytes in Python 3
req.add_header('User-Agent', 'Mozilla/5.0')
req.add_header('Origin', 'https://baike.baidu.com')
resp = urllib.request.urlopen(req)

request with cookiejar

import http.cookiejar

cj = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)
resp = urllib.request.urlopen(url)

post

from urllib import parse
from urllib.request import Request
from urllib.request import urlopen

req = Request(url)
postData = parse.urlencode([
    (key1, value1),
    (key2, value2),
    ...
])
urlopen(req, data=postData.encode('utf-8'))
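
With requests the same POST is a one-liner; the keys and values are placeholders as above:

import requests

resp = requests.post(url, data={key1: value1, key2: value2})
print(resp.status_code)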

1.3 random user agents

User_Agents =[
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
]

headers = {'User-Agent': random.choice(User_Agents)}
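
The chosen headers dict is then passed on every request, for example:

resp = requests.get('https://example.com', headers=headers)  # placeholder URL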

2 beautifulsoup1

2.1 make soup

from urllib.request import urlopen

html = urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, features='lxml')

# store the webpage locally
with open('index.html', 'w') as h:
    h.write(html)

2.2 re

import re
soup.find('a', href=re.compile(r"view"))
soup.find_all("img", {"src":re.compile("xxx")})

2.3 soup find

wiki = soup.find(mu=re.compile("baike.baidu.com"))
if wiki:
    return (wiki['mu'])

awiki = soup.find('div', class_='result c-container ')
if awiki:
    return (awiki.a['href'])

2.4 soup tag

A Tag stores its contents like a list, so we can read attributes and text and navigate to children, descendants, siblings, and parents.

print(wiki.attrs)
print(wiki['mu'])
print(awiki.a['href'])
print(awiki.a.get_text())
bsobj.find("table", {"id": "giftlist"}).children
bsobj.find("table", {"id": "giftlist"}).descendants
bsobj.find("table", {"id": "giftlist"}).tr.next_siblings
bsobj.find("table", {"id": "giftlist"}).tr.previous_siblings
bsobj.find("table", {"id": "giftlist"}).tr.parent

3 Crawler

With just these fundamental tools we can already write a fully functional crawler. But to code like a sophisticated programmer, we must also take care of error handling.

3.1 urlopen error

from urllib.request import urlopen
from urllib.error import HTTPError

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    print(e)

3.2 soup check

try:
    badContent = bsObj.nonExistingTag.anotherTag
except AttributeError as e:
    print("Tag was not found")
else:
    if badContent is None:
        print("Tag was not found")
    else:
        print(badContent)

3.3 crawler framework

  • crawler_main.py
  • url_manager.py
  • html_downloader.py
  • html_parser.py
  • meta_storage.py

3.4 url_manager

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        if url is None:
            return
        # only queue URLs that have been neither queued nor crawled yet
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

import urllib.request


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()

3.5 html_parser

from bs4 import BeautifulSoup
import urllib.parse
import re


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/item/"))
        for link in links:
            new_url = link['href']
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

3.6 crawler_main

# coding:utf-8
from baike_spider import url_manager, html_downloader, html_parser, html_outputer
import logging


class CrawlerMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def crawl(self, root_url):
        count = 1  # index of the URL currently being crawled
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crawl No.%d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 1000:
                    break
                count += 1
            except Exception:
                logging.warning('crawl failed')
        self.outputer.output_html()


if __name__ == "__main__":
    root_url = "https://baike.baidu.com/item/Python/407313"
    obj_spider = CrawlerMain()
    obj_spider.crawl(root_url)

3.7 meta_storage

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write("<html>")
            fout.write("<head><meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\"></head>")
            fout.write("<body>")
            fout.write("<table>")

            for data in self.datas:
                fout.write("<tr>")
                fout.write("<td>%s</td>" % data["url"])
                fout.write("<td>%s</td>" % data["title"])
                fout.write("<td>%s</td>" % data["summary"])
                fout.write("</tr>")

            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

4 PhantomJS2

A webpage may obfuscate its content with JavaScript, while soup can only parse the HTML page. When the data sits in window._DATA_, use a headless browser such as CasperJS or PhantomJS to render and parse it.
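
If the payload is embedded as a plain JSON literal, a regular expression plus json can sometimes recover it without a headless browser; a rough sketch, assuming the page source contains something like window._DATA_ = {...};:

import json
import re
import requests

html = requests.get(url).text  # url of the JS-rendered page
m = re.search(r'window\._DATA_\s*=\s*(\{.*?\});', html, re.S)
if m:
    data = json.loads(m.group(1))  # works only if the literal is valid JSON
    print(data.keys())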

If the webpage requires a login before we can GET it, use a headless browser such as PhantomJS or headless Chrome. Alternatively, we can write code to decode the JavaScript ourselves, or build a light browser.

brew tap homebrew/cask
brew cask install phantomjs

5 selenium

Selenium drives a real browser to execute the JavaScript, which makes it a little slow. Here is the demo.3

pip3 install selenium

For Chrome, install chromedriver and run the browser in headless mode (see the sketch below and Section 8).
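
Assuming Homebrew with the same old cask syntax used above:

brew cask install chromedriver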

6 scrapy
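
A minimal Scrapy spider sketch; the target quotes.toscrape.com is an assumed public demo site, not part of this project:

import scrapy


class QuotesSpider(scrapy.Spider):
    # run with: scrapy runspider quotes_spider.py -o quotes.json
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # yield one item per quote block on the page
        for quote in response.css("div.quote"):
            yield {
                "text": quote.css("span.text::text").get(),
                "author": quote.css("small.author::text").get(),
            }
        # follow pagination and parse the next page with the same callback
        next_page = response.css("li.next a::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)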

7 query movie info from dianying.fm

import requests
import string
import random
from bs4 import BeautifulSoup
import lxml
from urllib.parse import quote

User_Agents = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
]


def get_movie(query_str):
    headers = {'User-Agent': random.choice(User_Agents)}
    num = ''.join(random.sample(string.digits + string.ascii_letters, 11))
    cookie = {'bid': num, 'll': '"108296"'}
    base_url = "http://dianying.fm/search/?text="
    query_url = quote(query_str)
    url = base_url + query_url
    # pass the random UA and cookie with the request
    soup = BeautifulSoup(requests.get(url, headers=headers, cookies=cookie).text, 'html.parser')
    info = soup.find('div', class_='fm-movie-title')
    if info:
        return info.get_text().replace('\n', "").strip() + "\n"
    else:
        return query_str


with open('movies.txt', encoding='utf-8') as f:
    result = set()
    for line in f.readlines():
        result.add(get_movie(line))

with open('mres.txt', 'w', encoding='utf-8') as fo:
    for r in result:
        if r:
            fo.write(r)

8 query movie info from movie.douban.com

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import quote

chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)


def get_raw(query_str):
    base_url = "https://movie.douban.com/subject_search?search_text="
    query_url = quote(query_str)
    url = base_url + query_url
    driver.get(url)
    print(driver.title)
    iroot = driver.find_element_by_class_name('item-root')
    print(iroot)
    driver.quit()


get_raw('千与千寻')

9 query movie info from baidu baike

9.1 baike lemmaWgt

import requests
import string
import random
import re
from bs4 import BeautifulSoup
import lxml
from urllib.parse import quote
from urllib.request import urlopen

User_Agents = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
]


def get_movie(query_str):
    headers = {'User-Agent': random.choice(User_Agents)}
    num = ''.join(random.sample(string.digits + string.ascii_letters, 11))
    cookie = {'bid': num, 'll': '"108296"'}
    base_url = "https://baike.baidu.com/item/"
    query_url = quote(query_str)
    url = base_url + query_url
    # print(url)
    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')
    # soup = BeautifulSoup(html, 'html.parser')
    # with open('index.html', 'w') as h:
    #     h.write(html)

    # for str in soup.strings:
    # for str in soup.stripped_strings:
    #     print(str)
    # print(soup.li.next_siblings)

    # print(soup.title)
    info = soup.find('dd', class_='lemmaWgt-lemmaTitle-keyInfo')
    name = soup.find('h1')
    # name = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
    subtitle = soup.find('div', class_='lemmaWgt-subLemmaListTitle')
    if subtitle:
        # lis = soup.find_all('li', class_='list-dot list-dot-paddingleft')
        lis = soup.find_all('li', class_='list-dot list-dot-paddingleft', string=re.compile("电影"))
        for l in lis:
            print(l.string)
    # else:
    #     if info:
    #         print(info.string)
    #     if name:
    #         print(name.string)
    #     print(type(info.get_text()))
    #     print(info.get_text())
    #     return info.get_text().replace('\n', "").strip() + "\n"
    # else:
    #     info = soup.find('div', class_='lemmaWgt-lemmaTitle-keyInfo')
    #     print(info)
    #     return query_str

    binfo = soup.find('div', class_='basic-info cmn-clearfix')
    if binfo is not None:
        linfo = [x.replace('\xa0', "") for x in list(binfo.stripped_strings)]
        i = iter(linfo)
        d = dict(zip(i, i))
        print(d)

        names = binfo.find_all('dt')
        vals = binfo.find_all('dd')
        for n in names:
            print(n.get_text(strip=True).replace('\xa0', ''))

        names = [n.get_text(strip=True).replace('\xa0', '') for n in binfo.find_all('dt')]
        print(names)
        vals = ["".join(n.get_text(strip=True).split()) for n in binfo.find_all('dd')]
        print(vals)


with open('mov.txt', encoding='utf-8') as f:
    flag = 1
    result = set()
    for line in f.readlines():
        # if flag == 1:
        #     line = line[1:]
        # print(line.strip('\n'))
        result.add(get_movie(line.strip('\n')))
    # print(result)


# It's for dicts: writing one basic-info dict d to CSV would look like this
# (needs `import csv` and an open csvfile handle):
# csvwr = csv.DictWriter(csvfile, fieldnames=d.keys())
# csvwr.writeheader()
# csvwr.writerow(d)

with open('mres.txt', 'w', encoding='utf-8') as fo:
    # fo.write(str(result))
    for r in result:
        if r:
            fo.write(r)

9.2 dict 2 csv/txt4

import requests
import string
import random
import re
import json
from bs4 import BeautifulSoup
import lxml
from urllib.parse import quote
from urllib.request import urlopen

User_Agents = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
]


def get_movie(query_str):
    headers = {'User-Agent': random.choice(User_Agents)}
    num = ''.join(random.sample(string.digits + string.ascii_letters, 11))
    cookie = {'bid': num, 'll': '"108296"'}
    base_url = "https://baike.baidu.com/item/"
    query_url = quote(query_str)
    url = base_url + query_url
    # print(url)
    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, features='lxml')

    # disambiguation page: return the movie variant of the lemma
    mtitle = soup.find('div', class_='lemmaWgt-subLemmaListTitle')
    if mtitle:
        lis = soup.find_all('li', class_='list-dot list-dot-paddingleft', string=re.compile("电影"))
        for l in lis:
            return l.string

    # regular lemma page: turn the basic-info box into a dict
    binfo = soup.find('div', class_='basic-info cmn-clearfix')
    if binfo is not None:
        if binfo.find('sup'):
            binfo.sup.extract()
        linfo = [x.replace('\xa0', "") for x in list(binfo.stripped_strings)]
        i = iter(linfo)
        d = dict(zip(i, i))
        return d


def dict2json(filename, d):
    with open(filename, 'a') as f:
        json.dump(d, f, indent=2, ensure_ascii=False)
        f.write(',\n')


def str2txt(filename, s):
    with open(filename, 'a') as f:
        f.write(s + '\n')


with open('mov.txt', encoding='utf-8') as f:
    for line in f.readlines():
        film = line.strip('\n')
        wiki = get_movie(film)
        if wiki is not None:
            if isinstance(wiki, dict):
                dict2json('res_mov.json', wiki)
            if isinstance(wiki, str):
                str2txt('res_mul.txt', wiki)
        else:
            str2txt('res_non.txt', film)

9.3 0xEF0xBB0xBF5
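
0xEF 0xBB 0xBF is the UTF-8 byte order mark. If mov.txt was saved with a BOM, the first line read with plain utf-8 starts with an invisible '\ufeff' (the commented-out line = line[1:] hack above was working around exactly this), and the Baike lookup for that title fails. A minimal sketch of the fix is to read with the utf-8-sig codec, which strips the BOM automatically:

# 'utf-8-sig' decodes the file and drops a leading BOM if present
with open('mov.txt', encoding='utf-8-sig') as f:
    for line in f:
        print(repr(line.strip('\n')))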