urllib module
Not used; requests is used throughout these notes instead.
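For comparison, a minimal sketch of the same fetch done with the standard-library urllib (the URL is the one reused below); requests is preferred in the rest of these notes because it handles parameters, headers and encodings with less boilerplate.
#urllib equivalent of requests.get(url).text (sketch for comparison only)
from urllib import request
url = 'https://dyhgo.fun'
with request.urlopen(url) as resp: #returns an http.client.HTTPResponse
    html = resp.read().decode('utf-8') #bytes -> str; assumes the page is UTF-8
print(html)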
requests module
Getting the page source
import requests
url = 'https://dyhgo.fun'
response = requests.get(url)
print(response.text)
URL parameters and User-Agent spoofing
#Test a parameterized URL with UA spoofing
import requests
url = 'https://www.baidu.com/s'
name = input('Enter a search term; the Baidu result page source for it will be printed: ')
params = {
'wd' : name
}
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, params=params, headers=headers)
print(response.text)
Example: Youdao Translate word lookup
#Youdao Translate: look up the word 'look' and save the JSON result
#salt/sign/lts/bv below were captured from a browser request and are tied to this exact query
import requests
import json
params = {
'i': 'look',
'from': 'AUTO',
'to': 'AUTO',
'smartresult': 'dict',
'client': 'fanyideskweb',
'salt': '16119129043609',
'sign': '288cdf16af5fa68411381ba3c9f7f874',
'lts': '1611912904360',
'bv': '44a53b4124e8b822ebfd881c5a599938',
'doctype': 'json',
'version': '2.1',
'keyfrom': 'fanyi.web',
'action': 'FY_BY_REALTlME'
}
url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.post(url=url, data=params, headers=headers)
f = open('look.json', 'w', encoding='utf-8')
json.dump(response.json(), fp=f, ensure_ascii=False)
f.close()
print(response.json())
Example: Douban movie chart
Site link
#Fetch the Douban movie chart via its Ajax JSON endpoint
import requests
import json
url = 'https://movie.douban.com/j/chart/top_list'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
params = {
'type': '24',
'interval_id': '100:90',
'action':'',
'start': '1',
'limit': '2'
}
response = requests.get(url=url, params=params, headers=headers)
list_data = response.json()
f = open('douban.json', 'w', encoding='utf-8')
json.dump(list_data, fp=f, ensure_ascii=False)
f.close()
Example: KFC restaurant listings
Site link
#KFC restaurant listings (keyword search, POST form)
import requests
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
params = {
'cname':'',
'pid':'',
'keyword': '上海',
'pageIndex': '1',
'pageSize': '10'
}
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.post(url=url, data=params, headers=headers)
print(response.text)
Example: NMPA (drug administration) licence data
Note: the data is loaded via Ajax, so the JSON endpoints (found in the browser's Network panel) are requested directly instead of the HTML page.
Site link
#NMPA licence data: fetch the ID list first, then each detail record
import requests
import json
url1 = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
data = {
'on': 'true',
'page': '1',
'pageSize': '15',
'productName':'',
'conditionType': '1',
'applyname':'',
'applysn':''
}
id_list = []
response = requests.post(url=url1, data=data, headers=headers).json()['list']
for i in response:
    id_list.append(i['ID'])
#print(id_list)
detail_list = []
url2 = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {
        'id' : id
    }
    response = requests.post(url=url2, data=data, headers=headers).json()
    detail_list.append(response)
    #print(response)
with open('yaojianju.json', 'w', encoding='utf-8') as f:
    json.dump(detail_list, fp=f, ensure_ascii=False, indent=True)
print('over')
Data parsing
Regular expressions
See here for basic usage
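Before the examples, a quick refresher on the two regex features they rely on (a hypothetical snippet, not from the original notes): the non-greedy quantifier .*? and the re.S flag, which makes '.' also match newlines.
#re.S lets '.' match newlines; '.*?' is non-greedy, so each thumb div is matched separately
import re
html = '<div class="thumb">\n<img src="//example.com/a.jpg" alt="x">\n</div>'
print(re.findall('<div class="thumb">.*?<img src="(.*?)" alt', html, re.S)) #['//example.com/a.jpg']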
Example: downloading a Qiushibaike image
Site link
#Download a single image from Qiushibaike (binary response content)
import requests
url = 'https://pic.qiushibaike.com/system/pictures/12402/124029001/medium/J8FFRWUK30TM3X79.jpg'
header = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=header)
image = response.content
with open('qiutu.jpg', 'wb') as f:
    f.write(image)
print('over')
Example: batch-downloading Qiushibaike images
Site link
#Extract image URLs with a regular expression (loop over img_list to download them all; only the first is saved here)
# <div class="thumb">
#
# <a href="/article/124031033" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12403/124031033/medium/V35P60R0KM4YCY3Q.jpg" alt="糗事#124031033" class="illustration" width="100%" height="auto">
# </a>
# </div>
import requests
import re
url = 'https://www.qiushibaike.com/imgrank/'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
#print(response)
regex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
img_list = re.findall(regex, response, re.S)
#print(img_list)
img_url = 'https:' + img_list[0]
#print(img_url)
response = requests.get(url=img_url, headers=headers).content
#print(type(response))
with open('qiutu2.jpg', 'wb') as f:
    f.write(response)
print('download successfully')
bs4
Parsing a local HTML file with BeautifulSoup
#Parse a local HTML file with BeautifulSoup
from bs4 import BeautifulSoup
with open('test.html', 'r', encoding='utf-8') as f:
    bs = BeautifulSoup(f, 'lxml')
print(bs)
Parsing freshly fetched HTML with BeautifulSoup
#Parse the HTML fetched by requests with BeautifulSoup
from bs4 import BeautifulSoup
import requests
url = 'https://dyhgo.fun'
print(BeautifulSoup(requests.get(url=url).text, 'lxml'))
Using BeautifulSoup
#BeautifulSoup usage; there is a fair amount of it
from bs4 import BeautifulSoup
with open('forbsfor.html', 'r', encoding='utf-8') as f:
    bs = BeautifulSoup(f, 'lxml')
print(bs)
print(bs.a) #print the first <a> tag
print(bs.div)
print(bs.find('div')) #same as bs.div
print(bs.find('div', class_='header-title')) #find a tag by attribute; note the trailing underscore in class_
print(bs.find('div', id = 'menu-toggle-mobile'))
print(bs.find_all('a')) #get a list of all 'a' tags
#CSS selectors
print(bs.select('.page-item')) #a list of tags whose class is 'page-item'
#print(bs.select('.header-wrapper > .header-title > a[0]')) #wrong: the index belongs outside select(), see the next line
print(bs.select('.header-wrapper > .header-title > a')[0]) #'>' selects direct children; select() returns a list
print(bs.select('.header-wrapper a')[0]) #a space selects descendants at any depth
#get text content or attributes
print(bs.select('.header-wrapper a')[0].text)
print(bs.select('.header-wrapper a')[0].get_text())
print(bs.select('.header-wrapper a')[0].string)
print(bs.select('.header-title')[0].text) #.text returns all text inside the tag, including children
print(bs.select('.header-title')[0].get_text())
print(bs.select('.header-title')[0].string) #.string returns the tag's own text only when it has a single child, otherwise None
print(bs.select('.header-wrapper > .header-title > a')[0]['title'])
Example: scraping the text of Romance of the Three Kingdoms
Site link
#Scrape Romance of the Three Kingdoms chapter by chapter
import requests
from bs4 import BeautifulSoup
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
bs = BeautifulSoup(response, 'lxml')
#mulu = bs.find('div', class_='book-mulu')
a_list = bs.select('.book-mulu > ul > li > a')
#print(a_list)
f = open('sanguo.txt', 'a', encoding='utf-8')
for a in a_list[:3]:
    title = a.string.encode('iso-8859-1').decode('utf-8') #the listing page is decoded with the wrong charset, so re-encode to undo the mojibake
    #print(title)
    url_detail = 'https://www.shicimingju.com/' + a['href']
    #print(url_detail)
    #response_detail = requests.get(url=url_detail, headers=headers).text.encode('iso-8859-1')
    #the approach below (setting response.encoding) is more general than re-encoding by hand
    response_detail = requests.get(url=url_detail, headers=headers)
    response_detail.encoding = response_detail.apparent_encoding
    response_detail = response_detail.text
    bs = BeautifulSoup(response_detail, 'lxml')
    content_detail = bs.find('div', class_='chapter_content').text #.text, not .string: the div contains child tags
    #print(type(content_detail))
    f.write(title + ':' + content_detail + '\n')
    #print(content_detail)
    print(title + ' download successfully')
f.close()
xpath
Used very often
Basic usage
from lxml import etree
import requests
url = 'https://dyhgo.fun'
response = requests.get(url=url).text
tree = etree.HTML(response) #etree.HTML parses a string directly; no etree.HTMLParser needed
print(tree.xpath('/html/head/title/text()'))
XPath syntax
from lxml import etree
parser = etree.HTMLParser()
tree = etree.parse('forbsfor.html', parser=parser)
r = tree.xpath('/html/head/meta') #xpath() returns a list
r = tree.xpath('/html//meta') # // matches descendants at any depth
r = tree.xpath('//div') #all div tags anywhere in the document
r = tree.xpath('//div[@class = "header-wrapper"]') #attribute predicate
r = tree.xpath('(//div[@class = "header-wrapper"])[2]') #the second such div; note XPath indexing starts at 1
r = tree.xpath('(//div[@class = "header-wrapper"])[1]/div/a/text()') #gets 'DYH', but does not work if '/div/' is replaced with '//'
r = tree.xpath('(//div[@class = "header-title"])[1]//text()') #gets ['\r\n ', 'DYH', '\r\n ']: '//text()' returns every text node below the tag, '/text()' only the direct ones
r = tree.xpath('(//div[@class = "header-title"])[1]/a/@title') # get attribute of <a>
print(r)
Example: second-hand housing listings from 58.com
Site link
#Scrape 58.com listings; note: the listings shown on the site vary between requests
import requests
from lxml import etree
url = 'https://bj.58.com/ershoufang/'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
div_list = tree.xpath('(//section[@class = "list"])[1]/div')
#print(div_list)
for div in div_list:
    title = div.xpath('./a/div[@class = "property-content"]//h3/@title')[0] #'./' anchors the path at the current div; '..' would go to the parent
    print(title)
XPath OR operator (example: fetching all city names)
Site link
#All city names: '|' ORs two XPath expressions together
import requests
from lxml import etree
url = 'https://www.aqistudy.cn/historydata/'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
a_list = tree.xpath('//div[@class = "bottom"]/ul/li/a | //div[@class = "bottom"]/ul/div[2]/li/a')
city_name = []
for a in a_list:
    city_name.append(a.xpath('./text()')[0])
print(city_name, len(city_name))
CAPTCHA recognition
Using the Chaojiying API
Link
Click here to download the Python client
A usage example:
import requests
from lxml import etree
import chaojiying_Python.chaojiying as rec
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
img_url = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
response = requests.get(url=img_url, headers=headers).content
with open('chaojiying_Python/b.png', 'wb') as f:
    f.write(response)
print(rec.main())
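What rec.main() returns depends on the chaojiying.py you downloaded. Assuming it contains the Chaojiying_Client class from the official Python demo, the recognition step is roughly the sketch below; the credentials and the code type 1902 are placeholders, not values from the original notes.
#Rough equivalent of rec.main(), assuming the official demo's Chaojiying_Client
from chaojiying_Python.chaojiying import Chaojiying_Client
client = Chaojiying_Client('username', 'password', 'soft_id') #placeholder Chaojiying credentials
with open('chaojiying_Python/b.png', 'rb') as f:
    im = f.read()
print(client.PostPic(im, 1902)) #1902 is a placeholder code type; returns a dict containing the recognized text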
Cookies (simulated login)
Example: logging in to Nowcoder
#A requests.Session keeps cookies across requests
import requests
url = 'https://www.nowcoder.com/nccommon/login/do?token='
session = requests.Session()
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
}
data = {
'email': 'xxxxxxx', #login account (phone number or email)
'remember': 'false',
'cipherPwd': 'xxxxxxxxx' #encrypted password (can be captured from a browser request)
}
response = session.post(url=url, headers=headers, data=data)
print(response.status_code)
page_src = response.text
profile_url = 'https://ac.nowcoder.com/sns/message/90625985/conversation-list?#/'
profile_response = session.get(url=profile_url, headers=headers)
print(profile_response.status_code)
profile_src = profile_response.text
with open('wangye2.html', 'w', encoding='utf-8') as f:
    f.write(profile_src)
Proxy IPs
Proxy IP websites (most of them are unreliable)
GitHub repository for a proxy IP pool
#Use a proxy IP
import requests
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
}
src = requests.get(url='http://www.baidu.com/s?wd=ip', headers=headers, proxies={'http': 'http://183.166.102.222:9999'}).text #the proxies key must match the URL scheme (http here)
Asynchronous crawling
Thread pool (multiprocessing.dummy exposes the Pool API but is backed by threads)
from multiprocessing.dummy import Pool #multiprocessing.dummy provides a thread pool with the multiprocessing.Pool API
import time
def tst(name):
    print(name, 'doing')
    time.sleep(2)
    print(name, 'done')
pool = Pool(3)
sta = time.time()
lis = ['aa', 'bb', 'cc', 'dd']
pool.map(tst, lis)
ed = time.time()
print(ed - sta) #~4s: 4 tasks of 2s each on 3 workers (8s if run serially)
Example: downloading videos from Pearvideo with a thread pool
Site link
#Pearvideo: the videoStatus.jsp API returns a fake URL with a timestamp segment, e.g.
#  https://video.pearvideo.com/mp4/third/20210201/1612279360957-15765543-134934-hd.mp4
#replacing that segment with 'cont-<video id>' yields the real URL:
#  https://video.pearvideo.com/mp4/third/20210201/cont-1718821-15765543-134934-hd.mp4
import requests
from lxml import etree
import random
from multiprocessing.dummy import Pool
headers = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_5'
page_src = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_src)
li_list = tree.xpath('//ul[@id = "listvideoListUl"]/li')
vd_list = []
for li in li_list:
    if li == li_list[0]:
        continue #skip the first entry
    #detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    video_id = li.xpath('./div/a/@href')[0][6:]
    video_name = li.xpath('./div/a/div[@class = "vervideo-title"]/text()')[0] + '.mp4'
    #print(video_id, video_name)
    detail_url = 'https://www.pearvideo.com/videoStatus.jsp'
    params = {
        'contId' : video_id,
        'mrd' : str(random.random())
    }
    new_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        'Referer' : 'https://www.pearvideo.com/video_' + video_id #Referer set to the video page (required by the API)
    }
    video_dict = requests.get(url=detail_url, headers=new_headers, params=params).json()
    raw_url = video_dict['videoInfo']['videos']['srcUrl']
    #print(raw_url, video_name)
    left = raw_url.rfind('/')
    right = raw_url.find('-')
    true_url = raw_url.replace(raw_url[left + 1 : right], 'cont-' + video_id) #swap the timestamp segment for 'cont-<id>'
    #print(true_url, video_name)
    vd = {
        'name' : video_name,
        'url' : true_url
    }
    vd_list.append(vd)
def store(vd):
    url = vd['url']
    name = vd['name']
    dat = requests.get(url=url, headers=headers).content
    print(name, 'downloading')
    with open(name, 'wb') as f:
        f.write(dat)
    print(name, 'done')
pool = Pool(4)
pool.map(store, vd_list)
pool.close()
pool.join()
selenium module
Chrome is used as the example browser
Installing the driver
Install the chromedriver that matches your Chrome version
Link
Basic usage
On Windows
from selenium import webdriver
from lxml import etree
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get(url='http://scxk.nmpa.gov.cn:81/xk/')
page_src = browser.page_source
tree = etree.HTML(page_src)
li_list = tree.xpath('//ul[@id="gzlist"]/li')
for li in li_list:
    name = li.xpath('./dl/@title')[0]
    print(name)
time.sleep(5)
browser.quit()
Some common operations
from selenium import webdriver
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.maximize_window()
browser.get(url='https://www.taobao.com/')
time.sleep(1)
search_input = browser.find_elements_by_id('q')[0]
search_input.send_keys('ipad')
time.sleep(1)
btn = browser.find_elements_by_css_selector('.btn-search')[0]
btn.click()
time.sleep(1)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(2)
browser.get('https://dyhgo.fun')
time.sleep(5)
browser.back()
time.sleep(2)
browser.forward()
time.sleep(2)
time.sleep(3)
browser.quit()
Headless browser
#PhantomJS also works as a headless browser
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
browser.get(url='https://dyhgo.fun')
page_src = browser.page_source
time.sleep(2)
print(page_src)
browser.quit()
Evading detection
For Chrome <= 79
from selenium import webdriver
import time
from selenium.webdriver import ChromeOptions
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
browser = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
browser.get(url='https://dyhgo.fun')
page_src = browser.page_source
time.sleep(2)
print(page_src)
browser.quit()
For Chrome > 79
from selenium import webdriver
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
browser.get(url='https://dyhgo.fun')
page_src = browser.page_source
time.sleep(2)
print(page_src)
browser.quit()
scrapy framework
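The notes end at this heading; as a minimal placeholder, a Scrapy spider sketch against the Scrapy tutorial site (spider name, output file and selectors are only illustrative). Save it as quotes_spider.py and run it with: scrapy runspider quotes_spider.py -o quotes.json
#Minimal Scrapy spider sketch (illustrative only)
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        #yield one item per quote on the page
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        #follow the pagination link, if any
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)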
HHU crawlers
(no longer maintained)
HHU health check-in script (holiday edition)
HHU grade-lookup script (new academic affairs system)
Click here