
Python Web Scraping

The urllib Module

Not used here; the examples in this post use requests instead.
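
Still, for reference, a minimal sketch of fetching a page with the standard-library urllib; the URL simply reuses the blog address from the requests example below.

# Minimal urllib sketch, for comparison only
from urllib import request

url = 'https://dyhgo.fun'
with request.urlopen(url) as resp:        # returns an http.client.HTTPResponse
    html = resp.read().decode('utf-8')    # bytes -> str
print(html[:200])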

The requests Module

Getting a Page's Source

import requests
url = 'https://dyhgo.fun'
response = requests.get(url)
print(response.text)

URL Parameters and User-Agent Spoofing

# Test a URL with query parameters and User-Agent spoofing
import requests
url = 'https://www.baidu.com/s'
name = input('Enter a search term; the source of the Baidu results page will be printed: ')
params = {
    'wd' : name
}
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, params=params, headers=headers)
print(response.text)

Example: Youdao Translate Word Lookup

# Fetch a Youdao Translate result: look up the word 'look' and save it
import requests
import json
params = {
    'i': 'look',
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '16119129043609',
    'sign': '288cdf16af5fa68411381ba3c9f7f874',
    'lts': '1611912904360',
    'bv': '44a53b4124e8b822ebfd881c5a599938',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME'
}
url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.post(url=url, data=params, headers=headers)
f = open('look.json', 'w', encoding='utf-8')
json.dump(response.json(), fp=f, ensure_ascii=False)
f.close()
print(response.json())

Example: Fetching a Douban Movie List

Site link

# Fetch a Douban movie list
import requests
import json
url = 'https://movie.douban.com/j/chart/top_list'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
params = {
    'type': '24',
    'interval_id': '100:90',
    'action':'',
    'start': '1',
    'limit': '2'
}
response = requests.get(url=url, params=params, headers=headers)
list_data = response.json()
f = open('douban.json', 'w', encoding='utf-8')
json.dump(list_data, fp=f, ensure_ascii=False)
f.close()

Example: KFC Restaurant Information

Site link

# KFC restaurant information
import requests
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
params = {
    'cname':'',
    'pid':'',
    'keyword': '上海',
    'pageIndex': '1',
    'pageSize': '10'
}
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.post(url=url, data=params, headers=headers)
print(response.text)

Example: NMPA (Drug Administration) Data

Note that the data is loaded via Ajax.

Site link

# NMPA (drug administration) related data
import requests
import json
url1 = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
data = {
    'on': 'true',
    'page': '1',
    'pageSize': '15',
    'productName':'',
    'conditionType': '1',
    'applyname':'',
    'applysn':''
}
id_list = []
response = requests.post(url=url1, data=data, headers=headers).json()['list']
for i in response:
    id_list.append(i['ID'])
#print(id_list)

detail_list = []
url2 = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {
        'id' : id
    }
    response = requests.post(url=url2, data=data, headers=headers).json()
    detail_list.append(response)
    #print(response)
with open('yaojianju.json', 'w', encoding='utf-8') as f:
    json.dump(detail_list, fp=f, ensure_ascii=False, indent=True)
print('over')

Data Parsing

Regular Expressions

See here for basic regex usage.
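
As a small, self-contained refresher of the re.findall + re.S pattern the examples below rely on (the toy HTML string is made up for illustration):

import re

html = '<div class="thumb">\n<img src="//pic.example.com/a.jpg" alt="demo">\n</div>'  # toy input
# a non-greedy group captures the src value; re.S lets '.' also match newlines
print(re.findall('<img src="(.*?)" alt', html, re.S))   # ['//pic.example.com/a.jpg']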

Example: Downloading a Qiushibaike Image

Site link

# Download an image from Qiushibaike
import requests
url = 'https://pic.qiushibaike.com/system/pictures/12402/124029001/medium/J8FFRWUK30TM3X79.jpg'
header = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=header)
image = response.content
with open('qiutu.jpg', 'wb') as f:
    f.write(image)
print('over')

Example: Batch-downloading Qiushibaike Images

Site link

# Download images with a regular expression (supports batch download)
# <div class="thumb">
#
# <a href="/article/124031033" target="_blank">
# <img src="//pic.qiushibaike.com/system/pictures/12403/124031033/medium/V35P60R0KM4YCY3Q.jpg" alt="糗事#124031033" class="illustration" width="100%" height="auto">
# </a>
# </div>

import requests
import re
url = 'https://www.qiushibaike.com/imgrank/'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
#print(response)

regex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'

img_list = re.findall(regex, response, re.S)
#print(img_list)
img_url = 'https:' + img_list[0]
#print(img_url)
response = requests.get(url=img_url, headers=headers).content
#print(type(response))
with open('qiutu2.jpg', 'wb') as f :
    f.write(response)
print('download successfully')

bs4

Parsing a Local HTML File with BeautifulSoup

# Parse a local HTML file with BeautifulSoup
from bs4 import BeautifulSoup
with open('test.html', 'r', encoding='utf-8') as f:
    bs = BeautifulSoup(f, 'lxml')
    print(bs)

Parsing Freshly Fetched HTML with BeautifulSoup

# Parse HTML fetched over the network with BeautifulSoup
from bs4 import BeautifulSoup
import requests
url = 'https://dyhgo.fun'
print(BeautifulSoup(requests.get(url=url).text, 'lxml'))

Using BeautifulSoup

# Using BeautifulSoup; the API is fairly involved
from bs4 import BeautifulSoup
with open('forbsfor.html', 'r', encoding='utf-8') as f:
    bs = BeautifulSoup(f, 'lxml')
print(bs)
print(bs.a) # print the first <a> tag
print(bs.div)
print(bs.find('div'))   #same as bs.div
print(bs.find('div', class_='header-title'))    # find a tag by attribute; note the trailing underscore in class_
print(bs.find('div', id = 'menu-toggle-mobile'))
print(bs.find_all('a')) #get a list of all 'a' tags

#css selector
print(bs.select('.page-item'))  # a list of tags whose class is 'page-item'
print(bs.select('.header-wrapper > .header-title > a[0]'))  # wrong: the [0] index belongs outside select(), not inside the CSS selector
print(bs.select('.header-wrapper > .header-title > a')[0]) # '>' selects direct children; select() returns a list of matching <a> tags
print(bs.select('.header-wrapper a')[0])    # a space selects descendants at any depth

#get context or attribute
print(bs.select('.header-wrapper a')[0].text)
print(bs.select('.header-wrapper a')[0].get_text())
print(bs.select('.header-wrapper a')[0].string)
print(bs.select('.header-title')[0].text)   # .text returns all text inside the tag, including children
print(bs.select('.header-title')[0].get_text())
print(bs.select('.header-title')[0].string) # .string returns only the tag's direct text (None if it has several children)

print(bs.select('.header-wrapper > .header-title > a')[0]['title'])

Example: Scraping the Text of Romance of the Three Kingdoms

Site link

# Scrape the text of Romance of the Three Kingdoms
import requests
from bs4 import BeautifulSoup
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}

response = requests.get(url=url, headers=headers).text
bs = BeautifulSoup(response, 'lxml')
#mulu = bs.find('div', class_='book-mulu')
a_list = bs.select('.book-mulu > ul > li > a')
#print(a_list)
f = open('sanguo.txt', 'a', encoding='utf-8')
for a in a_list[:3]:
    title = a.string.encode('iso-8859-1').decode('utf-8')  # fix mojibake: the index page was decoded with the wrong charset
    #print(title)
    url_detail = 'https://www.shicimingju.com/' + a['href']
    #print(url_detail)
    #response_detail = requests.get(url=url_detail, headers=headers).text.encode('iso-8859-1')
    # NOTE: letting requests detect the encoding (as below) is the more general fix
    response_detail = requests.get(url=url_detail, headers=headers)
    response_detail.encoding = response_detail.apparent_encoding
    response_detail = response_detail.text


    bs = BeautifulSoup(response_detail, 'lxml')

    content_detail = bs.find('div', class_='chapter_content').text  # NOTE: .text, not .string (the div has child nodes)

    #print(type(content_detail))

    f.write(title + ':' + content_detail + '\n')
    #print(content_detail)

    print(title + ' downloaded successfully')

f.close()

XPath

Very commonly used.

Basic Usage

from lxml import etree
import requests
url = 'https://dyhgo.fun'
response = requests.get(url=url).text
tree = etree.HTML(response)     # etree.HTML() parses a string and needs no explicit HTMLParser
print(tree.xpath('/html/head/title/text()'))

XPath Syntax

from lxml import etree
parser = etree.HTMLParser()
tree = etree.parse('forbsfor.html', parser=parser)
r = tree.xpath('/html/head/meta')   # returns a list
r = tree.xpath('/html//meta')       # '//' matches at any depth below
r = tree.xpath('//div')             # all div tags anywhere in the document
r = tree.xpath('//div[@class = "header-wrapper"]')      # with an attribute condition
r = tree.xpath('(//div[@class = "header-wrapper"])[2]')    # the second matching div; NOTE: XPath indices start at 1
r = tree.xpath('(//div[@class = "header-wrapper"])[1]/div/a/text()')    # gets 'DYH', but does not work if '/div/' is replaced with '//'
r = tree.xpath('(//div[@class = "header-title"])[1]//text()')       # returns ['\r\n            ', 'DYH', '\r\n        ']: '//text()' is all nested text, '/text()' only direct text
r = tree.xpath('(//div[@class = "header-title"])[1]/a/@title')      # gets the title attribute of the <a>
print(r)

Example: 58.com Second-hand Housing Listings

Site link

# Scrape 58.com second-hand housing listings; note: the listings shown by the site vary between requests

import requests

from lxml import etree
url = 'https://bj.58.com/ershoufang/'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
div_list = tree.xpath('(//section[@class = "list"])[1]/div')
#print(div_list)
for div in div_list:
    title = div.xpath('./a/div[@class = "property-content"]//h3/@title')[0]     # './' is relative to the current div ('..' would step up to the parent)
    print(title)

XPath OR Operator (Example: Getting All City Names)

Site link

# All city names; the '|' operator unions (ORs) two XPath expressions
import requests
from lxml import etree
url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
a_list = tree.xpath('//div[@class = "bottom"]/ul/li/a | //div[@class = "bottom"]/ul/div[2]/li/a')
city_name = []
for a in a_list:
    city_name.append(a.xpath('./text()')[0])
print(city_name, len(city_name))

CAPTCHA Recognition

Using the Chaojiying (超级鹰) API

Link

Download the Python version of the API here

Usage example:

import requests
from lxml import etree
import chaojiying_Python.chaojiying as rec
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
response = requests.get(url=url, headers=headers).text
tree = etree.HTML(response)
img_url = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
response = requests.get(url=img_url, headers=headers).content
with open('chaojiying_Python/b.png', 'wb') as f:
    f.write(response)
print(rec.main())

Cookies (Simulated Login)

Example: Logging in to Nowcoder

# a Session object keeps cookies across requests
import requests
url = 'https://www.nowcoder.com/nccommon/login/do?token='
session = requests.Session()
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
}
data = {
    'email': 'xxxxxxx',         # login account (phone number or email)
    'remember': 'false',
    'cipherPwd': 'xxxxxxxxx'    # encrypted password (can be captured from the login request in the browser's dev tools)
}
response = session.post(url=url, headers=headers, data=data)
print(response.status_code)
page_src = response.text
profile_url = 'https://ac.nowcoder.com/sns/message/90625985/conversation-list?#/'
profile_response = session.get(url=profile_url, headers=headers)
print(profile_response.status_code)

profile_src = profile_response.text
with open('wangye2.html', 'w', encoding='utf-8') as f:
     f.write(profile_src)

Proxy IPs

Proxy IP sites (most of them are unreliable)

GitHub repository for a proxy IP pool

# use a proxy IP
import requests
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
}
# the proxies key must match the URL scheme ('http' here, since the target URL is http://)
src = requests.get(url='http://www.baidu.com/s?wd=ip', headers=headers, proxies={'http': 'http://183.166.102.222:9999'}).text

Asynchronous Crawling

Thread Pool (via multiprocessing.dummy)

from multiprocessing.dummy import Pool
import time
def tst(name):
    print(name, 'doing')
    time.sleep(2)
    print(name, 'done')

pool = Pool(3)
sta = time.time()
lis = ['aa', 'bb', 'cc', 'dd']
pool.map(tst, lis)
ed = time.time()
print(ed - sta)  # ~4 s: four 2-second tasks on a pool of 3 workers run in two batches
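
The pool above uses threads; for genuinely asynchronous crawling, a coroutine-based sketch with asyncio and aiohttp is shown below. aiohttp is an extra dependency not used elsewhere in this post, and the URL just reuses the blog address from earlier examples.

# Coroutine-based fetching with asyncio + aiohttp (pip install aiohttp)
import asyncio
import aiohttp

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    urls = ['https://dyhgo.fun'] * 3
    async with aiohttp.ClientSession() as session:
        # gather runs all fetches concurrently on a single thread
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
    print([len(p) for p in pages])

asyncio.run(main())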

Example: Downloading Videos from Pearvideo (梨视频)

Site link

# The JSON API returns a fake URL like the second one below; replacing the timestamp
# segment with 'cont-<video_id>' yields the real, playable URL (first line):
#   https://video.pearvideo.com/mp4/third/20210201/cont-1718821-15765543-134934-hd.mp4
#   https://video.pearvideo.com/mp4/third/20210201/1612279360957-15765543-134934-hd.mp4

#pearvideo
import requests
from lxml import etree
import random
from multiprocessing.dummy import Pool
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
url = 'https://www.pearvideo.com/category_5'
page_src = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_src)
li_list = tree.xpath('//ul[@id = "listvideoListUl"]/li')
vd_list = []
for li in li_list:
    if li == li_list[0]:
        continue
    #detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    video_id = li.xpath('./div/a/@href')[0][6:]
    video_name = li.xpath('./div/a/div[@class = "vervideo-title"]/text()')[0] + '.mp4'
    #print(video_id, video_name)

    detail_url = 'https://www.pearvideo.com/videoStatus.jsp'
    params = {
        'contId' : video_id,
        'mrd' : str(random.random())
    }
    new_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        'Referer' : 'https://www.pearvideo.com/video_' + video_id
    }
    video_dict = requests.get(url=detail_url, headers=new_headers, params=params).json()
    raw_url = video_dict['videoInfo']['videos']['srcUrl']
    #print(raw_url, video_name)
    left = raw_url.rfind('/')
    right = raw_url.find('-')
    true_url = raw_url.replace(raw_url[left + 1 : right], 'cont-' + video_id)
    #print(true_url, video_name)
    vd = {


        'name' : video_name,
        'url' : true_url
    }
    vd_list.append(vd)


def store(vd):
    url = vd['url']
    name = vd['name']
    dat = requests.get(url=url, headers=headers).content
    print(name, 'downloading')
    with open(name, 'wb') as f:
        f.write(dat)
    print(name, 'done')

pool = Pool(4)
pool.map(store, vd_list)
pool.close()
pool.join()

The selenium Module

Using Chrome as an example.

Installing the Driver

Install the chromedriver build that matches your Chrome version (a Selenium 4-style setup sketch follows the link below).

Link
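
The examples below pass executable_path, which newer Selenium releases (4.x) have removed. A sketch of the Service-based equivalent, assuming chromedriver.exe sits next to the script:

# Selenium 4 style: pass the driver path via a Service object
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service('chromedriver.exe')     # assumed location; adjust to your driver path
browser = webdriver.Chrome(service=service)
browser.get('https://dyhgo.fun')
print(browser.title)
browser.quit()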

Basic Usage

On Windows

from selenium import webdriver
from lxml import etree
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.get(url='http://scxk.nmpa.gov.cn:81/xk/')
page_src = browser.page_source
tree = etree.HTML(page_src)
li_list = tree.xpath('//ul[@id="gzlist"]/li')
for li in li_list:
    name = li.xpath('./dl/@title')[0]
    print(name)

time.sleep(5)
browser.quit()

Common Operations

from selenium import webdriver
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.maximize_window()
browser.get(url='https://www.taobao.com/')
time.sleep(1)
search_input = browser.find_elements_by_id('q')[0]
search_input.send_keys('ipad')
time.sleep(1)
btn = browser.find_elements_by_css_selector('.btn-search')[0]
btn.click()
time.sleep(1)
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
time.sleep(2)
browser.get('https://dyhgo.fun')
time.sleep(5)
browser.back()
time.sleep(2)
browser.forward()
time.sleep(2)
time.sleep(3)
browser.quit()

Headless Browser

#phantomJs also works
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(executable_path='chromedriver.exe', chrome_options=chrome_options)
browser.get(url='https://dyhgo.fun')
page_src = browser.page_source
time.sleep(2)
print(page_src)
browser.quit()

Evading Detection

For Chrome versions <= 79:

from selenium import webdriver
import time
from selenium.webdriver import ChromeOptions
option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
browser = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
browser.get(url='https://dyhgo.fun')
page_src = browser.page_source
time.sleep(2)
print(page_src)
browser.quit()

For Chrome versions > 79:

from selenium import webdriver
import time
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {
          get: () => undefined
        })
      """
    })
browser.get(url='https://dyhgo.fun')
page_src = browser.page_source
time.sleep(2)
print(page_src)
browser.quit()

The scrapy Framework

(scrapy architecture diagram: https://img-blog.csdnimg.cn/20210207161753956.png)
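
The original post stops at the architecture diagram. As a hedged starting point (the project name, spider name, and the quotes.toscrape.com demo site are illustrative, not from the original post), the usual CLI quick start plus a minimal spider looks roughly like this:

# Quick start from a shell:
#   pip install scrapy
#   scrapy startproject demo_project
#   scrapy genspider quotes quotes.toscrape.com
#   scrapy crawl quotes -O quotes.json
#
# Minimal spider (demo_project/spiders/quotes.py):
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # CSS selectors play the same role as the bs4/xpath parsing above
        for q in response.css('div.quote'):
            yield {
                'text': q.css('span.text::text').get(),
                'author': q.css('small.author::text').get(),
            }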

HHU Crawlers

(No longer maintained.)

HHU health check-in script (holiday edition)

HHU grade-checking script (new academic affairs system)

Click here