Study notes from 崔庆才's web-scraping course.

一、The Requests Library in Detail

1、What Is the Requests Library

Requests is an HTTP library written in Python, built on top of urllib, and released under the Apache2 Licensed open-source license.

Compared with urllib, Requests is much more convenient: it saves a great deal of work and fully covers HTTP testing needs.
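As a quick illustration of that convenience gap, here is a minimal sketch comparing the same parameterized GET in both libraries (httpbin.org is used as a stand-in test endpoint):

# urllib: encode the query string by hand, then read and decode the body
from urllib import request, parse

params = parse.urlencode({'name': 'asr'})
with request.urlopen('http://httpbin.org/get?' + params) as resp:
    print(resp.read().decode('utf-8'))

# requests: one call; encoding and decoding are handled for you
import requests

print(requests.get('http://httpbin.org/get', params={'name': 'asr'}).text)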

2、Installation

pip install requests
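To confirm the install worked (assuming pip targeted the interpreter you actually run), import the library and print its version:

import requests
print(requests.__version__) # the exact version string will vary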

3、Requests Usage in Detail

# Basic example: issue a GET request and inspect the response
import requests

response = requests.get('http://www.baidu.com')
print(type(response))
print(response.status_code)
print(type(response.text))
print(response.text)
print(response.cookies)
# Various request types (HTTP test site: http://httpbin.org/)
import requests

requests.get('http://httpbin.org/get')
requests.post('http://httpbin.org/post')
requests.delete('http://httpbin.org/delete')
requests.put('http://httpbin.org/put')
requests.head('http://httpbin.org/get')
requests.options('http://httpbin.org/get')
# Basic GET request
import requests

response = requests.get('http://httpbin.org/get')
print(response.text)
# GET request with parameters
import requests

data = {
    'name': 'asr',
    'age': '12'
}
response = requests.get('http://httpbin.org/get',params=data)
print(response.text)
# Parsing JSON
import requests
import json

response = requests.get('http://httpbin.org/get')
print(response.text)
print(type(response.text))
print(response.json())
print(type(response.json()))
print(json.loads(response.text))
print(type(json.loads(response.text)))
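One caveat worth knowing: if the body is not valid JSON, response.json() raises a ValueError (a json.JSONDecodeError in recent versions). A small sketch against an HTML endpoint:

# response.json() fails loudly on non-JSON bodies
import requests

r = requests.get('http://httpbin.org/html')
try:
    print(r.json())
except ValueError:
    print('response body is not JSON')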
# Fetching binary data
import requests

response = requests.get('https://github.com/favicon.ico')
print(response.text)
print(type(response.text))
print(response.content) # use response.content for binary data
print(type(response.content))

with open('favicon.ico', 'wb') as f:
    f.write(response.content) # the with block closes the file automatically
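For large downloads, holding the whole of response.content in memory is wasteful. A sketch of the streaming alternative using stream=True and iter_content:

# stream the body in chunks instead of loading it all at once
import requests

r = requests.get('https://github.com/favicon.ico', stream=True)
with open('favicon.ico', 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024):
        f.write(chunk)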
# Adding headers
import requests


response = requests.get('http://www.zhihu.com/explore') # without headers, the request is rejected
print(response.text)

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; rv:60.0) Gecko/20100101 Firefox/60.0'}
response1 = requests.get('http://www.zhihu.com/explore',headers=headers)
print(response1.text)
# Basic POST request
import requests

data = {
    'name': 'jk',
    'age': 18
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:60.0) Gecko/20100101 Firefox/60.0'
}
response = requests.post('http://httpbin.org/post',data=data,headers=headers)
print(response.json())
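The data parameter sends the dictionary as form data; to post a JSON body instead, requests also accepts a json parameter. A sketch reusing the data and headers defined above:

# json= serializes the dict and sets the Content-Type header to application/json
response = requests.post('http://httpbin.org/post', json=data, headers=headers)
print(response.json()['json']) # httpbin echoes the parsed JSON body back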
# Response attributes
import requests

response = requests.get('http://www.douban.com')
print(type(response.status_code),response.status_code)
print(type(response.headers),response.headers)
print(type(response.cookies),response.cookies)
print(type(response.url),response.url)
print(type(response.history),response.history)
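response.history is the list of redirect responses that led to the final URL; passing allow_redirects=False stops requests from following them. A minimal sketch using an httpbin redirect endpoint:

# inspect redirect handling
import requests

r = requests.get('http://httpbin.org/redirect/1')
print(r.history, r.url) # one 302 response, then the final URL

r = requests.get('http://httpbin.org/redirect/1', allow_redirects=False)
print(r.status_code) # 302: the redirect was not followed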
# Checking status codes
import requests

response = requests.get('http://www.jianshu.com')
# requests.codes is a lookup object mapping names to status codes; see the table below
if response.status_code == requests.codes.forbidden:
    print('403 forbidden')
else:
    exit()
# Informational status codes
100: ('continue',),
101: ('switching_protocols',),
102: ('processing',),
103: ('checkpoint',),
122: ('uri_too_long', 'request_uri_too_long'),

# Success status codes
200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\o/', '✓'),
201: ('created',),
202: ('accepted',),
203: ('non_authoritative_info', 'non_authoritative_information'),
204: ('no_content',),
205: ('reset_content', 'reset'),
206: ('partial_content', 'partial'),
207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),
208: ('already_reported',),
226: ('im_used',),

# Redirection status codes
300: ('multiple_choices',),
301: ('moved_permanently', 'moved', '\\o-'),
302: ('found',),
303: ('see_other', 'other'),
304: ('not_modified',),
305: ('use_proxy',),
306: ('switch_proxy',),
307: ('temporary_redirect', 'temporary_moved', 'temporary'),
308: ('permanent_redirect',
'resume_incomplete', 'resume',), # These 2 to be removed in 3.0

# Client error status codes
400: ('bad_request', 'bad'),
401: ('unauthorized',),
402: ('payment_required', 'payment'),
403: ('forbidden',),
404: ('not_found', '-o-'),
405: ('method_not_allowed', 'not_allowed'),
406: ('not_acceptable',),
407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),
408: ('request_timeout', 'timeout'),
409: ('conflict',),
410: ('gone',),
411: ('length_required',),
412: ('precondition_failed', 'precondition'),
413: ('request_entity_too_large',),
414: ('request_uri_too_large',),
415: ('unsupported_media_type', 'unsupported_media', 'media_type'),
416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),
417: ('expectation_failed',),
418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),
421: ('misdirected_request',),
422: ('unprocessable_entity', 'unprocessable'),
423: ('locked',),
424: ('failed_dependency', 'dependency'),
425: ('unordered_collection', 'unordered'),
426: ('upgrade_required', 'upgrade'),
428: ('precondition_required', 'precondition'),
429: ('too_many_requests', 'too_many'),
431: ('header_fields_too_large', 'fields_too_large'),
444: ('no_response', 'none'),
449: ('retry_with', 'retry'),
450: ('blocked_by_windows_parental_controls', 'parental_controls'),
451: ('unavailable_for_legal_reasons', 'legal_reasons'),
499: ('client_closed_request',),

# Server error status codes
500: ('internal_server_error', 'server_error', '/o\\', '✗'),
501: ('not_implemented',),
502: ('bad_gateway',),
503: ('service_unavailable', 'unavailable'),
504: ('gateway_timeout',),
505: ('http_version_not_supported', 'http_version'),
506: ('variant_also_negotiates',),
507: ('insufficient_storage',),
509: ('bandwidth_limit_exceeded', 'bandwidth'),
510: ('not_extended',),
511: ('network_authentication_required', 'network_auth', 'network_authentication')
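Each name in the table resolves to its number on the lookup object, and raise_for_status() offers a shortcut for error checking. A small sketch (using an httpbin endpoint that returns a fixed status):

# names resolve to numbers; raise_for_status() converts 4xx/5xx into exceptions
import requests

print(requests.codes.ok, requests.codes.not_found) # 200 404

r = requests.get('http://httpbin.org/status/404')
r.raise_for_status() # raises requests.exceptions.HTTPError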

4、Advanced Requests Operations

# File upload
import requests

files = {
    'files': open('favicon.ico', 'rb')
}

response = requests.post('http://httpbin.org/post',files=files)
print(response.text)
# Getting cookies
import requests

response = requests.get('http://www.baidu.com')
print(response.cookies)
for k, v in response.cookies.items():
    print(k + '=' + v)
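Cookies can travel the other way too: pass a dict via the cookies parameter to send them with a request. A sketch (the cookie name and value here are made up):

# send cookies with a request
import requests

cookies = {'number': '123456789'}
r = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(r.text) # httpbin echoes back the cookies it received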
# Session persistence
import requests

requests.get('http://httpbin.org/cookies/set/number/123456789') # sets a cookie for this visit
response = requests.get('http://httpbin.org/cookies') # independent of the request above, so no cookie information comes back
print(response.text)

s = requests.Session() # declare a Session object and issue both GET requests through it (as if from the same browser)
s.get('http://httpbin.org/cookies/set/number/123456789')
r = s.get('http://httpbin.org/cookies')
print(r.text)
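A Session persists more than cookies; it also keeps default headers that are sent with every request it makes. A short sketch:

# headers set on the session apply to every request it issues
s = requests.Session()
s.headers.update({'User-Agent': 'Mozilla/5.0'})
r = s.get('http://httpbin.org/headers')
print(r.text) # the session's User-Agent appears in the echoed headers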
# Certificate verification
import requests

# requesting the HTTPS site directly raises an SSLError
#response = requests.get('https://www.12306.cn')
#print(response.status_code)

# verify=False skips certificate verification, but a warning is still emitted
#response = requests.get('https://www.12306.cn',verify=False)
#print(response.text)

# 1. import urllib3 through requests.packages and disable its warnings
from requests.packages import urllib3
urllib3.disable_warnings()
response = requests.get('https://www.12306.cn',verify=False)
print(response.status_code)

# 2. suppress the warning by capturing it into the logging system
import logging
logging.captureWarnings(True)
response = requests.get('https://www.12306.cn',verify=False)
print(response.status_code)

# 3. supply a local certificate as the client certificate: either a single file (containing key and certificate) or a tuple of two file paths
response = requests.get('https://www.12306.cn',cert=('/path/server.crt', '/path/key'))
print(response.status_code)
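Besides True/False, verify also accepts the path to a trusted CA bundle, which keeps verification on while trusting a custom authority (the path below is hypothetical):

# verify against a custom CA bundle instead of disabling checks
response = requests.get('https://www.12306.cn', verify='/path/to/ca_bundle.crt')
print(response.status_code)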
# Proxy settings
import requests

proxies = {
    "http": "http://10.10.1.10:3128",
    "https": "http://10.10.1.10:1080",
}
requests.get("https://www.taobao.com", proxies=proxies)

# If the proxy needs HTTP Basic Auth, use the http://user:password@host:port syntax.
proxies = {
    "http": "http://user:password@10.10.1.10:3128/",
}
requests.get("https://www.taobao.com", proxies=proxies)

# Besides plain HTTP proxies, requests also supports SOCKS proxies: pip3 install 'requests[socks]'
proxies = {
    'http': 'socks5://user:password@host:port',
    'https': 'socks5://user:password@host:port'
}
requests.get("https://www.taobao.com", proxies=proxies)
# Timeout settings
import requests

r = requests.get('https://www.taobao.com',timeout=1)
print(r.status_code)

# a tuple sets the connect and read timeouts separately
r = requests.get('https://www.taobao.com',timeout=(5.11,30))

# to wait indefinitely, set timeout to None or simply omit the parameter
r = requests.get('https://www.taobao.com',timeout=None)
r = requests.get('https://www.taobao.com')
# Authentication
import requests

r = requests.get('http://120.27.34.24:9001', auth=('username', 'password'))
print(r.status_code)

# the call above is shorthand; under the hood it uses requests.auth.HTTPBasicAuth
from requests.auth import HTTPBasicAuth
r = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('username', 'password'))
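Other schemes plug in through the same auth parameter; for example, HTTP Digest auth via requests.auth.HTTPDigestAuth, sketched here against an httpbin endpoint:

# digest authentication works through the same auth parameter
from requests.auth import HTTPDigestAuth

r = requests.get('http://httpbin.org/digest-auth/auth/user/pass',
                 auth=HTTPDigestAuth('user', 'pass'))
print(r.status_code) # 200 when the credentials match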
# Exception handling
# http://docs.python-requests.org/en/master/api/#exceptions
import requests
from requests.exceptions import ReadTimeout,HTTPError,RequestException

try:
    r = requests.get('https://www.taobao.com', timeout=0.1)
except ReadTimeout:
    print('ReadTimeout')
except HTTPError:
    print('HTTPError')
except RequestException:
    print('RequestException')
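Since every exception in the module inherits from RequestException, listing specific ones first and RequestException last gives a clean most-specific-to-most-general chain. When fine-grained handling is unnecessary, a catch-all sketch works too:

# RequestException serves as a single catch-all; raise_for_status() maps HTTP errors into it
try:
    r = requests.get('http://httpbin.org/status/500', timeout=5)
    r.raise_for_status()
except RequestException as e:
    print('request failed:', e)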

Continuously updated…

Last updated: 2018-08-14 17:14

Original link: http://pythonfood.github.io/2018/07/02/爬虫-requests库/
