Study notes on Cui Qingcai's web crawler course.

I. The urllib Library in Detail

1. The urllib package

  • urllib.request: the request module
  • urllib.error: the exception handling module
  • urllib.parse: the URL parsing module (splitting, joining, etc.)
  • urllib.robotparser: the robots.txt parsing module

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
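
Besides url and data, urlopen also accepts a timeout and an SSL context; the following is a minimal sketch of these two keyword arguments (the URL and values here are only illustrative):

# a minimal sketch of the timeout and context keyword arguments
import ssl
import urllib.request

context = ssl.create_default_context()  # default certificate-verifying SSL context
response = urllib.request.urlopen('https://www.python.org', timeout=5, context=context)
print(response.status)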

# GET request
import urllib.request
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))
# POST request
import urllib.request
import urllib.parse

data = bytes(urllib.parse.urlencode({'word':'hello'}),encoding='utf8')
response = urllib.request.urlopen('http://httpbin.org/post',data=data)
print(response.read())
# timeout parameter
import urllib.request

response = urllib.request.urlopen('http://httpbin.org/get',timeout=1)
print(response.read())
# catching the error raised on timeout
import socket
import urllib.request
import urllib.error

try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

2. Responses

# response type
import urllib.request

response = urllib.request.urlopen('https://www.python.org')
print(type(response))
# status code and response headers
import urllib.request

response = urllib.request.urlopen('https://www.python.org')

print(response.status)
print(response.getheaders())
print(response.getheader('Server'))  # a single header value can be fetched with getheader()

3. The Request object

import urllib.request

request = urllib.request.Request('https://www.python.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
# adding headers and form data to a Request
import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)',
    'Host': 'httpbin.org'
}
data_dict = {'name': 'Germey'}
data = bytes(urllib.parse.urlencode(data_dict), encoding='utf8')

request = urllib.request.Request(url=url,headers=headers,data=data,method='POST')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
# another way to add headers
import urllib.request
import urllib.parse

url = 'http://httpbin.org/post'
data_dict = {'name': 'Germey'}
data = bytes(urllib.parse.urlencode(data_dict), encoding='utf8')

request = urllib.request.Request(url=url, data=data, method='POST')
request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64)')  # note: the key and value are separate arguments, separated by a comma
request.add_header('Host', 'httpbin.org')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

4. Handlers

https://docs.python.org/3/library/urllib.request.html#module-urllib.request

# proxy handler
import urllib.request

proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',
    'https': 'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
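
If the same proxy should apply to every request, the opener can also be installed globally so that plain urllib.request.urlopen goes through it; a minimal, self-contained sketch (the local proxy address is assumed, as above):

# install the opener globally so urlopen also uses the proxy
import urllib.request

proxy_handler = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:9743'})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)  # from now on, urlopen uses this opener
response = urllib.request.urlopen('http://www.baidu.com')
print(response.read().decode('utf-8'))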

5. Cookies

A cookie is a small piece of text stored on the client side that records the user's identity and keeps a login session alive.

# retrieving cookies
import urllib.request
import http.cookiejar

cookie = http.cookiejar.CookieJar()  # create a CookieJar to hold the cookies
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)
# saving cookies to a text file (Mozilla format)
import urllib.request
import http.cookiejar

filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # save the cookies
# another cookie storage format (LWP)
import urllib.request
import http.cookiejar

filename = 'cookie_LWP.txt'
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)  # save the cookies
# load cookies with the same format they were saved in
import urllib.request
import http.cookiejar

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie_LWP.txt',ignore_discard=True,ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

6. Exception handling

https://docs.python.org/3/library/urllib.error.html#module-urllib.error

from urllib import request
from urllib import error

try:
    response = request.urlopen('http://www.cuiqingcai.com/index.html')
except error.URLError as e:
    print(e.reason)
# exceptions that can be caught
from urllib import request
from urllib import error

try:
    response = request.urlopen('http://www.cuiqingcai.com/index.html')
except error.HTTPError as e:  # catch the subclass exception first
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:  # then catch the parent-class exception
    print(e.reason)
else:
    print('Request Successfully')
# check what caused the exception
from urllib import request
from urllib import error
import socket

try:
    response = request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('Time Out')

7. URL parsing

https://docs.python.org/3/library/urllib.parse.html#module-urllib.parse

(1)urlparse

Splits a URL into its component parts (scheme, netloc, path, params, query, fragment) and assigns each one in turn.

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

from urllib.parse import urlparse

result = urlparse('https://www.baidu.com/s?wd=urllib&ie=UTF-8')  # split the URL into 6 components
print(type(result))
print(result)

result1 = urlparse('www.baidu.com/s?wd=urllib&ie=UTF-8', scheme='https')  # no scheme in the URL, so the scheme argument is used
print(result1)

result2 = urlparse('http://www.baidu.com/s?wd=urllib&ie=UTF-8', scheme='https')  # the URL already has a scheme, so the scheme argument is ignored
print(result2)

result3 = urlparse('http://www.baidu.com/s?wd=urllib&ie=UTF-8#comment', allow_fragments=True)  # allow_fragments=True keeps the fragment separate
print(result3)

result4 = urlparse('http://www.baidu.com/s?wd=urllib&ie=UTF-8#comment', allow_fragments=False)  # allow_fragments=False merges the fragment into the preceding component
print(result4)
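
The object urlparse returns is a named tuple, so its six components can be read by attribute name or by index; a small self-contained sketch:

# accessing the parsed components by attribute or by index
from urllib.parse import urlparse

result = urlparse('https://www.baidu.com/s?wd=urllib&ie=UTF-8')
print(result.scheme, result.netloc, result.path, result.query)  # attribute access
print(result[0], result[1])  # the same scheme and netloc by position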

(2)urlunparse

The inverse of urlparse: it assembles a URL from its components.

from urllib.parse import urlunparse

# data takes the six components in the same order urlparse produces them; note: even empty components must be included, otherwise an error is raised
data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
result = urlunparse(data)
print(result)

(3)urljoin

Joins a base URL with another (possibly relative) URL.

from urllib.parse import urljoin

result = urljoin('http://www.baidu.com', 'FQA.html')  # normal join: the relative path is resolved against the base URL
print(result)

result1 = urljoin('http://www.baidu.com', 'http://www.taobao.com/FQA.html')  # the second URL is absolute, so it overrides the base
print(result1)

result2 = urljoin('http://www.baidu.com/about', 'https://www.taobao.com/FQA.html')  # again the later, absolute URL takes precedence
print(result2)

(4)urlencode

Converts a dict into GET request parameters (a query string).

from urllib.parse import urlencode

params = {
    'name': 'Arise',
    'age': '21'
}
base_url = 'http://www.baidu.com?'

url = base_url + urlencode(params)
print(url)
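
urlencode also percent-encodes non-ASCII parameter values (UTF-8 by default), which matters when a query contains Chinese; a small illustrative sketch:

from urllib.parse import urlencode

params = {'wd': '爬虫'}
print('http://www.baidu.com/s?' + urlencode(params))  # wd=%E7%88%AC%E8%99%AB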

8. robotparser

Used to parse robots.txt (just for awareness).

https://docs.python.org/3/library/urllib.robotparser.html#module-urllib.robotparser

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://www.musi-cal.com/robots.txt")
rp.read()
rrate = rp.request_rate("*")
print(rrate.requests)   # 3
print(rrate.seconds)    # 20
print(rp.crawl_delay("*"))  # 6
print(rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco"))  # False
print(rp.can_fetch("*", "http://www.musi-cal.com/"))  # True

To be continued…

Last updated: 2018-08-14 17:09

Original link: http://pythonfood.github.io/2018/07/02/爬虫-urllib库/
