Learning notes from 崔庆才 (Cui Qingcai)'s web scraping course.

一、Scraping in Practice

1、Workflow

(1) Fetch the index pages

Use requests to request the target site, obtain the HTML of the index pages, and return the result.

(2) Proxy setup

If a 302 status code comes back, the IP has been blocked; switch to a proxy and retry (see the sketch after step (4)).

(3) Parse the detail pages

Request each detail page and parse out the title, body text, and other fields.

(4) Save the data to a database

Save the structured data to MongoDB.
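
The point of step (2) is that requests follows redirects by default, so a blocked request would silently land on the anti-bot page instead of exposing the 302. A minimal sketch of the check, assuming the proxy comes back from a pool as a bare ip:port string (the fetch name and the pool are illustrative, not part of the course code):

import requests

def fetch(url, proxy=None):
    # Proxies from the pool are bare "ip:port" strings, so add the scheme here
    proxies = {'http': 'http://' + proxy} if proxy else None
    # allow_redirects=False keeps the 302 visible instead of following it
    resp = requests.get(url, proxies=proxies, allow_redirects=False)
    if resp.status_code == 302:
        return None  # IP blocked: the caller should switch to a fresh proxy and retry
    return resp.text if resp.status_code == 200 else None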

2、Implementation

import pymongo
import requests
from requests.exceptions import ConnectionError
from lxml.etree import XMLSyntaxError
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from wxconfig import *

# Database connection
client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]
# Base URL of the Sogou WeChat search
base_url = 'http://weixin.sogou.com/weixin?'
# Request headers. The Cookie is required; without it only the first 10 pages are accessible.
headers = {
    'Cookie': 'SUID=F6177C7B3220910A000000058E4D679; SUV=1491392122762346; ABTEST=1|1491392129|v1; SNUID=0DED8681FBFEB69230E6BF3DFB2F8D6B; ld=OZllllllll2Yi2balllllV06C77lllllWTZgdkllll9lllllxv7ll5@@@@@@@@@@; LSTMV=189%2C31; LCLKINT=1805; weixinIndexVisited=1; SUIR=0DED8681FBFEB69230E6BF3DFB2F8D6B; JSESSIONID=aaa-BcHIDk9xYdr4odFSv; PHPSESSID=afohijek3ju93ab6l0eqeph902; sct=21; IPLOC=CN; ppinf=5|1491580643|1492790243|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToyNzolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOER8Y3J0OjEwOjE0OTE1ODA2NDN8cmVmbmljazoyNzolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOER8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=j7ojfJRegMrYrl96LmzUhNq-RujAWyuXT_H3xZba8nNtaj7NKA5d0ORq-yoqedkBg4USxLzmbUMnIVsCUjFciRnHDPJ6TyNrurEdWT_LvHsQIKkygfLJH-U2MJvhwtHuW09enCEzcDAA_GdjwX6_-_fqTJuv9w9Gsw4rF9xfGf4; sgid=; ppmdig=1491580643000000d6ae8b0ebe76bbd1844c993d1ff47cea',
    'Host': 'weixin.sogou.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
}
# No proxy is used until the first 302 is hit
proxy = None


def get_proxy():
    # Fetch one proxy from the Flask-style proxy pool interface
    try:
        response = requests.get(PROXY_POOL_URL)
        if response.status_code == 200:
            return response.text
        # Non-200 response: no proxy available
        return None
    except ConnectionError:
        return None


def get_html(url, count=1):
    print('Crawling', url)
    print('Try Count', count)
    global proxy
    # Cap the number of attempts per URL
    if count >= MAX_COUNT:
        print('Tried Too Many Counts')
        return None
    try:
        # Use the proxy once one has been set
        if proxy:
            # Prepend the scheme to the bare ip:port taken from the proxy pool
            proxies = {
                'http': 'http://' + proxy
            }
            # allow_redirects=False stops requests from following the redirect,
            # so the 302 status stays visible
            response = requests.get(url, headers=headers, proxies=proxies, allow_redirects=False)
        else:
            response = requests.get(url, headers=headers, allow_redirects=False)
        # Success: return the page source
        if response.status_code == 200:
            return response.text
        if response.status_code == 302:
            # A 302 means the IP has been blocked; switch to a new proxy
            print('302')
            proxy = get_proxy()
            if proxy:
                print('Using Proxy', proxy)
                # Retry the same URL with the new proxy, counting it toward MAX_COUNT
                return get_html(url, count + 1)
            else:
                print('Get Proxy Failed')
                # No proxy available, give up on this URL
                return None
    except ConnectionError as e:
        print('Error Occurred', e.args)
        proxy = get_proxy()
        count += 1
        return get_html(url, count)


def get_index(keyword, page):
    # Build the index-page URL from the search keyword and page number
    data = {
        'query': keyword,
        'type': 2,
        'page': page
    }
    queries = urlencode(data)
    url = base_url + queries
    html = get_html(url)
    return html


def parse_index(html):
    # Yield the article links from the search result list
    doc = pq(html)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        yield item.attr('href')


def get_detail(url):
    # Request an article detail page
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        return None


def parse_detail(html):
    # Extract the structured fields from an article page
    try:
        doc = pq(html)
        title = doc('.rich_media_title').text()
        content = doc('.rich_media_content').text()
        date = doc('#post-date').text()
        nickname = doc('#js_profile_qrcode > div > strong').text()
        wechat = doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        return {
            'title': title,
            'content': content,
            'date': date,
            'nickname': nickname,
            'wechat': wechat
        }
    except XMLSyntaxError:
        return None


def save_to_mongo(data):
    # Upsert keyed on the title so re-running the crawl does not create duplicates
    result = db['articles'].update_one({'title': data['title']}, {'$set': data}, upsert=True)
    if result.acknowledged:
        print('Saved to Mongo', data['title'])
    else:
        print('Saved to Mongo Failed', data['title'])


def main():
    # Number of index pages to crawl
    for page in range(1, 5):
        # Fetch one index page for the configured keyword
        html = get_index(KEYWORD, page)
        # Extract the article links from the index page
        if html:
            article_urls = parse_index(html)
            for article_url in article_urls:
                article_html = get_detail(article_url)
                if article_html:
                    # Parse the article content
                    article_data = parse_detail(article_html)
                    print(article_data)
                    # Save to the database
                    if article_data:
                        save_to_mongo(article_data)


if __name__ == '__main__':
    main()
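
After a run, the saved documents can be spot-checked from Python. A minimal sketch, assuming the same wxconfig settings and the articles collection used above:

import pymongo
from wxconfig import MONGO_URI, MONGO_DB

client = pymongo.MongoClient(MONGO_URI)
db = client[MONGO_DB]
# Count the stored articles and print a few titles to confirm the upserts worked
print(db['articles'].count_documents({}))
for doc in db['articles'].find().limit(3):
    print(doc.get('title'), doc.get('date'))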

wxconfig.py

# coding: utf-8

# URL of the local proxy pool interface used by get_proxy()
PROXY_POOL_URL = 'http://127.0.0.1:5000/get'
# Search keyword passed to the Sogou WeChat index pages
KEYWORD = '风景'
# MongoDB connection settings
MONGO_URI = 'localhost'
MONGO_DB = 'weixin'
# Maximum number of attempts per URL in get_html()
MAX_COUNT = 5
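
get_proxy() only assumes that PROXY_POOL_URL answers a GET request with a single proxy as a plain ip:port string; the proxy pool itself is maintained separately and is not part of these notes. For reference, a minimal Flask stand-in for that interface (the proxy entry below is a placeholder, not a working proxy) could look like this:

# proxy_pool_stub.py - a stand-in for the interface assumed by PROXY_POOL_URL
from flask import Flask

app = Flask(__name__)

# Placeholder entry; a real pool would test and refresh its proxies continuously
PROXIES = ['127.0.0.1:8888']

@app.route('/get')
def get():
    # Return one proxy as plain text, exactly what get_proxy() reads from response.text
    return PROXIES[0]

if __name__ == '__main__':
    app.run(port=5000)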

To be continued…
