Study notes on 崔庆才 (Cui Qingcai)'s web-scraping course.

I. Scraping in Practice

1. Scraping mobile Weibo

The mobile-site search endpoint: http://weibo.cn/search/mblog

2. Starting the cookies pool

(1) Start the Redis service.

(2) In cmd, run python import.py and enter the Weibo account names and passwords to import them into the database.

(3) In cmd, run python run.py to start the cookies-pool service at localhost:5000. A quick check that the pool is serving cookies is sketched below.
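
With the pool running, it can be sanity-checked before starting the spider. A minimal sketch, assuming the pool exposes the /weibo/random endpoint that settings.py below points at:

import requests

# Ask the cookies pool for one random, valid cookie set.
# The URL matches COOKIES_POOL_URL in settings.py.
resp = requests.get('http://localhost:5000/weibo/random')
if resp.status_code == 200:
    print(resp.json())  # a dict of cookie name -> value
else:
    print('Cookies pool not ready:', resp.status_code)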

3. Hands-on: the weibosearch project

weibosearch / scrapy.cfg

[settings]
default = weibo.settings

[deploy]
#url = http://localhost:6800/
project = weibo
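
For reference, a project skeleton like the one in the following listings can be generated with Scrapy's command-line tool: scrapy startproject weibo to create the project, then scrapy genspider search weibo.cn inside it to create the search spider.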

weibosearch / weibo / items.py

# -*- coding: utf-8 -*-

from scrapy import Item, Field


class WeiboItem(Item):
    table_name = 'weibo'  # MongoDB collection (table) name

    id = Field()
    content = Field()
    forward_count = Field()
    comment_count = Field()
    like_count = Field()
    posted_at = Field()
    url = Field()
    user = Field()
    crawled_at = Field()  # time the post was crawled
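
Note that the field names above deliberately mirror the local variable names in SearchSpider.parse_detail (see search.py below); that is what lets the spider fill the item generically via eval(field).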

weibosearch / weibo / middlewares.py

# -*- coding: utf-8 -*-

import json
import logging

import requests
from requests.exceptions import ConnectionError
from scrapy.exceptions import IgnoreRequest


class CookiesMiddleWare():
    def __init__(self, cookies_pool_url):
        self.logger = logging.getLogger(__name__)
        self.cookies_pool_url = cookies_pool_url

    def _get_random_cookies(self):  # fetch a random cookie set from the pool
        try:
            response = requests.get(self.cookies_pool_url)
            if response.status_code == 200:
                return json.loads(response.text)
        except ConnectionError:
            return None

    def process_request(self, request, spider):  # attach a cookie set to the outgoing request
        cookies = self._get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('Using Cookies ' + json.dumps(cookies))
        else:
            self.logger.debug('No Valid Cookies')

    @classmethod
    def from_crawler(cls, crawler):  # read the cookies-pool URL from settings
        return cls(
            cookies_pool_url=crawler.settings.get('COOKIES_POOL_URL')
        )

    def process_response(self, request, response, spider):  # on a 3xx response, refresh cookies and retry
        if response.status in [300, 301, 302, 303]:
            try:
                redirect_url = response.headers['location'].decode('utf-8')  # header values are bytes in Scrapy
                if 'login.weibo' in redirect_url or 'login.sina' in redirect_url:  # redirected to login: the cookies have expired
                    self.logger.warning('Updating Cookies')
                elif 'weibo.cn/security' in redirect_url:  # redirected to the security page: the account is locked
                    self.logger.warning('Now Cookies ' + json.dumps(request.cookies))
                    self.logger.warning('One Account is locked!')
                request.cookies = self._get_random_cookies()  # pick a fresh cookie set
                self.logger.debug('Using Cookies ' + json.dumps(request.cookies))
                return request  # reschedule the request
            except Exception:
                raise IgnoreRequest
        elif response.status in [414]:  # 414 (Request-URI Too Long): retry the request
            return request
        else:
            return response  # normal case: pass the response through
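
The middleware can also be exercised outside of a full crawl to confirm the pool wiring. A minimal sketch, assuming the cookies pool is running and the project package is importable as weibo:

from scrapy.http import Request
from weibo.middlewares import CookiesMiddleWare

mw = CookiesMiddleWare(cookies_pool_url='http://localhost:5000/weibo/random')
req = Request('http://weibo.cn/search/mblog')
mw.process_request(req, spider=None)  # attaches a random cookie set if the pool returns one
print(req.cookies)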

weibosearch / weibo / pipelines.py

# -*- coding: utf-8 -*-

import re
import time

import pymongo

from weibo.items import WeiboItem


class WeiboPipeline(object):
    def parse_time(self, datetime):  # normalize the posted time to "YYYY年MM月DD日 HH:MM"
        if re.match(r'\d+月\d+日', datetime):  # "x月x日 HH:MM" -> prepend the current year
            datetime = time.strftime('%Y年', time.localtime()) + datetime
        if re.match(r'\d+分钟前', datetime):  # "x分钟前" (x minutes ago) -> compute the absolute time
            minute = re.match(r'(\d+)', datetime).group(1)
            datetime = time.strftime('%Y年%m月%d日 %H:%M', time.localtime(time.time() - float(minute) * 60))
        if re.match('今天.*', datetime):  # "今天 HH:MM" (today) -> prepend today's date
            datetime = re.match('今天(.*)', datetime).group(1).strip()
            datetime = time.strftime('%Y年%m月%d日', time.localtime()) + ' ' + datetime
        return datetime

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            if item.get('content'):
                item['content'] = item['content'].lstrip(':').strip()  # clean up the post text
            if item.get('posted_at'):
                item['posted_at'] = item['posted_at'].strip()
                item['posted_at'] = self.parse_time(item.get('posted_at'))  # normalize the time
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):  # read the MongoDB settings
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):  # connect to MongoDB when the spider opens
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):  # close the connection when the spider closes
        self.client.close()

    def process_item(self, item, spider):  # upsert by post id, so re-crawled posts are de-duplicated
        self.db[item.table_name].update_one({'id': item.get('id')}, {'$set': dict(item)}, upsert=True)
        return item
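
Because MongoPipeline.process_item upserts on the post id, re-crawling the same post updates the stored document instead of inserting a duplicate. A quick way to inspect the stored data, assuming the default localhost MongoDB configured in settings.py:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['weibo']
print(db['weibo'].count_documents({}))  # number of stored posts
print(db['weibo'].find_one())           # one sample document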

weibosearch / weibo / settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'weibo'

SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    'weibo.middlewares.CookiesMiddleWare': 543,
}

ITEM_PIPELINES = {
    'weibo.pipelines.WeiboPipeline': 300,
    'weibo.pipelines.MongoPipeline': 301,
}

COOKIES_POOL_URL = 'http://localhost:5000/weibo/random'  # cookies-pool service endpoint

MONGO_URI = 'localhost'
MONGO_DATABASE = 'weibo'

weibosearch / weibo / spiders / search.py

# -*- coding: utf-8 -*-

import re

from scrapy import Spider, Request, FormRequest

from weibo.items import WeiboItem


class SearchSpider(Spider):
    name = "search"
    allowed_domains = ["weibo.cn"]
    search_url = 'http://weibo.cn/search/mblog'
    max_page = 200  # maximum number of result pages to request
    keywords = ['000001']  # search keywords; the example is a stock code

    def start_requests(self):  # initial requests: one POST per keyword and page
        for keyword in self.keywords:
            url = '{url}?keyword={keyword}'.format(url=self.search_url, keyword=keyword)  # build the search URL
            for page in range(self.max_page + 1):
                data = {
                    'mp': str(self.max_page),  # mp: total number of pages
                    'page': str(page)  # page to request
                }
                yield FormRequest(url, callback=self.parse_index, formdata=data)  # callback: parse_index

    def parse_index(self, response):  # parse a search-result (index) page
        weibos = response.xpath('//div[@class="c" and contains(@id, "M_")]')  # list of posts on the page
        print(len(weibos), weibos)
        for weibo in weibos:
            is_forward = bool(weibo.xpath('.//span[@class="cmt"]').extract_first())  # is it a repost?
            if is_forward:
                # for reposts, follow the link to the original post's comments
                detail_url = weibo.xpath('.//a[contains(., "原文评论[")]//@href').extract_first()
            else:
                # otherwise follow the post's own comments link
                detail_url = weibo.xpath('(.//a[contains(., "评论[")]/@href)').extract_first()
            yield Request(detail_url, callback=self.parse_detail)  # callback: parse_detail

    def parse_detail(self, response):  # parse a post's detail page
        url = response.url
        content = ''.join(response.xpath('//div[@id="M_"]//span[@class="ctt"]//text()').extract())  # post text is split across nodes
        id = re.search(r'comment/(.*?)\?', response.url).group(1)
        comment_count = response.xpath('//span[@class="pms"]//text()').re_first(r'评论\[(.*?)\]')
        forward_count = response.xpath('//a[contains(., "转发[")]//text()').re_first(r'转发\[(.*?)\]')
        like_count = response.xpath('//a[contains(., "赞[")]//text()').re_first(r'赞\[(.*?)\]')
        posted_at = response.xpath('//div[@id="M_"]//span[@class="ct"]//text()').extract_first(default=None)
        user = response.xpath('//div[@id="M_"]/div[1]/a/text()').extract_first()
        weibo_item = WeiboItem()
        for field in weibo_item.fields:  # field names match the local variable names above
            try:
                weibo_item[field] = eval(field)  # eval looks up the local variable with the same name
            except NameError:
                print('Field is Not Defined', field)
        yield weibo_item
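
With Redis, the cookies pool, and MongoDB running, the crawl is started from the project directory with scrapy crawl search (the spider name comes from the name attribute above); scraped posts end up in the weibo collection.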

Continuously updated…
