Study notes from Cui Qingcai's web-scraping course.

I. Crawling in Practice

1. Installation

pip install tushare

2. Official website

http://tushare.org/

3. Command-line demo

import tushare as ts

result = ts.get_hs300s()  # fetch the CSI 300 (沪深300) constituent list
print(result)
print(type(result))       # a pandas DataFrame
result['name'].tolist()   # CSI 300 stock names as a list
result['code'].tolist()   # CSI 300 stock codes as a list
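
Since get_hs300s() returns a pandas DataFrame, the constituent list can be reshaped however you need. As a quick sketch (not part of the original notes), this maps code to name and builds the keyword list that the spider below works from:

import tushare as ts

hs300 = ts.get_hs300s()
code_to_name = dict(zip(hs300['code'].tolist(), hs300['name'].tolist()))  # stock code -> stock name
keywords = list(code_to_name)  # stock codes, later used as Weibo search keywords
print(len(keywords))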

4. Hands-on project

weibostock / scrapy.cfg

[settings]
default = weibo.settings

[deploy]
#url = http://localhost:6800/
project = weibo
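
scrapy.cfg is what scrapy startproject generates; the [deploy] section only matters if you push the project to a scrapyd server. In that case, uncomment the url line and deploy with scrapyd-deploy from the scrapyd-client package, roughly like this (exact flags depend on your scrapyd setup):

pip install scrapyd-client
scrapyd-deploy -p weibo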

weibostock / weibo / items.py

# -*- coding: utf-8 -*-

from scrapy import Item, Field


class WeiboItem(Item):
    table_name = 'weibo'

    id = Field()
    content = Field()
    forward_count = Field()
    comment_count = Field()
    like_count = Field()
    posted_at = Field()
    url = Field()
    user = Field()
    keyword = Field()  # stored on the item so the spider can assign the search keyword to it
    crawled_at = Field()
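
As a quick illustration (not part of the project files): the spider fills these fields by key, and MongoPipeline later reads the plain class attribute table_name as the MongoDB collection name. The values here are placeholders:

from weibo.items import WeiboItem

item = WeiboItem()
item['keyword'] = '600000'        # placeholder stock code, assigned by the spider
item['content'] = 'example text'  # placeholder post content
print(item.table_name)            # 'weibo' -- used by MongoPipeline as the collection name
print(dict(item))                 # {'keyword': '600000', 'content': 'example text'}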

weibostock / weibo / middlewares.py

# -*- coding: utf-8 -*-

import json
import logging
import requests
from requests.exceptions import ConnectionError
from scrapy.exceptions import IgnoreRequest


class CookiesMiddleWare():
    def __init__(self, cookies_pool_url):
        self.logger = logging.getLogger(__name__)
        self.cookies_pool_url = cookies_pool_url

    def _get_random_cookies(self):
        # fetch one random cookie set from the cookies pool
        try:
            response = requests.get(self.cookies_pool_url)
            if response.status_code == 200:
                return json.loads(response.text)
        except ConnectionError:
            return None

    def process_request(self, request, spider):
        cookies = self._get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('Using Cookies ' + json.dumps(cookies))
        else:
            self.logger.debug('No Valid Cookies')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            cookies_pool_url=crawler.settings.get('COOKIES_POOL_URL')
        )

    def process_response(self, request, response, spider):
        if response.status in [300, 301, 302, 303]:
            try:
                # header values are bytes in Scrapy, so decode before matching
                redirect_url = response.headers['location'].decode('utf-8')
                if 'login.weibo' in redirect_url or 'login.sina' in redirect_url:  # cookies have expired
                    self.logger.warning('Updating Cookies')
                elif 'weibo.cn/security' in redirect_url:
                    self.logger.warning('Now Cookies ' + json.dumps(request.cookies))
                    self.logger.warning('One Account is locked!')
                # either way, swap in a fresh cookie set and retry the request
                request.cookies = self._get_random_cookies()
                self.logger.debug('Using Cookies ' + json.dumps(request.cookies))
                return request
            except Exception:
                raise IgnoreRequest
        elif response.status in [414]:
            return request
        else:
            return response
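
The middleware assumes a cookies-pool service is listening at COOKIES_POOL_URL (http://localhost:5000/weibo/random in settings.py below) and answers with a JSON object of cookie name/value pairs. If you do not have the course's cookies-pool project running, a minimal Flask stand-in like this sketch (cookie names and values are placeholders) is enough to exercise the middleware locally:

# fake_cookies_pool.py -- stand-in for the cookies pool, local testing only
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/weibo/random')
def random_cookies():
    # return one set of cookies as a JSON object; values are placeholders
    return jsonify({'SUB': 'placeholder-sub-token', '_T_WM': 'placeholder-token'})

if __name__ == '__main__':
    app.run(port=5000)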

weibostock / weibo / pipelines.py

# -*- coding: utf-8 -*-

import re
import time
import pymongo
from weibo.items import WeiboItem


class WeiboPipeline(object):
    def parse_time(self, datetime):
        # '02月18日 ...' -> prepend the current year
        if re.match(r'\d+月\d+日', datetime):
            datetime = time.strftime('%Y年', time.localtime()) + datetime
        # '5分钟前' -> convert to an absolute timestamp
        if re.match(r'\d+分钟前', datetime):
            minute = re.match(r'(\d+)', datetime).group(1)
            datetime = time.strftime('%Y年%m月%d日 %H:%M', time.localtime(time.time() - float(minute) * 60))
        # '今天 09:15' -> replace '今天' with today's date
        if re.match('今天.*', datetime):
            datetime = re.match('今天(.*)', datetime).group(1).strip()
            datetime = time.strftime('%Y年%m月%d日', time.localtime()) + ' ' + datetime
        return datetime

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            if item.get('content'):
                item['content'] = item['content'].lstrip(':').strip()
            if item.get('posted_at'):
                item['posted_at'] = item['posted_at'].strip()
                item['posted_at'] = self.parse_time(item.get('posted_at'))
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # upsert on the weibo id so re-crawled posts are updated in place
        self.db[item.table_name].update_one({'id': item.get('id')}, {'$set': dict(item)}, upsert=True)
        return item
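
A rough illustration of what WeiboPipeline.parse_time does with the relative timestamps weibo.cn returns (the exact output depends on when it runs):

from weibo.pipelines import WeiboPipeline

pipeline = WeiboPipeline()
pipeline.parse_time('02月18日 10:30')  # -> current year prepended: 'YYYY年02月18日 10:30'
pipeline.parse_time('5分钟前')          # -> absolute 'YYYY年MM月DD日 HH:MM' five minutes before now
pipeline.parse_time('今天 09:15')       # -> today's date plus '09:15'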

weibostock / weibo / settings.py

# -*- coding: utf-8 -*-

BOT_NAME = 'weibo'

SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'

ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    'weibo.middlewares.CookiesMiddleWare': 543,
}

ITEM_PIPELINES = {
    'weibo.pipelines.WeiboPipeline': 300,
    'weibo.pipelines.MongoPipeline': 301,
}

COOKIES_POOL_URL = 'http://localhost:5000/weibo/random'

MONGO_URI = 'localhost'
MONGO_DATABASE = 'weibo'
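
Any of these settings can be overridden per run with Scrapy's -s switch, which is handy for experiments; for example, pointing the search spider defined below at a separate test database (the database name here is just an example):

scrapy crawl search -s MONGO_DATABASE=weibo_test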

weibostock / weibo / spiders / search.py

# -*- coding: utf-8 -*-

import re
from scrapy import Spider, Request, FormRequest
from weibo.items import WeiboItem
import tushare as ts


class SearchSpider(Spider):
    name = "search"
    allowed_domains = ["weibo.cn"]
    search_url = 'http://weibo.cn/search/mblog'
    max_page = 200

    def start_requests(self):
        result = ts.get_sz50s()  # SSE 50 (上证50) constituents
        # result = ts.get_zz500s()  # CSI 500
        # result = ts.get_hs300s()  # CSI 300
        self.keywords = result['code'].tolist()  # stock codes used as search keywords

        for keyword in self.keywords:
            url = '{url}?keyword={keyword}'.format(url=self.search_url, keyword=keyword)
            for page in range(self.max_page + 1):
                data = {
                    'mp': str(self.max_page),
                    'page': str(page)
                }
                # carry the keyword in meta so it can be saved on the item later
                yield FormRequest(url, callback=self.parse_index, formdata=data, meta={'keyword': keyword})

    def parse_index(self, response):
        weibos = response.xpath('//div[@class="c" and contains(@id, "M_")]')
        for weibo in weibos:
            is_forward = bool(weibo.xpath('.//span[@class="cmt"]').extract_first())
            if is_forward:
                detail_url = weibo.xpath('.//a[contains(., "原文评论[")]//@href').extract_first()
            else:
                detail_url = weibo.xpath('(.//a[contains(., "评论[")]/@href)').extract_first()
            if detail_url:
                # forward the keyword from meta to the detail request
                yield Request(response.urljoin(detail_url), callback=self.parse_detail,
                              meta={'keyword': response.meta.get('keyword')})

    def parse_detail(self, response):
        url = response.url
        content = ''.join(response.xpath('//div[@id="M_"]//span[@class="ctt"]//text()').extract())
        id = re.search(r'comment\/(.*?)\?', response.url).group(1)
        comment_count = response.xpath('//span[@class="pms"]//text()').re_first(r'评论\[(.*?)\]')
        forward_count = response.xpath('//a[contains(., "转发[")]//text()').re_first(r'转发\[(.*?)\]')
        like_count = response.xpath('//a[contains(., "赞[")]//text()').re_first(r'赞\[(.*?)\]')
        posted_at = response.xpath('//div[@id="M_"]//span[@class="ct"]//text()').extract_first(default=None)
        user = response.xpath('//div[@id="M_"]/div[1]/a/text()').extract_first()
        keyword = response.meta.get('keyword')  # read the keyword back from meta
        weibo_item = WeiboItem()
        for field in weibo_item.fields:
            try:
                weibo_item[field] = eval(field)  # pick up the local variable with the same name as each field
            except NameError:
                print('Field is Not Defined', field)
        yield weibo_item
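
With MongoDB and the cookies pool running, the crawl is started with the usual Scrapy command, and results land in the weibo collection of the weibo database. The mongo-shell query is just one way to spot-check them:

scrapy crawl search
mongo weibo --eval 'db.weibo.find().limit(1).pretty()'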

To be continued…
