一、爬取实战
1、在zhihuuser的项目上修改
首先从github上复制知乎项目到本地git clone https://github.com/Germey/Zhihu.git
。
再在pycharm中打开。
不能在源代码上修改,需要建一个分支git checkout -b distributed
。
切换到分支git branch
。
2、搭建分布式架构
本地项目修改settings.py,引入scrapy-redis,这里redis数据库地址设置的一个阿里云服务器。
zhihu-distributed / scrapy.cfg1
2
3
4
5
6
7[settings]
default = zhihuuser.settings
[deploy]
#url = http://localhost:6800/
url = http://123.206.65.37:6800/addversion.json # 远程主机的url
project = zhihuuser
zhihu-distributed / zhihuuser / items.py1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46# -*- coding: utf-8 -*-
from scrapy import Item, Field
class UserItem(Item):
# define the fields for your item here like:
id = Field()
name = Field()
avatar_url = Field()
headline = Field()
description = Field()
url = Field()
url_token = Field()
gender = Field()
cover_url = Field()
type = Field()
badge = Field()
answer_count = Field()
articles_count = Field()
commercial_question_count = Field()
favorite_count = Field()
favorited_count = Field()
follower_count = Field()
following_columns_count = Field()
following_count = Field()
pins_count = Field()
question_count = Field()
thank_from_count = Field()
thank_to_count = Field()
thanked_count = Field()
vote_from_count = Field()
vote_to_count = Field()
voteup_count = Field()
following_favlists_count = Field()
following_question_count = Field()
following_topic_count = Field()
marked_answers_count = Field()
mutual_followees_count = Field()
hosted_live_count = Field()
participated_live_count = Field()
locations = Field()
educations = Field()
employments = Field()
zhihu-distributed / zhihuuser / middlewares.py1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50# -*- coding: utf-8 -*-
from scrapy import signals
class ZhihuSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
zhihu-distributed / zhihuuser / pipelines.py1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33# -*- coding: utf-8 -*-
import pymongo
class ZhihuPipeline(object):
def process_item(self, item, spider):
return item
class MongoPipeline(object):
collection_name = 'users'
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self, spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
def process_item(self, item, spider):
self.db[self.collection_name].update({'url_token': item['url_token']}, dict(item), True)
return item
zhihu-distributed / zhihuuser / settings.py1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27# -*- coding: utf-8 -*-
BOT_NAME = 'zhihuuser'
SPIDER_MODULES = ['zhihuuser.spiders']
NEWSPIDER_MODULE = 'zhihuuser.spiders'
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
ITEM_PIPELINES = {
'zhihuuser.pipelines.MongoPipeline': 300, # 存取本地pipeline
'scrapy_redis.pipelines.RedisPipeline': 301 # 将爬取结果引入到redis的pipeline里面去
}
MONGO_URI = 'localhost'
MONGO_DATABASE = 'zhihu'
SCHEDULER = "scrapy_redis.scheduler.Scheduler" # 核心调度器换成scrapy_redis调度器
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" # 去重的class
REDIS_URL = 'redis://root:redistest@120.27.34.24:6379' # redis数据库的连接地址
zhihu-distributed / zhihuuser / spiders / zhihu.py1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68# -*- coding: utf-8 -*-
import json
from scrapy import Spider, Request
from zhihuuser.items import UserItem
class ZhihuSpider(Spider):
name = "zhihu"
allowed_domains = ["www.zhihu.com"]
user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
start_user = 'tianshansoft'
user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
def start_requests(self):
yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
self.parse_follows)
yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
self.parse_followers)
def parse_user(self, response):
result = json.loads(response.text)
item = UserItem()
for field in item.fields:
if field in result.keys():
item[field] = result.get(field)
yield item
yield Request(
self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
self.parse_follows)
yield Request(
self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
self.parse_followers)
def parse_follows(self, response):
results = json.loads(response.text)
if 'data' in results.keys():
for result in results.get('data'):
yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
self.parse_user)
if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
next_page = results.get('paging').get('next')
yield Request(next_page,
self.parse_follows)
def parse_followers(self, response):
results = json.loads(response.text)
if 'data' in results.keys():
for result in results.get('data'):
yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
self.parse_user)
if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
next_page = results.get('paging').get('next')
yield Request(next_page,
self.parse_followers)
3、更新代码到github上
git status
git add -A
git commit -m 'add distributed'
git push origin distributed
4、在腾讯云上部署scrapy环境
注意设置mongodb.conf文件,允许远程访问,注释掉bind_ip=127.0.0.1即可允许远程访问了。
从github上复制代码git clone https://github.com/Germey/Zhihu.git -b distributed
。
运行爬虫scrapy crawl zhihu
,相当于在另一台主机上开启了知乎的爬取。
scrapy_resis/unils.py # 一些工具库
持续更新…
最后更新: 2018年08月21日 17:39
原始链接: http://pythonfood.github.io/2018/07/05/Scrapy分布式架构搭建抓取知乎/