崔庆才老师爬虫的学习笔记。

一、基本使用

1、目标站点

http://quotes.toscrape.com/

2、流程框架

(1)抓取第一页

请求第一页URL并得到源代码，进行下一步分析。

(2)获取内容和下一页链接

分析源代码，获取首页内容，获取下一页链接等待进一步爬取。

(3)翻页爬取

请求下一页信息，分析内容并请求再下一页链接。

(4)保存爬取结果

将爬取结果保存为特定格式如文本、数据库。

3、爬虫实战

命令行输入：

新建工程：scrapy startproject quotetutroial

进入目录：cd quotetutroial

生成爬虫：scrapy genspider quotes quotes.toscrape.com

项目结构：

scrapy.cfg 配置文件

items.py 保存数据的数据结构

middlewares.py 爬取过程中定义的一些中间件

pipelines.py 项目管道，可以输出一些items

settings.py 定义的一些配置信息

spiders/quotes.py 主要的运行代码（爬虫的解析逻辑）

运行爬虫：scrapy crawl quotes

4、项目代码

quotetutroial / scrapy.cfg

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = quotetutroial.settings

[deploy]
#url = http://localhost:6800/
project = quotetutroial

quotetutroial / quotetutroial / items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class QuoteItem(scrapy.Item):
    """Data structure for one scraped quote: its text, author and tags."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # After scraping, the fields are assigned one by one to build a complete item.
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()

quotetutroial / quotetutroial / middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

class QuotetutroialSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards what it receives without modification. Hooks are
    optional: when one is not defined, Scrapy behaves as if the middleware
    left the passed objects untouched.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create an instance and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Should return None or raise an exception; this one always
        returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield, in order, everything the spider returned for ``response``.

        Must produce an iterable of Request, dict or Item objects.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or by another middleware's
        ``process_spider_input()``.

        Should return None or an iterable of Response, dict or Item objects;
        this one always returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-yield the spider's start requests unchanged.

        Works like ``process_spider_output()`` but has no associated
        response. Must yield only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


class QuotetutroialDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Every hook leaves requests and responses untouched. Hooks are optional:
    when one is not defined, Scrapy behaves as if the middleware did not
    modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create an instance and connect its ``spider_opened`` handler."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_request(self, request, spider):
        """Hook for each request passing through the downloader middleware.

        Must either:
        - return None: continue processing this request
        - return a Response object
        - return a Request object
        - raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called

        This implementation always returns None.
        """
        return None

    def process_response(self, request, response, spider):
        """Hook for the response returned from the downloader.

        Must return a Response object, return a Request object, or raise
        IgnoreRequest. This implementation hands back ``response`` as-is.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions raised by a download handler or by another
        middleware's ``process_request()``.

        Must either:
        - return None: continue processing this exception
        - return a Response object: stops process_exception() chain
        - return a Request object: stops process_exception() chain

        This implementation always returns None.
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)

quotetutroial / quotetutroial / pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from scrapy.exceptions import DropItem

class TextPipeline(object):
    """Item pipeline that truncates over-long quote text and drops items without text."""

    def __init__(self):
        # Maximum number of characters of item['text'] kept before truncation.
        self.limit = 50

    def process_item(self, item, spider):
        """Shorten ``item['text']`` to at most ``self.limit`` characters.

        Text longer than the limit is cut, stripped of trailing whitespace
        and suffixed with '...'. Shorter text is left unchanged.

        Args:
            item: the scraped item; must have a 'text' key.
            spider: the spider that produced the item (unused).

        Returns:
            The same item, possibly with a truncated 'text'.

        Raises:
            DropItem: if ``item['text']`` is empty or None.
        """
        text = item['text']
        if not text:
            # DropItem has to be raised; returning the exception object would
            # pass it down the pipeline as if it were an item.
            raise DropItem('Missing Text')
        if len(text) > self.limit:
            item['text'] = text[:self.limit].rstrip() + '...'
        # Always return the item, including when no truncation was needed;
        # otherwise later pipelines would receive None.
        return item

class MongoPipeline(object):
    """Item pipeline that stores every item in MongoDB.

    Items are written to a collection named after the item's class.
    """

    def __init__(self, mongo_uri, mongo_db):
        """Remember the connection URI and database name; no connection is made yet."""
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI and MONGO_DB values in settings.py.

        This is a class method, so it returns a new instance of the class.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        """Open the MongoDB connection when the spider is about to start."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item as one document and return it unchanged."""
        # The item's class name is used as the collection name.
        name = item.__class__.__name__
        # Collection.insert() is deprecated in pymongo 3 and removed in
        # pymongo 4; insert_one() is the supported single-document call.
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()
    # For a pipeline to take effect it must be listed in ITEM_PIPELINES in
    # settings.py; the lower the number, the higher the priority.
    # Only the pipelines listed there are executed: if one entry is removed,
    # only the remaining ones run.

quotetutroial / quotetutroial / settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for quotetutroial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'quotetutroial'

SPIDER_MODULES = ['quotetutroial.spiders']
NEWSPIDER_MODULE = 'quotetutroial.spiders'

# MongoDB connection settings; MongoPipeline.from_crawler reads both keys
# through crawler.settings.get().
MONGO_URI = 'localhost'
MONGO_DB = 'quotestutorial'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'quotetutroial (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'quotetutroial.middlewares.QuotetutroialSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'quotetutroial.middlewares.QuotetutroialDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Only the pipelines listed here are executed; a lower number means a higher
# priority. TextPipeline is currently commented out, so only MongoPipeline runs.
ITEM_PIPELINES = {
    #'quotetutroial.pipelines.TextPipeline': 300,
    'quotetutroial.pipelines.MongoPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

quotetutroial / quotetutroial / spiders / quotes.py

# -*- coding: utf-8 -*-

import scrapy
from quotetutroial.items import QuoteItem

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes from quotes.toscrape.com, following the pager."""

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Yield one QuoteItem per quote on the page, then a request for the next page.

        Args:
            response: the downloaded page.

        Yields:
            QuoteItem objects with 'text', 'author' and 'tags' set, followed by
            a scrapy.Request for the next page when the pager has a next link.
        """
        # Tip: run `scrapy shell quotes.toscrape.com` in a terminal to try
        # these selectors interactively; type exit() to leave the shell.
        for quote in response.css('.quote'):
            item = QuoteItem()
            # Select within the current quote. Selecting on the whole list
            # would give every item the first quote's text and author, and
            # the tags of every quote on the page.
            # ::text is Scrapy's selector extension for the text content.
            item['text'] = quote.css('.text::text').extract_first()  # first match only
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()  # all matches
            yield item

        next_page = response.css('.pager .next a::attr(href)').extract_first()
        # The last page has no "next" link; stop instead of re-requesting
        # the current page.
        if next_page is not None:
            # The href is a relative URL, so urljoin is needed to build the
            # absolute one.
            url = response.urljoin(next_page)
            # Using parse itself as the callback implements pagination.
            yield scrapy.Request(url=url, callback=self.parse)
        # Saving results from the command line:
        #   scrapy crawl quotes -o quotes.json      saves as JSON
        #   scrapy crawl quotes -o quotes.jl        saves as JSON lines
        #   scrapy crawl quotes -o quotes.csv       saves as CSV
        #   scrapy crawl quotes -o quotes.xml       saves as XML
        #   scrapy crawl quotes -o quotes.pickle    saves as pickle
        #   scrapy crawl quotes -o quotes.marshal   saves as marshal
        #   scrapy crawl quotes -o ftp://user:pass@ftp.example.com/path/quotes.csv
        #                                           saves to an FTP server