Study notes from 崔庆才's web scraping course.

一、Crawling in Practice

1、The PySpider Framework

  • Request deduplication
  • Result monitoring
  • Multi-process handling
  • PyQuery-based extraction (see the sketch after this list)
  • Error retry
  • WebUI management
  • Concise code
  • JavaScript rendering
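
To make the PyQuery point concrete, here is a minimal standalone sketch of the same selector style that pyspider exposes through response.doc(); the HTML snippet and selectors below are made-up examples, not taken from the tutorial.

# Minimal PyQuery sketch (standalone, outside pyspider); the HTML is a made-up example.
from pyquery import PyQuery as pq

html = '''
<div class="listing_title"><a href="/Attraction_Review-1.html">British Museum</a></div>
<div class="listing_title"><a href="/Attraction_Review-2.html">Tower of London</a></div>
'''

doc = pq(html)
# Same CSS-selector style that pyspider exposes as response.doc(...)
for a in doc('.listing_title > a').items():
    print(a.text(), a.attr.href)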

2、Installation

Run cmd as administrator and execute the command pip install pyspider.
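
If the install succeeded, the package should import cleanly. A quick sanity check (pyspider normally exposes a __version__ attribute; if not, the plain import is enough to confirm the install):

# Quick sanity check after installation
import pyspider
print(pyspider.__version__)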

3、Running

In cmd, run the command pyspider, then open http://localhost:5000/ in a browser.

PS: PhantomJS must already be installed.
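
PhantomJS is what backs the JavaScript-rendering feature listed above: when a page needs rendering, pyspider's self.crawl() accepts fetch_type='js'. Below is a minimal sketch; the URL is a placeholder, not one from the example that follows.

# Sketch of JavaScript rendering via PhantomJS in a pyspider handler (placeholder URL).
from pyspider.libs.base_handler import *

class JsHandler(BaseHandler):
    def on_start(self):
        # fetch_type='js' asks pyspider to render the page with PhantomJS
        self.crawl('http://example.com/js-page', callback=self.index_page,
                   fetch_type='js', validate_cert=False)

    def index_page(self, response):
        # The rendered DOM is available through response.doc as usual
        return {"title": response.doc('title').text()}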

4、Hands-on Example

The script below crawls TripAdvisor's list of London attractions, follows each attraction's detail page, and saves the extracted fields to MongoDB.

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-07 22:43:08
# Project: TripAdvisor

from pyspider.libs.base_handler import *
import pymongo


class Handler(BaseHandler):
    crawl_config = {
    }

    # MongoDB connection shared by all callbacks
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        # Entry point: fetch the London attractions list page once a day
        self.crawl('http://www.tripadvisor.cn/Attractions-g186338-Activities-c47-t163-London_England.html#ATTRACTION_LIST',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every attraction link on the list page
        for each in response.doc('.listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        # Follow the "next page" link to keep paginating
        next_url = response.doc('.pagination .nav.next').attr.href
        self.crawl(next_url, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        # Extract the fields we care about from the detail page
        name = response.doc('.heading_title').text()
        rating = response.doc('div > .more').text()
        address = response.doc('.location > .address').text()
        phone = response.doc('.phone > div').text()
        return {
            "url": response.url,
            "name": name,
            "rating": rating,
            "address": address,
            "phone": phone,
        }

    def on_result(self, result):
        # Called with each callback's return value; ignore empty results
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        # insert() is deprecated in PyMongo 3+; use insert_one() on newer versions
        if self.db['london'].insert(result):
            print('save to mongodb:', result)
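
To confirm that the results actually landed in MongoDB, you can query the collection afterwards with pymongo. A minimal check, assuming the same local MongoDB instance and the trip / london database and collection used above:

# Quick check of the scraped data (same local MongoDB as the handler above).
import pymongo

client = pymongo.MongoClient('localhost')
collection = client['trip']['london']

print('documents saved:', collection.count_documents({}))
for doc in collection.find().limit(5):
    print(doc.get('name'), '-', doc.get('rating'))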

To be continued…
