1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
|
from pyspider.libs.base_handler import * import pymongo
class Handler(BaseHandler):
    """pyspider handler that scrapes TripAdvisor attraction pages for London.

    The crawl starts at a listing page, follows every attraction link to its
    detail page, walks the listing pagination, and stores each scraped
    attraction as a document in the ``london`` collection of the ``trip``
    MongoDB database on ``localhost``.
    """

    crawl_config = {
    }

    # Listing page the crawl is seeded with.
    START_URL = 'http://www.tripadvisor.cn/Attractions-g186338-Activities-c47-t163-London_England.html#ATTRACTION_LIST'
    # Collection (inside the ``trip`` database) that receives scraped records.
    COLLECTION_NAME = 'london'

    # Created once when the class body is executed and shared by all
    # instances of the handler.
    client = pymongo.MongoClient('localhost')
    db = client['trip']

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the attraction listing page.

        Scheduled through ``@every`` with an interval of 24 * 60 minutes.
        """
        self.crawl(self.START_URL, callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction on a listing page, then the next listing page.

        Args:
            response: The pyspider response for a listing page; its ``doc``
                attribute is queried with CSS selectors.
        """
        for each in response.doc('.listing_title > a').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)

        # The "next" link is missing on the last page, in which case the href
        # is empty; only schedule a follow-up crawl when there is one.
        next_page = response.doc('.pagination .nav.next').attr.href
        if next_page:
            self.crawl(next_page, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the fields of a single attraction page.

        Args:
            response: The pyspider response for an attraction detail page.

        Returns:
            dict: The page URL plus the scraped ``name``, ``rating``,
            ``adress`` and ``phone`` texts.
        """
        name = response.doc('.heading_title').text()
        rating = response.doc('div > .more').text()
        adress = response.doc('.location > .address').text()
        phone = response.doc('.phone > div').text()

        # NOTE: the "adress" key is misspelled, but it is the field name of
        # the stored documents, so it is kept unchanged.
        return {
            "url": response.url,
            "name": name,
            "rating": rating,
            "adress": adress,
            "phone": phone
        }

    def on_result(self, result):
        """Persist a callback result to MongoDB; falsy results are ignored.

        Callbacks such as ``index_page`` return nothing, so only non-empty
        results are handed to ``save_to_mongo``.
        """
        if result:
            self.save_to_mongo(result)

    def save_to_mongo(self, result):
        """Insert ``result`` into the MongoDB collection and log it.

        ``insert_one`` is used because ``Collection.insert`` was deprecated in
        PyMongo 3.0 and removed in 4.0. A failed write raises a PyMongo
        exception, so reaching the ``print`` means the insert was issued
        without error. As with the legacy call, the driver adds an ``_id``
        key to ``result`` if it has none.

        Args:
            result (dict): The document to store.
        """
        self.db[self.COLLECTION_NAME].insert_one(result)
        print('save to mongodb:', result)
|