import re

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import *
def next_page(page_number, max_retries=10):
    """Jump to result page ``page_number`` and scrape it.

    Types the page number into the pager's input box, clicks the submit
    button, waits until the highlighted pager item shows that number
    (which confirms the jump finished), and then calls ``get_products()``.

    Relies on the module-level ``wait`` object, which is defined outside
    this block.

    Args:
        page_number: Page number to jump to.
        max_retries: Number of additional attempts made after a
            ``TimeoutException`` before giving up. The earlier version
            retried through unbounded recursion, which ended in a
            ``RecursionError`` when a page never loaded.

    Raises:
        TimeoutException: If the initial attempt and all ``max_retries``
            retries time out.
    """
    input_selector = '#mainsrp-pager > div > div > div > div.form > input'
    submit_selector = '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'
    active_selector = '#mainsrp-pager > div > div > div > ul > li.item.active > span'

    for attempt in range(max_retries + 1):
        print('正在翻页')
        try:
            # Wait for the page-number input box and the submit button.
            page_input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, input_selector))
            )
            submit = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, submit_selector))
            )
            # Clear any previous value before typing the new page number.
            page_input.clear()
            page_input.send_keys(page_number)
            submit.click()
            # The highlighted pager item showing the requested number means
            # the jump has completed.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, active_selector), str(page_number)
                )
            )
            get_products()
            return
        except TimeoutException:
            # On timeout simply redo the whole jump, unless the retry budget
            # is exhausted, in which case let the caller see the failure.
            if attempt == max_retries:
                raise


def get_products():
    """Scrape every product on the current result page and store each one.

    Waits for at least one product element to be present, parses the
    browser's current page source with PyQuery, builds one dict per product
    and passes it to ``save_to_mongo()``.

    Relies on the module-level ``wait`` and ``browser`` objects, which are
    defined outside this block.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))

    # Parse a snapshot of the rendered page rather than querying the live DOM.
    doc = pq(browser.page_source)
    for item in doc(item_selector).items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal-count text;
            # presumably a trailing unit label -- verify against live markup.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongo(product)


def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Storage is best-effort: any failure is reported on stdout and
    swallowed so that one bad document does not stop the crawl.

    ``db`` is defined outside this block; ``MONGO_TABLE`` comes from
    ``config.py``.

    Args:
        result: The product dict to store.
    """
    try:
        # ``Collection.insert`` was deprecated in PyMongo 3 and removed in
        # PyMongo 4; ``insert_one`` is its replacement for single documents.
        db[MONGO_TABLE].insert_one(result)
        print('存储到MongoDB成功', result)
    except Exception as exc:
        # Include the exception so a failure can actually be diagnosed.
        print('存储到MongoDB失败', result, exc)
def main():
    """Run the crawl: search, then walk every remaining result page.

    Calls ``search()`` (defined outside this block), extracts the first
    run of digits from the text it returns as the total page count, and
    calls ``next_page()`` for pages 2 through that total. Any error is
    reported on stdout rather than propagated, and the browser is always
    shut down at the end.
    """
    try:
        total_text = search()
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        match = re.search(r'(\d+)', total_text)
        if match is None:
            raise ValueError(
                'could not find a page count in %r' % (total_text,)
            )
        total = int(match.group(1))
        # Start at page 2 and jump through the remaining pages.
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        # Top-level boundary: report the failure together with its cause.
        print('浏览器出错啦', exc)
    finally:
        print('爬取完成')
        # quit() ends the whole driver session; close() would only close the
        # current window and leave the driver running.
        browser.quit()


if __name__ == '__main__':
    main()