Scraping JD.com with Selenium

2019-11-15T19:36:00

Preface

Another favor for a certain teacher... I put it off for two weeks and had completely forgotten about it until he nudged me.
Since I couldn't make sense of (and couldn't be bothered to analyze) JD's Pjax, I just scraped JD directly with Selenium.

Install dependencies

pip3 install selenium bs4 lxml requests pymysql

The code below also uses the lxml parser and requests, so install those too; you additionally need a ChromeDriver binary matching your local Chrome on the PATH.

Code

  1. Header declarations:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import pymysql
import logging
import time

logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s]%(message)s", datefmt='%Y-%m-%d %H:%M:%S')

# create the webdriver object
chrome = webdriver.Chrome()

# database settings
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'user'
DB_PASS = 'password'
DB_NAME = 'dbname'
DB_CHAR = 'utf8mb4'
TB_NAME = 'jdspider'
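Side note: if you don't need to watch the browser work, Chrome can run headless. A minimal sketch that could replace the plain webdriver.Chrome() call above (these are standard ChromeOptions flags; I haven't tested how JD's bot detection reacts to headless mode):

options = webdriver.ChromeOptions()
options.add_argument('--headless')               # no visible window
options.add_argument('--window-size=1920,1080')  # give pages a real viewport
chrome = webdriver.Chrome(options=options)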
  2. Fetch the search page:
def JD_Search_Single(keyword: str, page: int = 1) -> list:
    '''
    Run a single JD search.
    Args:
        keyword
        page number; JD numbers search pages oddly: 1 = first page, 3 = second, 5 = third, and so on
    Returns:
        [(name, price, link), ...]
    '''
    chrome.get(f'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}')
    chrome.implicitly_wait(7)
    goodlist = Html_2_GoodList(chrome.page_source)
    return goodlist
def Html_2_GoodList(html: str) -> list:
    '''
    Parse the search-page HTML into a list of goods.
    Args:
        search-page HTML
    Returns:
        [(name, price, link), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    goodslist = bs.find(name='div', attrs={'id': 'J_goodsList'})
    goods = []
    for good in goodslist.find_all(name='div', attrs={'class': 'gl-i-wrap'}):
        try:
            # does not handle listings that show a tab on the search page
            price = good.find(name='div', attrs={'class': 'p-price'}).strong.i.get_text()
            name = good.find(name='div', attrs={'class': 'p-name'}).a.em.get_text()
            link = good.find(name='div', attrs={'class': 'p-name'}).a.attrs.get('href', 'null')
            link = f'https:{link}' if link[0] == '/' else link
            logging.info(f'Scraped listing [{price.rjust(6)}][{name}][{link}]')
            goods.append((name, price, link))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodList-{e}')
    return goods
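For a quick sanity check of the search step, something like this works (it uses nothing beyond the functions above; tuples come back in the (name, price, link) shape documented in the docstring):

goods = JD_Search_Single('python', page=1)
for name, price, link in goods[:3]:
    print(price, name, link)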
  3. Parse the detail page:
def JD_Good_Detail(link: str) -> tuple:
    '''
    JD product detail page.
    Args:
        product link
    Returns:
        (variant name, [product summary])
    '''
    title, summary = 'Null', []
    try:
        chrome.switch_to.window(chrome.window_handles[0])
        chrome.execute_script(f'window.open("{link}")')
        chrome.switch_to.window(chrome.window_handles[1])
        chrome.implicitly_wait(5)
        # scroll to the bottom to trigger lazy loading of the detail section
        chrome.execute_script("scroll(0, 100000);")
        # implicitly_wait() only affects element lookups, it does not pause
        # the script, so actually give the lazy-loaded content a moment
        time.sleep(3)
        try:
            title = chrome.find_element(By.CSS_SELECTOR, '#choose-attr-1 > div.dd > div.item.selected > a > i').get_attribute('textContent')
        except Exception:
            title = 'Null'
        summary = Html_2_GoodDetail(chrome.page_source)
        chrome.close()
        chrome.switch_to.window(chrome.window_handles[0])
        logging.info(f'Scraped product [{title}]')
    except Exception as e:
        logging.error(f'**Unexpected error [{e}]')
        Error_Alert(f'JD_Good_Detail-{e}')
    return (title, summary)
def Html_2_GoodDetail(html: str) -> list:
    '''
    Parse the product summary out of the detail-page HTML.
    Args:
        detail-page HTML
    Returns:
        [(section title, section content), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    summarylist = []
    for unit in bs.find(name='div', attrs={'id': 'J-detail-content'}).find_all(name='div', attrs={'class': 'book-detail-item'}):
        try:
            title = unit.find(name='div', attrs={'class': 'item-mt'}).h3.get_text()
            content = unit.find(name='div', attrs={'class': 'book-detail-content'}).get_text().strip()
            logging.info(f'[{title}]')
            summarylist.append((title, content))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodDetail-{e}')
    return summarylist
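The fixed time.sleep() in JD_Good_Detail is a blunt instrument: too short and the lazy-loaded detail section isn't in the DOM yet, too long and every product wastes time. A more deterministic sketch using Selenium's standard WebDriverWait (the J-detail-content id is the same one the parser above looks for):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block (up to 30s) until the lazy-loaded detail section actually exists
WebDriverWait(chrome, 30).until(
    EC.presence_of_element_located((By.ID, 'J-detail-content')))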
  4. Entry point:
def JD_Crawl(keyword: str, limit: int = 30):
    '''
    JD crawler.
    Args:
        keyword
        number of items to fetch
    '''
    logging.info('Start fetching the goods list')
    count = 0
    page = 1
    while count < limit:
        try:
            # fetch one page of goods
            goodlist = JD_Search_Single(keyword, page)
            page += 2  # JD search pages are odd-numbered: 1, 3, 5, ...
            for good in goodlist:
                name, price, link = good
                detail = JD_Good_Detail(link)
                Save_To_Database(name, price, link, detail[0], detail[1])
                count += 1
                if count >= limit:
                    break
        except Exception as e:
            logging.error(f'**Something went wrong [{e}]')
            Error_Alert(f'JD_Crawl-{e}')
  5. Database functions:
def db_connect():
    '''
    Connect to the database.
    '''
    db = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, password=DB_PASS, db=DB_NAME, charset=DB_CHAR)
    return db
def Save_To_Database(name: str, price: str, link: str, typename: str, gooddetail: list):
    '''
    Save one record to the database.
    Args:
        (name, price, link, variant name, [(section title, section content), ...])
    '''
    # parameterized query: quotes in the scraped data can't break the SQL
    insert_data = f'''INSERT INTO `{DB_NAME}`.`{TB_NAME}`(
                        `name`,
                        `price`,
                        `url`,
                        `type`,
                        `detail`
                    )VALUES(%s, %s, %s, %s, %s);'''
    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(insert_data, (name, price, link, typename, str(gooddetail)))
        logging.info(f'Database write OK, return value [{result}]')
        db.commit()
        db.close()
    except Exception as e:
        logging.error(f'**Database write failed [{e}]')
        Error_Alert(f'Save_To_Database-{e}')
def Set_Up_Database():
    '''
    Create the data table (run once before the first crawl).
    '''
    create_db = f'''CREATE TABLE `{DB_NAME}`.`{TB_NAME}` (
                    `id` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                    `name` VARCHAR(255) NOT NULL COMMENT 'product name',
                    `price` VARCHAR(255) NOT NULL COMMENT 'product price',
                    `url` VARCHAR(255) NOT NULL COMMENT 'product link',
                    `type` VARCHAR(255) NOT NULL COMMENT 'product sub-type',
                    `detail` TEXT NOT NULL COMMENT 'product details',
                    PRIMARY KEY (`id`),
                    UNIQUE KEY `id_UNIQUE` (`id`)
                    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='JD crawler';'''

    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(create_db)
        logging.info(f'Table created, return value [{result}]')
        db.close()
    except Exception as e:
        logging.error(f'**Table creation failed [{e}]')
        Error_Alert(f'Set_Up_Database-{e}')
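Set_Up_Database() has to run once before the first crawl, otherwise the INSERTs have no table to land in; something along these lines (a one-off sketch, the keyword and limit are just examples):

Set_Up_Database()          # one-off: create the jdspider table
JD_Crawl('机器学习', 30)   # then crawl as usual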
  6. Helper functions:
def Error_Alert(message: str):
    '''
    Push an error notification via ServerChan.
    '''
    try:
        url = 'https://sc.ftqq.com/<your ServerChan push key, get one at https://sc.ftqq.com/>.send'
        data = {
            'text': 'Crawler error alert',
            'desp': message
            }
        requests.post(url=url, data=data)
    except Exception as e:
        logging.error(f'Push failed [{e}]')
    finally:
        time.sleep(300)  # back off for five minutes so alerts don't flood
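To sanity-check the push channel before a long run, you can hit the same ServerChan endpoint by hand (the key placeholder is yours to fill in; 'text' and 'desp' are the two fields the service expects, as used above):

import requests
resp = requests.post(
    'https://sc.ftqq.com/<your ServerChan push key>.send',
    data={'text': 'test push', 'desp': 'hello from the crawler'})
print(resp.status_code, resp.text)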
if __name__ == '__main__':
    JD_Crawl('机器学习', 3000)  # keyword: "machine learning"

Results


Honestly, it's pretty buggy, and going through Selenium makes it painfully slow, but I can't be bothered to fix it.
