Preface

Another request from a certain teacher... I sat on it for two weeks and completely forgot about it until he nudged me.
Since I couldn't make sense of JD's Pjax and couldn't be bothered to analyze it, I just scraped JD directly with Selenium.

Installing dependencies

pip3 install selenium beautifulsoup4 lxml requests pymysql
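Selenium also needs a ChromeDriver binary that matches your installed Chrome version, somewhere on your PATH. A minimal smoke test, assuming the driver is already set up:

    from selenium import webdriver

    chrome = webdriver.Chrome()   # fails here if chromedriver is missing or version-mismatched
    chrome.get('https://www.jd.com')
    print(chrome.title)           # prints the JD homepage title if everything is wired up
    chrome.quit()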

Code

  1. Imports and setup:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from bs4 import BeautifulSoup
    import requests
    import pymysql
    import pymysql.cursors
    import logging
    import time

    logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s]%(message)s", datefmt='%Y-%m-%d %H:%M:%S')

    # create the WebDriver instance (assumes chromedriver is on PATH)
    chrome = webdriver.Chrome()

    # database settings
    DB_HOST = '127.0.0.1'
    DB_PORT = 3306
    DB_USER = 'user'
    DB_PASS = 'password'
    DB_NAME = 'dbname'
    DB_CHAR = 'utf8mb4'
    TB_NAME = 'jdspider'
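If you don't need to watch the browser, Chrome can also run headless. A sketch, assuming a reasonably recent Chrome (option names have varied across versions):

    # optional: replace the plain webdriver.Chrome() above with a headless instance
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    chrome = webdriver.Chrome(options=options)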
  2. Scraping the search page:

    def JD_Search_Single(keyword: str, page: int = 1) -> list:
        '''
        Run a single JD search.
        Args:
            keyword: search keyword
            page: JD's page parameter is odd-numbered: 1 = first page, 3 = second,
                  5 = third, and so on (see the sketch after this block)
        Returns:
            [(name, price, link), ...]
        '''
        chrome.get(f'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}')
        chrome.implicitly_wait(7)
        goodlist = Html_2_GoodList(chrome.page_source)
        return goodlist
    def Html_2_GoodList(html: str) -> list:
        '''
        Parse the search-page HTML into a list of goods.
        Args:
            html: search-page HTML
        Returns:
            [(name, price, link), ...]
        '''
        bs = BeautifulSoup(html, 'lxml')
        goodslist = bs.find(name='div', attrs={'id': 'J_goodsList'})
        goods = []
        for good in goodslist.find_all(name='div', attrs={'class': 'gl-i-wrap'}):
            try:
                # does not handle listings that show a tab on the search page
                price = good.find(name='div', attrs={'class': 'p-price'}).strong.i.get_text()
                name = good.find(name='div', attrs={'class': 'p-name'}).a.em.get_text()
                link = good.find(name='div', attrs={'class': 'p-name'}).a.attrs.get('href', 'null')
                # protocol-relative links ("//item.jd.com/...") need the scheme prepended
                link = f'https:{link}' if link[0] == '/' else link
                logging.info(f'Scraped listing [{price.rjust(6)}][{name}][{link}]')
                goods.append((name, price, link))
            except Exception as e:
                logging.error(f'**Unexpected error [{e}]')
                Error_Alert(f'Html_2_GoodList-{e}')
        return goods
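JD numbers its search pages with odd values of the page parameter, so logical page n maps to parameter 2n - 1. A hypothetical helper, `jd_page_param`, makes the mapping explicit:

    def jd_page_param(logical_page: int) -> int:
        '''Map a 1-based logical page number to JD's odd-numbered page parameter.'''
        return 2 * logical_page - 1

    # e.g. the third page of results uses page=5:
    goodlist = JD_Search_Single('机器学习', page=jd_page_param(3))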
  3. Parsing the detail page:

    def JD_Good_Detail(link: str) -> tuple:
        '''
        Fetch a JD product detail page.
        Args:
            link: product link
        Returns:
            (variant name, [summary sections])
        '''
        # defaults, so the return statement is safe even if the fetch fails early
        title = 'Null'
        summary = []
        try:
            chrome.switch_to.window(chrome.window_handles[0])
            chrome.execute_script(f'window.open("{link}")')
            chrome.switch_to.window(chrome.window_handles[1])
            chrome.implicitly_wait(5)
            # scroll to the bottom so the lazy-loaded detail section renders;
            # note implicitly_wait only affects element lookups and does not
            # pause the script, so an actual sleep is needed here
            chrome.execute_script("scroll(0, 100000);")
            time.sleep(3)
            try:
                title = chrome.find_element(By.CSS_SELECTOR, '#choose-attr-1 > div.dd > div.item.selected > a > i').get_attribute('textContent')
            except Exception:
                title = 'Null'
            summary = Html_2_GoodDetail(chrome.page_source)
            chrome.close()
            chrome.switch_to.window(chrome.window_handles[0])
            logging.info(f'Scraped item [{title}]')
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'JD_Good_Detail-{e}')
        return (title, summary)
    def Html_2_GoodDetail(html: str) -> list:
        '''
        Parse the product summary out of the detail-page HTML.
        Args:
            html: detail-page HTML
        Returns:
            [(section title, section content), ...]
        '''
        bs = BeautifulSoup(html, 'lxml')
        summarylist = []
        for unit in bs.find(name='div', attrs={'id': 'J-detail-content'}).find_all(name='div', attrs={'class': 'book-detail-item'}):
            try:
                title = unit.find(name='div', attrs={'class': 'item-mt'}).h3.get_text()
                content = unit.find(name='div', attrs={'class': 'book-detail-content'}).get_text().strip()
                logging.info(f'[{title}]')
                summarylist.append((title, content))
            except Exception as e:
                logging.error(f'**Unexpected error [{e}]')
                Error_Alert(f'Html_2_GoodDetail-{e}')
        return summarylist
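Judging by the `book-detail-item` selectors, this parser targets JD's book pages. For orientation, a hypothetical call and result shape (link and values are made up):

    detail = JD_Good_Detail('https://item.jd.com/12345678.html')  # hypothetical link
    # expected shape: ('平装', [('内容简介', '...'), ('作者简介', '...'), ('目录', '...')])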
  4. Entry point:

    def JD_Crawl(keyword: str, limit: int = 30):
        '''
        JD crawler entry point.
        Args:
            keyword: search keyword
            limit: number of items to scrape
        '''
        logging.info('Starting to fetch the goods list')
        count = 0
        page = 1
        while count < limit:
            try:
                # fetch one page of goods
                goodlist = JD_Search_Single(keyword, page)
                page += 2  # JD's page parameter only takes odd values
                if not goodlist:
                    # no more results; bail out instead of looping forever
                    break
                for good in goodlist:
                    name, price, link = good
                    detail = JD_Good_Detail(link)
                    Save_To_Database(name, price, link, detail[0], detail[1])
                    count += 1
            except Exception as e:
                logging.error(f'**Error: {e}')
                Error_Alert(f'JD_Crawl-{e}')
  5. Database helpers:

    def db_connect():
        '''
        Connect to the database.
        '''
        db = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, passwd=DB_PASS, db=DB_NAME, charset=DB_CHAR)
        return db

    def Save_To_Database(name: str, price: str, link: str, typename: str, gooddetail: list):
        '''
        Write one item to the database.
        Args:
            (name, price, link, variant name, [(section title, section content), ...])
        '''
        # parameterized query: let the driver do the escaping rather than
        # interpolating values into the SQL string (which invites injection)
        insert_data = f'''INSERT INTO `{DB_NAME}`.`{TB_NAME}`(
                            `name`,
                            `price`,
                            `url`,
                            `type`,
                            `detail`
                        )VALUES(%s, %s, %s, %s, %s);'''
        try:
            db = db_connect()
            cursor = db.cursor()
            result = cursor.execute(insert_data, (name, price, link, typename, str(gooddetail)))
            logging.info(f'Database write OK, return value [{result}]')
            db.commit()
            db.close()
        except Exception as e:
            logging.error(f'**Database write failed [{e}]')
            Error_Alert(f'Save_To_Database-{e}')
    def Set_Up_Database():
        '''
        Create the table (despite the name, the database itself must already exist).
        '''
        create_db = f'''CREATE TABLE `{DB_NAME}`.`{TB_NAME}` (
                        `id` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                        `name` VARCHAR(255) NOT NULL COMMENT 'product name',
                        `price` VARCHAR(255) NOT NULL COMMENT 'product price',
                        `url` VARCHAR(255) NOT NULL COMMENT 'product link',
                        `type` VARCHAR(255) NOT NULL COMMENT 'product variant',
                        `detail` TEXT NOT NULL COMMENT 'product detail',
                        PRIMARY KEY (`id`),
                        UNIQUE KEY `id_UNIQUE` (`id`)
                        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='JD crawler';'''
        try:
            db = db_connect()
            cursor = db.cursor()
            result = cursor.execute(create_db)
            logging.info(f'Table created, return value [{result}]')
            db.close()
        except Exception as e:
            logging.error(f'**Table creation failed [{e}]')
            Error_Alert(f'Set_Up_Database-{e}')
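A quick way to confirm rows are actually landing; a minimal sketch, assuming Set_Up_Database() has been run and a crawl has written something:

    # print a few stored rows to verify writes
    db = db_connect()
    cursor = db.cursor()
    cursor.execute(f'SELECT `name`, `price`, `url` FROM `{DB_NAME}`.`{TB_NAME}` LIMIT 5')
    for row in cursor.fetchall():
        print(row)
    db.close()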
  6. Helper functions:

    def Error_Alert(message: str):
        '''
        Push an error notification via ServerChan.
        '''
        try:
            url = 'https://sc.ftqq.com/【your ServerChan push key, get one at https://sc.ftqq.com/】.send'
            data = {
                'text': 'Crawler error alert',
                'desp': message
            }
            requests.post(url=url, data=data)
        except Exception as e:
            logging.error(f'Push notification failed [{e}]')
        finally:
            # back off for five minutes after any error
            time.sleep(300)

    if __name__ == '__main__':
        # run Set_Up_Database() once beforehand to create the table
        JD_Crawl('机器学习', 3000)

Results

(screenshot: crawl results)
Honestly, it's pretty buggy, and using Selenium makes it terribly slow, but I can't be bothered to fix it.

Last modified: November 27, 2019