Crawling JD.com with Selenium
Preface
Another favor for a certain teacher... I put it off for two weeks and had completely forgotten about it until he nudged me again.
Since I couldn't make sense of (read: couldn't be bothered to analyze) JD's Pjax-loaded search pages, I went straight to Selenium to crawl JD.
Installing dependencies
pip3 install selenium bs4 lxml requests pymysql
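Besides the Python packages, Selenium needs Chrome plus a matching ChromeDriver (recent Selenium releases can fetch the driver automatically). A minimal smoke test, purely an illustrative sketch and not part of the crawler itself, looks like this:
from selenium import webdriver

# Hypothetical smoke test: open Chrome, load JD's home page, print the title, then quit.
driver = webdriver.Chrome()
driver.get('https://www.jd.com')
print(driver.title)
driver.quit()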
Code
- Header declarations:
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import pymysql
import pymysql.cursors
import logging
import time
#
logging.basicConfig(level=logging.INFO,format="[%(asctime)s][%(levelname)s]%(message)s", datefmt='%Y-%m-%d %H:%M:%S')
#
# Create the webdriver object
chrome = webdriver.Chrome()
#
# Database settings
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'user'
DB_PASS = 'password'
DB_NAME = 'dbname'
DB_CHAR = 'utf8mb4'
TB_NAME = 'jdspider'
- Fetching the search results page:
def JD_Search_Single(keyword:str, page:int=1) -> list:
    '''
    Run a single JD search
    Args:
        keyword: search keyword
        page: page parameter; 1 = first page, 3 = second page, 5 = third page, and so on
    Returns:
        [(name, price, link), ...]
    '''
    chrome.get(f'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}')
    chrome.implicitly_wait(7)
    goodlist = Html_2_GoodList(chrome.page_source)
    return goodlist
def Html_2_GoodList(html:str) -> list:
    '''
    Parse the search page HTML into a list of goods
    Args:
        html: search page HTML
    Returns:
        [(name, price, link), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    goodslist = bs.find(name='div', attrs={'id':'J_goodsList'})
    goods = []
    for good in goodslist.find_all(name='div', attrs={'class':'gl-i-wrap'}):
        try:
            # Does not handle items whose card shows multiple tabs on the search page
            price = good.find(name='div', attrs={'class':'p-price'}).strong.i.get_text()
            name = good.find(name='div', attrs={'class':'p-name'}).a.em.get_text()
            link = good.find(name='div', attrs={'class':'p-name'}).a.attrs.get('href', 'null')
            link = f'https:{link}' if link[0] == '/' else link
            logging.info(f'Scraped list item [{price.rjust(6)}][{name}][{link}]')
            goods.append((name, price, link))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodList-{e}')
    return goods
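As a quick sanity check, the search step can be exercised on its own before wiring up the database. The snippet below is only a sketch; it assumes the definitions above have already been executed and that ChromeDriver is available:
# Hypothetical standalone test of the search step:
# fetch the first result page and print a few (name, price, link) tuples.
goods = JD_Search_Single('机器学习', page=1)
for name, price, link in goods[:5]:
    print(price, name, link)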
- Parsing the product detail page:
def JD_Good_Detail(link:str) -> list:
    '''
    Parse a JD product detail page
    Args:
        link: product URL
    Returns:
        (variant name, [product description])
    '''
    title = "Null"
    summary = []
    try:
        chrome.switch_to.window(chrome.window_handles[0])
        chrome.execute_script(f'window.open("{link}")')
        chrome.switch_to.window(chrome.window_handles[1])
        chrome.implicitly_wait(5)
        # Scroll far down so the lazily loaded description section gets rendered
        chrome.execute_script("scroll(0, 100000);")
        chrome.implicitly_wait(30)
        try:
            title = chrome.find_element(By.CSS_SELECTOR, '#choose-attr-1 > div.dd > div.item.selected > a > i').get_attribute('textContent')
        except Exception:
            title = "Null"
        summary = Html_2_GoodDetail(chrome.page_source)
        chrome.close()
        chrome.switch_to.window(chrome.window_handles[0])
        logging.info(f'Scraped item [{title}]')
    except Exception as e:
        logging.error(f'**Unexpected error [{e}]')
        Error_Alert(f'JD_Good_Detail-{e}')
    return (title, summary)
def Html_2_GoodDetail(html:str) -> list:
    '''
    Parse the product description out of the detail page HTML
    Args:
        html: product detail page HTML
    Returns:
        [(section title, section content), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    summarylist = []
    for unit in bs.find(name='div', attrs={'id':'J-detail-content'}).find_all(name='div', attrs={'class':'book-detail-item'}):
        try:
            title = unit.find(name='div', attrs={'class':'item-mt'}).h3.get_text()
            content = unit.find(name='div', attrs={'class':'book-detail-content'}).get_text().strip()
            logging.info(f'[{title}]')
            summarylist.append((title, content))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodDetail-{e}')
    return summarylist
- Entry point:
def JD_Crawl(keyword:str, limit:int=30):
    '''
    JD crawler
    Args:
        keyword: search keyword
        limit: number of items to crawl
    '''
    logging.info('Start fetching the goods list')
    count = 0
    page = 1
    while(count < limit):
        try:
            # Fetch one page of goods
            goodlist = JD_Search_Single(keyword, page)
            # JD's page parameter advances in steps of 2 (1, 3, 5, ...)
            page += 2
            for good in goodlist:
                name, price, link = good
                detail = JD_Good_Detail(link)
                Save_To_Database(name, price, link, detail[0], detail[1])
                count += 1
        except Exception as e:
            logging.error(f'**Something went wrong: {e}')
            Error_Alert(f'JD_Crawl-{e}')
- Database helpers:
def db_connect():
    '''
    Connect to the database
    '''
    db = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, password=DB_PASS, database=DB_NAME, charset=DB_CHAR)
    return db
def Save_To_Database(name:str, price:str, link:str, typename:str, gooddetail:list):
    '''
    Save one record to the database
    Args:
        name, price, link, variant name, [(section title, section content), ...]
    '''
    insert_data = f'''INSERT INTO `{DB_NAME}`.`{TB_NAME}`(
        `name`,
        `price`,
        `url`,
        `type`,
        `detail`
    )VALUES(%s, %s, %s, %s, %s);'''
    try:
        db = db_connect()
        cursor = db.cursor()
        # Parameterised query: let the driver handle quoting and escaping of every field
        result = cursor.execute(insert_data, (name, price, link, typename, str(gooddetail)))
        logging.info(f'Saved to database, return value [{result}]')
        db.commit()
        db.close()
    except Exception as e:
        logging.error(f'**Failed to write to database [{e}]')
        Error_Alert(f'Save_To_Database-{e}')
def Set_Up_Database():
    '''
    Create the data table
    '''
    create_db = f'''CREATE TABLE `{DB_NAME}`.`{TB_NAME}` (
        `id` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
        `name` VARCHAR(255) NOT NULL COMMENT 'product name',
        `price` VARCHAR(255) NOT NULL COMMENT 'price',
        `url` VARCHAR(255) NOT NULL COMMENT 'product URL',
        `type` VARCHAR(255) NOT NULL COMMENT 'product variant',
        `detail` TEXT NOT NULL COMMENT 'product description',
        PRIMARY KEY (`id`),
        UNIQUE KEY `id_UNIQUE` (`id`)
    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='JD crawler';'''
    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(create_db)
        logging.info(f'Table created successfully, return value [{result}]')
        db.close()
    except Exception as e:
        logging.error(f'**Failed to create the table [{e}]')
        Error_Alert(f'Set_Up_Database-{e}')
- Helper functions:
def Error_Alert(message:str):
    '''
    Send an error notification via the ServerChan (方糖气球) push service
    '''
    try:
        # Replace the placeholder below with your own push key (see https://sc.ftqq.com/)
        url = 'https://sc.ftqq.com/【your 方糖气球 push key】.send'
        data = {
            'text': 'Crawler error alert',
            'desp': message
        }
        requests.post(url=url, data=data)
    except Exception as e:
        logging.error(f'Push notification failed [{e}]')
    finally:
        # Back off for five minutes after any error before continuing
        time.sleep(300)
if __name__ == '__main__':
JD_Crawl('机器学习',3000)
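Note that the jdspider table has to exist before the first run; Set_Up_Database() is defined above but never called. A one-off setup run, sketched here under the assumption that the MySQL credentials at the top are valid, could look like this:
# Hypothetical one-time setup before the first real crawl:
Set_Up_Database()          # create the `jdspider` table defined above
JD_Crawl('机器学习', 30)    # then crawl a small batch to verify the pipeline end to end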
Results
To be honest, it has quite a few bugs, and using Selenium makes it painfully slow, but I can't be bothered to fix it.
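If you do want to squeeze a bit more speed out of it, one small tweak (my own suggestion, not something the script above does) is to run Chrome headless, so pages are rendered without a visible browser window:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Hypothetical tweak: start Chrome in headless mode to cut rendering overhead.
options = Options()
options.add_argument('--headless')
chrome = webdriver.Chrome(options=options)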