Preface
Another favor for a certain teacher... I put it off for two weeks and had completely forgotten about it until he nudged me.
Since I couldn't make sense of JD's Pjax and didn't feel like reverse-engineering it, I just scraped JD directly with Selenium.
Install the dependencies
pip3 install selenium bs4 lxml requests pymysql
The script also drives a real Chrome instance, so you need a ChromeDriver binary matching your Chrome version on the PATH.
Code
- Header declarations:
from selenium import webdriver
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
import requests
import pymysql
import pymysql.cursors
import logging
import time

logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s]%(message)s", datefmt='%Y-%m-%d %H:%M:%S')

# Create the webdriver object
chrome = webdriver.Chrome()

# Database settings
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'user'
DB_PASS = 'password'
DB_NAME = 'dbname'
DB_CHAR = 'utf8mb4'
TB_NAME = 'jdspider'
- Scraping the search page:
def JD_Search_Single(keyword: str, page: int = 1) -> list:
    '''
    Run a single JD search request.
    Args:
        keyword: search keyword
        page: JD's page parameter (1 = first page, 3 = second page, 5 = third page, and so on)
    Returns:
        [(name, price, link), ...]
    '''
    chrome.get(f'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}')
    chrome.implicitly_wait(7)
    goodlist = Htmt_2_GoodList(chrome.page_source)
    return goodlist
def Htmt_2_GoodList(html: str) -> list:
    '''
    Parse the search-page HTML into a list of goods.
    Args:
        html: search-page HTML
    Returns:
        [(name, price, link), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    goodslist = bs.find(name='div', attrs={'id': 'J_goodsList'})
    goods = []
    for good in goodslist.find_all(name='div', attrs={'class': 'gl-i-wrap'}):
        try:
            # Does not handle listings that show a tab bar on the search page
            price = good.find(name='div', attrs={'class': 'p-price'}).strong.i.get_text()
            name = good.find(name='div', attrs={'class': 'p-name'}).a.em.get_text()
            link = good.find(name='div', attrs={'class': 'p-name'}).a.attrs.get('href', 'null')
            link = f'https:{link}' if link[0] == '/' else link
            logging.info(f'Scraped list item [{price.rjust(6)}][{name}][{link}]')
            goods.append((name, price, link))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Htmt_2_GoodList-{e}')
    return goods
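JD's page parameter is quirky: only odd values map to visible result pages (1, 3, 5, ... for pages 1, 2, 3, ...), which is why the entry point below advances it by 2. A quick usage sketch, with the keyword chosen purely as an example:

# Hypothetical example: fetch the second results page for a sample keyword
goods = JD_Search_Single('python', page=3)
for name, price, link in goods:
    print(name, price, link)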
- Parsing the detail page:
def JD_Good_Detail(link: str) -> list:
    '''
    Open a JD product detail page in a new tab and extract its details.
    Args:
        link: product URL
    Returns:
        (product sub-type title, [product summary])
    '''
    # Default values so the return statement works even if the page fails to load
    title = 'Null'
    summary = []
    try:
        chrome.switch_to_window(chrome.window_handles[0])
        chrome.execute_script(f'window.open("{link}")')
        chrome.switch_to_window(chrome.window_handles[1])
        chrome.implicitly_wait(5)
        chrome.execute_script("scroll(0, 100000);")
        chrome.implicitly_wait(30)
        try:
            title = chrome.find_element_by_css_selector('#choose-attr-1 > div.dd > div.item.selected > a > i').get_attribute('textContent')
        except Exception:
            title = "Null"
        summary = Html_2_GoodDetail(chrome.page_source)
        chrome.close()
        chrome.switch_to_window(chrome.window_handles[0])
        logging.info(f'Scraped product [{title}]')
    except Exception as e:
        logging.error(f'**Unexpected error [{e}]')
        Error_Alert(f'JD_Good_Detail-{e}')
    return (title, summary)
def Html_2_GoodDetail(html: str) -> list:
    '''
    Parse the product summary out of the detail-page HTML.
    Args:
        html: product detail-page HTML
    Returns:
        [(section title, section content), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    summarylist = []
    for unit in bs.find(name='div', attrs={'id': 'J-detail-content'}).find_all(name='div', attrs={'class': 'book-detail-item'}):
        try:
            title = unit.find(name='div', attrs={'class': 'item-mt'}).h3.get_text()
            content = unit.find(name='div', attrs={'class': 'book-detail-content'}).get_text().strip()
            logging.info(f'[{title}]')
            summarylist.append((title, content))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodDetail-{e}')
    return summarylist
- Entry point:
def JD_Crawl(keyword: str, limit: int = 30):
    '''
    JD crawler entry point.
    Args:
        keyword: search keyword
        limit: number of items to crawl
    '''
    logging.info('Start fetching the product list')
    count = 0
    page = 1
    while count < limit:
        try:
            # Fetch one page of results; JD's page parameter advances by 2 per page
            goodlist = JD_Search_Single(keyword, page)
            page += 2
            for good in goodlist:
                name, price, link = good
                detail = JD_Good_Detail(link)
                Save_To_Database(name, price, link, detail[0], detail[1])
                count += 1
                if count >= limit:
                    break
        except Exception as e:
            logging.error(f'**Something went wrong: {e}')
            Error_Alert(f'JD_Crawl-{e}')
- Database helpers:
def db_connect():
    '''
    Connect to the database.
    '''
    db = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, passwd=DB_PASS, db=DB_NAME, charset=DB_CHAR)
    return db
def Save_To_Database(name: str, price: str, link: str, typename: str, gooddetail: list):
    '''
    Save one record to the database.
    Args:
        (name, price, link, sub-type name, [(section title, section content), ...])
    '''
    # Parameterised query, so quotes in product names can't break the SQL
    insert_data = f'''INSERT INTO `{DB_NAME}`.`{TB_NAME}`
        (`name`, `price`, `url`, `type`, `detail`)
        VALUES (%s, %s, %s, %s, %s);'''
    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(insert_data, (name, price, link, typename, str(gooddetail)))
        logging.info(f'Saved to database, return value [{result}]')
        db.commit()
        db.close()
    except Exception as e:
        logging.error(f'**Database insert failed [{e}]')
        Error_Alert(f'Save_To_Database-{e}')
def Set_Up_Database():
    '''
    Create the data table (run once before the first crawl).
    '''
    create_db = f'''CREATE TABLE `{DB_NAME}`.`{TB_NAME}` (
        `id` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
        `name` VARCHAR(255) NOT NULL COMMENT 'product name',
        `price` VARCHAR(255) NOT NULL COMMENT 'product price',
        `url` VARCHAR(255) NOT NULL COMMENT 'product link',
        `type` VARCHAR(255) NOT NULL COMMENT 'product sub-type',
        `detail` TEXT NOT NULL COMMENT 'product details',
        PRIMARY KEY (`id`),
        UNIQUE KEY `id_UNIQUE` (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='JD crawler';'''
    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(create_db)
        logging.info(f'Table created, return value [{result}]')
        db.close()
    except Exception as e:
        logging.error(f'**Failed to create table [{e}]')
        Error_Alert(f'Set_Up_Database-{e}')
- Helper functions:
def Error_Alert(message: str):
    '''
    Push an error notification via ServerChan.
    '''
    try:
        # Replace the placeholder below with your own ServerChan (方糖气球) push key, available at https://sc.ftqq.com/
        url = 'https://sc.ftqq.com/【方糖气球推送码】.send'
        data = {
            'text': 'Crawler error alert',
            'desp': message
        }
        requests.post(url=url, data=data)
    except Exception as e:
        logging.error('Push notification failed')
    finally:
        # Back off for 5 minutes after any error
        time.sleep(300)
if __name__ == '__main__':
    # Note: run Set_Up_Database() once beforehand to create the table
    JD_Crawl('机器学习', 3000)
Results
To be honest, it has quite a few bugs, and using Selenium makes it painfully slow, but I can't be bothered to fix it.
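If you want to shave off a bit of the overhead without rewriting the Pjax logic, running Chrome headless is a small, low-effort win. A minimal sketch using standard Selenium/Chrome options, which you would swap in for the chrome = webdriver.Chrome() line in the header declarations:

from selenium import webdriver

# Headless Chrome: no visible browser window, slightly less overhead
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
chrome = webdriver.Chrome(options=options)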
Permalink: https://blog.chrxw.com/archives/2019/11/15/679.html
Please keep this link when reposting. Thanks.