Scraping JD.com with Selenium

2019-11-15T19:36:00

Preface

Another favor for a certain teacher... I put it off for two weeks and had completely forgotten about it until he nudged me.
Since I couldn't make sense of (and couldn't be bothered to analyze) JD's Pjax, I just scraped JD directly with Selenium.

Install dependencies

pip3 install selenium bs4 lxml requests pymysql

The code below also uses the lxml parser and requests, so install those too; you additionally need a ChromeDriver binary matching your local Chrome on the PATH.

Code

  1. Header declarations:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import pymysql
import logging
import time

logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s]%(message)s", datefmt='%Y-%m-%d %H:%M:%S')

# create the webdriver object
chrome = webdriver.Chrome()

# database settings
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'user'
DB_PASS = 'password'
DB_NAME = 'dbname'
DB_CHAR = 'utf8mb4'
TB_NAME = 'jdspider'
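Side note: if you don't need to watch the browser work, Chrome can run headless. A minimal sketch that could replace the plain webdriver.Chrome() call above (these are standard ChromeOptions flags; I haven't tested how JD's bot detection reacts to headless mode):

options = webdriver.ChromeOptions()
options.add_argument('--headless')               # no visible window
options.add_argument('--window-size=1920,1080')  # give pages a real viewport
chrome = webdriver.Chrome(options=options)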
  2. Fetch the search page:
def JD_Search_Single(keyword: str, page: int = 1) -> list:
    '''
    Run a single JD search.
    Args:
        keyword
        page number; JD numbers search pages oddly: 1 = first page, 3 = second, 5 = third, and so on
    Returns:
        [(name, price, link), ...]
    '''
    chrome.get(f'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}')
    chrome.implicitly_wait(7)
    goodlist = Html_2_GoodList(chrome.page_source)
    return goodlist
def Html_2_GoodList(html: str) -> list:
    '''
    Parse the search-page HTML into a list of goods.
    Args:
        search-page HTML
    Returns:
        [(name, price, link), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    goodslist = bs.find(name='div', attrs={'id': 'J_goodsList'})
    goods = []
    for good in goodslist.find_all(name='div', attrs={'class': 'gl-i-wrap'}):
        try:
            # does not handle listings that show a tab on the search page
            price = good.find(name='div', attrs={'class': 'p-price'}).strong.i.get_text()
            name = good.find(name='div', attrs={'class': 'p-name'}).a.em.get_text()
            link = good.find(name='div', attrs={'class': 'p-name'}).a.attrs.get('href', 'null')
            link = f'https:{link}' if link[0] == '/' else link
            logging.info(f'Scraped listing [{price.rjust(6)}][{name}][{link}]')
            goods.append((name, price, link))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodList-{e}')
    return goods
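For a quick sanity check of the search step, something like this works (it uses nothing beyond the functions above; tuples come back in the (name, price, link) shape documented in the docstring):

goods = JD_Search_Single('python', page=1)
for name, price, link in goods[:3]:
    print(price, name, link)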
  3. Parse the detail page:
def JD_Good_Detail(link: str) -> tuple:
    '''
    JD product detail page.
    Args:
        product link
    Returns:
        (variant name, [product summary])
    '''
    title, summary = 'Null', []
    try:
        chrome.switch_to.window(chrome.window_handles[0])
        chrome.execute_script(f'window.open("{link}")')
        chrome.switch_to.window(chrome.window_handles[1])
        chrome.implicitly_wait(5)
        # scroll to the bottom to trigger lazy loading of the detail section
        chrome.execute_script("scroll(0, 100000);")
        # implicitly_wait() only affects element lookups, it does not pause
        # the script, so actually give the lazy-loaded content a moment
        time.sleep(3)
        try:
            title = chrome.find_element(By.CSS_SELECTOR, '#choose-attr-1 > div.dd > div.item.selected > a > i').get_attribute('textContent')
        except Exception:
            title = 'Null'
        summary = Html_2_GoodDetail(chrome.page_source)
        chrome.close()
        chrome.switch_to.window(chrome.window_handles[0])
        logging.info(f'Scraped product [{title}]')
    except Exception as e:
        logging.error(f'**Unexpected error [{e}]')
        Error_Alert(f'JD_Good_Detail-{e}')
    return (title, summary)
def Html_2_GoodDetail(html: str) -> list:
    '''
    Parse the product summary out of the detail-page HTML.
    Args:
        detail-page HTML
    Returns:
        [(section title, section content), ...]
    '''
    bs = BeautifulSoup(html, 'lxml')
    summarylist = []
    for unit in bs.find(name='div', attrs={'id': 'J-detail-content'}).find_all(name='div', attrs={'class': 'book-detail-item'}):
        try:
            title = unit.find(name='div', attrs={'class': 'item-mt'}).h3.get_text()
            content = unit.find(name='div', attrs={'class': 'book-detail-content'}).get_text().strip()
            logging.info(f'[{title}]')
            summarylist.append((title, content))
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'Html_2_GoodDetail-{e}')
    return summarylist
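The fixed time.sleep() in JD_Good_Detail is a blunt instrument: too short and the lazy-loaded detail section isn't in the DOM yet, too long and every product wastes time. A more deterministic sketch using Selenium's standard WebDriverWait (the J-detail-content id is the same one the parser above looks for):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block (up to 30s) until the lazy-loaded detail section actually exists
WebDriverWait(chrome, 30).until(
    EC.presence_of_element_located((By.ID, 'J-detail-content')))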
  4. Entry point:
def JD_Crawl(keyword: str, limit: int = 30):
    '''
    JD crawler.
    Args:
        keyword
        number of items to fetch
    '''
    logging.info('Start fetching the goods list')
    count = 0
    page = 1
    while count < limit:
        try:
            # fetch one page of goods
            goodlist = JD_Search_Single(keyword, page)
            page += 2  # JD search pages are odd-numbered: 1, 3, 5, ...
            for good in goodlist:
                name, price, link = good
                detail = JD_Good_Detail(link)
                Save_To_Database(name, price, link, detail[0], detail[1])
                count += 1
                if count >= limit:
                    break
        except Exception as e:
            logging.error(f'**Something went wrong [{e}]')
            Error_Alert(f'JD_Crawl-{e}')
  5. Database functions:
def db_connect():
    '''
    Connect to the database.
    '''
    db = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, password=DB_PASS, db=DB_NAME, charset=DB_CHAR)
    return db
def Save_To_Database(name: str, price: str, link: str, typename: str, gooddetail: list):
    '''
    Save one record to the database.
    Args:
        (name, price, link, variant name, [(section title, section content), ...])
    '''
    # parameterized query: quotes in the scraped data can't break the SQL
    insert_data = f'''INSERT INTO `{DB_NAME}`.`{TB_NAME}`(
                        `name`,
                        `price`,
                        `url`,
                        `type`,
                        `detail`
                    )VALUES(%s, %s, %s, %s, %s);'''
    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(insert_data, (name, price, link, typename, str(gooddetail)))
        logging.info(f'Database write OK, return value [{result}]')
        db.commit()
        db.close()
    except Exception as e:
        logging.error(f'**Database write failed [{e}]')
        Error_Alert(f'Save_To_Database-{e}')
def Set_Up_Database():
    '''
    Create the data table (run once before the first crawl).
    '''
    create_db = f'''CREATE TABLE `{DB_NAME}`.`{TB_NAME}` (
                    `id` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                    `name` VARCHAR(255) NOT NULL COMMENT 'product name',
                    `price` VARCHAR(255) NOT NULL COMMENT 'product price',
                    `url` VARCHAR(255) NOT NULL COMMENT 'product link',
                    `type` VARCHAR(255) NOT NULL COMMENT 'product sub-type',
                    `detail` TEXT NOT NULL COMMENT 'product details',
                    PRIMARY KEY (`id`),
                    UNIQUE KEY `id_UNIQUE` (`id`)
                    )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='JD crawler';'''

    try:
        db = db_connect()
        cursor = db.cursor()
        result = cursor.execute(create_db)
        logging.info(f'Table created, return value [{result}]')
        db.close()
    except Exception as e:
        logging.error(f'**Table creation failed [{e}]')
        Error_Alert(f'Set_Up_Database-{e}')
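Set_Up_Database() has to run once before the first crawl, otherwise the INSERTs have no table to land in; something along these lines (a one-off sketch, the keyword and limit are just examples):

Set_Up_Database()          # one-off: create the jdspider table
JD_Crawl('机器学习', 30)   # then crawl as usual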
  6. Helper functions:
def Error_Alert(message: str):
    '''
    Push an error notification via ServerChan.
    '''
    try:
        url = 'https://sc.ftqq.com/<your ServerChan push key, get one at https://sc.ftqq.com/>.send'
        data = {
            'text': 'Crawler error alert',
            'desp': message
            }
        requests.post(url=url, data=data)
    except Exception as e:
        logging.error(f'Push failed [{e}]')
    finally:
        time.sleep(300)  # back off for five minutes so alerts don't flood
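To sanity-check the push channel before a long run, you can hit the same ServerChan endpoint by hand (the key placeholder is yours to fill in; 'text' and 'desp' are the two fields the service expects, as used above):

import requests
resp = requests.post(
    'https://sc.ftqq.com/<your ServerChan push key>.send',
    data={'text': 'test push', 'desp': 'hello from the crawler'})
print(resp.status_code, resp.text)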
if __name__ == '__main__':
    JD_Crawl('机器学习', 3000)  # keyword: "machine learning"

Results


Honestly, it's pretty buggy, and going through Selenium makes it painfully slow, but I can't be bothered to fix it.
