Preface

Another request from a certain teacher... I sat on it for two weeks and completely forgot about it until he nudged me.
Since I couldn't make sense of JD's Pjax and couldn't be bothered to analyze it, I just scraped JD directly with Selenium.

Installing dependencies

pip3 install selenium beautifulsoup4 lxml requests pymysql
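Selenium also needs a ChromeDriver binary that matches your installed Chrome version, somewhere on your PATH. A minimal smoke test, assuming the driver is already set up:

    from selenium import webdriver

    chrome = webdriver.Chrome()   # fails here if chromedriver is missing or version-mismatched
    chrome.get('https://www.jd.com')
    print(chrome.title)           # prints the JD homepage title if everything is wired up
    chrome.quit()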

Code

  1. Imports and setup:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from bs4 import BeautifulSoup
    import requests
    import pymysql
    import pymysql.cursors
    import logging
    import time

    logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s]%(message)s", datefmt='%Y-%m-%d %H:%M:%S')

    # create the WebDriver instance (assumes chromedriver is on PATH)
    chrome = webdriver.Chrome()

    # database settings
    DB_HOST = '127.0.0.1'
    DB_PORT = 3306
    DB_USER = 'user'
    DB_PASS = 'password'
    DB_NAME = 'dbname'
    DB_CHAR = 'utf8mb4'
    TB_NAME = 'jdspider'
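If you don't need to watch the browser, Chrome can also run headless. A sketch, assuming a reasonably recent Chrome (option names have varied across versions):

    # optional: replace the plain webdriver.Chrome() above with a headless instance
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    chrome = webdriver.Chrome(options=options)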
  2. Scraping the search page:

    def JD_Search_Single(keyword: str, page: int = 1) -> list:
        '''
        Run a single JD search.
        Args:
            keyword: search keyword
            page: JD's page parameter is odd-numbered: 1 = first page, 3 = second,
                  5 = third, and so on (see the sketch after this block)
        Returns:
            [(name, price, link), ...]
        '''
        chrome.get(f'https://search.jd.com/Search?keyword={keyword}&enc=utf-8&page={page}')
        chrome.implicitly_wait(7)
        goodlist = Html_2_GoodList(chrome.page_source)
        return goodlist
    def Html_2_GoodList(html: str) -> list:
        '''
        Parse the search-page HTML into a list of goods.
        Args:
            html: search-page HTML
        Returns:
            [(name, price, link), ...]
        '''
        bs = BeautifulSoup(html, 'lxml')
        goodslist = bs.find(name='div', attrs={'id': 'J_goodsList'})
        goods = []
        for good in goodslist.find_all(name='div', attrs={'class': 'gl-i-wrap'}):
            try:
                # does not handle listings that show a tab on the search page
                price = good.find(name='div', attrs={'class': 'p-price'}).strong.i.get_text()
                name = good.find(name='div', attrs={'class': 'p-name'}).a.em.get_text()
                link = good.find(name='div', attrs={'class': 'p-name'}).a.attrs.get('href', 'null')
                # protocol-relative links ("//item.jd.com/...") need the scheme prepended
                link = f'https:{link}' if link[0] == '/' else link
                logging.info(f'Scraped listing [{price.rjust(6)}][{name}][{link}]')
                goods.append((name, price, link))
            except Exception as e:
                logging.error(f'**Unexpected error [{e}]')
                Error_Alert(f'Html_2_GoodList-{e}')
        return goods
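JD numbers its search pages with odd values of the page parameter, so logical page n maps to parameter 2n - 1. A hypothetical helper, `jd_page_param`, makes the mapping explicit:

    def jd_page_param(logical_page: int) -> int:
        '''Map a 1-based logical page number to JD's odd-numbered page parameter.'''
        return 2 * logical_page - 1

    # e.g. the third page of results uses page=5:
    goodlist = JD_Search_Single('机器学习', page=jd_page_param(3))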
  3. Parsing the detail page:

    def JD_Good_Detail(link: str) -> tuple:
        '''
        Fetch a JD product detail page.
        Args:
            link: product link
        Returns:
            (variant name, [summary sections])
        '''
        # defaults, so the return statement is safe even if the fetch fails early
        title = 'Null'
        summary = []
        try:
            chrome.switch_to.window(chrome.window_handles[0])
            chrome.execute_script(f'window.open("{link}")')
            chrome.switch_to.window(chrome.window_handles[1])
            chrome.implicitly_wait(5)
            # scroll to the bottom so the lazy-loaded detail section renders;
            # note implicitly_wait only affects element lookups and does not
            # pause the script, so an actual sleep is needed here
            chrome.execute_script("scroll(0, 100000);")
            time.sleep(3)
            try:
                title = chrome.find_element(By.CSS_SELECTOR, '#choose-attr-1 > div.dd > div.item.selected > a > i').get_attribute('textContent')
            except Exception:
                title = 'Null'
            summary = Html_2_GoodDetail(chrome.page_source)
            chrome.close()
            chrome.switch_to.window(chrome.window_handles[0])
            logging.info(f'Scraped item [{title}]')
        except Exception as e:
            logging.error(f'**Unexpected error [{e}]')
            Error_Alert(f'JD_Good_Detail-{e}')
        return (title, summary)
    def Html_2_GoodDetail(html: str) -> list:
        '''
        Parse the product summary out of the detail-page HTML.
        Args:
            html: detail-page HTML
        Returns:
            [(section title, section content), ...]
        '''
        bs = BeautifulSoup(html, 'lxml')
        summarylist = []
        for unit in bs.find(name='div', attrs={'id': 'J-detail-content'}).find_all(name='div', attrs={'class': 'book-detail-item'}):
            try:
                title = unit.find(name='div', attrs={'class': 'item-mt'}).h3.get_text()
                content = unit.find(name='div', attrs={'class': 'book-detail-content'}).get_text().strip()
                logging.info(f'[{title}]')
                summarylist.append((title, content))
            except Exception as e:
                logging.error(f'**Unexpected error [{e}]')
                Error_Alert(f'Html_2_GoodDetail-{e}')
        return summarylist
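Judging by the `book-detail-item` selectors, this parser targets JD's book pages. For orientation, a hypothetical call and result shape (link and values are made up):

    detail = JD_Good_Detail('https://item.jd.com/12345678.html')  # hypothetical link
    # expected shape: ('平装', [('内容简介', '...'), ('作者简介', '...'), ('目录', '...')])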
  4. Entry point:

    def JD_Crawl(keyword: str, limit: int = 30):
        '''
        JD crawler entry point.
        Args:
            keyword: search keyword
            limit: number of items to scrape
        '''
        logging.info('Starting to fetch the goods list')
        count = 0
        page = 1
        while count < limit:
            try:
                # fetch one page of goods
                goodlist = JD_Search_Single(keyword, page)
                page += 2  # JD's page parameter only takes odd values
                if not goodlist:
                    # no more results; bail out instead of looping forever
                    break
                for good in goodlist:
                    name, price, link = good
                    detail = JD_Good_Detail(link)
                    Save_To_Database(name, price, link, detail[0], detail[1])
                    count += 1
            except Exception as e:
                logging.error(f'**Error: {e}')
                Error_Alert(f'JD_Crawl-{e}')
  5. Database helpers:

    def db_connect():
        '''
        Connect to the database.
        '''
        db = pymysql.connect(host=DB_HOST, port=DB_PORT, user=DB_USER, passwd=DB_PASS, db=DB_NAME, charset=DB_CHAR)
        return db

    def Save_To_Database(name: str, price: str, link: str, typename: str, gooddetail: list):
        '''
        Write one item to the database.
        Args:
            (name, price, link, variant name, [(section title, section content), ...])
        '''
        # parameterized query: let the driver do the escaping rather than
        # interpolating values into the SQL string (which invites injection)
        insert_data = f'''INSERT INTO `{DB_NAME}`.`{TB_NAME}`(
                            `name`,
                            `price`,
                            `url`,
                            `type`,
                            `detail`
                        )VALUES(%s, %s, %s, %s, %s);'''
        try:
            db = db_connect()
            cursor = db.cursor()
            result = cursor.execute(insert_data, (name, price, link, typename, str(gooddetail)))
            logging.info(f'Database write OK, return value [{result}]')
            db.commit()
            db.close()
        except Exception as e:
            logging.error(f'**Database write failed [{e}]')
            Error_Alert(f'Save_To_Database-{e}')
    def Set_Up_Database():
        '''
        Create the table (despite the name, the database itself must already exist).
        '''
        create_db = f'''CREATE TABLE `{DB_NAME}`.`{TB_NAME}` (
                        `id` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT,
                        `name` VARCHAR(255) NOT NULL COMMENT 'product name',
                        `price` VARCHAR(255) NOT NULL COMMENT 'product price',
                        `url` VARCHAR(255) NOT NULL COMMENT 'product link',
                        `type` VARCHAR(255) NOT NULL COMMENT 'product variant',
                        `detail` TEXT NOT NULL COMMENT 'product detail',
                        PRIMARY KEY (`id`),
                        UNIQUE KEY `id_UNIQUE` (`id`)
                        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='JD crawler';'''
        try:
            db = db_connect()
            cursor = db.cursor()
            result = cursor.execute(create_db)
            logging.info(f'Table created, return value [{result}]')
            db.close()
        except Exception as e:
            logging.error(f'**Table creation failed [{e}]')
            Error_Alert(f'Set_Up_Database-{e}')
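A quick way to confirm rows are actually landing; a minimal sketch, assuming Set_Up_Database() has been run and a crawl has written something:

    # print a few stored rows to verify writes
    db = db_connect()
    cursor = db.cursor()
    cursor.execute(f'SELECT `name`, `price`, `url` FROM `{DB_NAME}`.`{TB_NAME}` LIMIT 5')
    for row in cursor.fetchall():
        print(row)
    db.close()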
  6. Helper functions:

    def Error_Alert(message: str):
        '''
        Push an error notification via ServerChan.
        '''
        try:
            url = 'https://sc.ftqq.com/【your ServerChan push key, get one at https://sc.ftqq.com/】.send'
            data = {
                'text': 'Crawler error alert',
                'desp': message
            }
            requests.post(url=url, data=data)
        except Exception as e:
            logging.error(f'Push notification failed [{e}]')
        finally:
            # back off for five minutes after any error
            time.sleep(300)

    if __name__ == '__main__':
        # run Set_Up_Database() once beforehand to create the table
        JD_Crawl('机器学习', 3000)

Results

(screenshot: crawl results)
Honestly, it's pretty buggy, and using Selenium makes it terribly slow, but I can't be bothered to fix it.

Last modified: November 27, 2019