0
点赞
收藏
分享

微信扫一扫

利用Selenium爬取京东商品信息

       京东的商品图片做了懒加载。懒加载的原理是:先给 img 标签设置一个自定义属性(下面代码读取的是 data-lazy-img),属性值就是图片链接;页面脚本检测浏览器的滚动高度,滚动到一定位置时,才把自定义属性里的图片链接写入图片的 src 属性。随后为了达到反爬效果【实际上没什么用,随便猜都能猜到】,再把自定义属性的值替换成其他值。因此抓取图片地址时先读 src,src 为空时再读 data-lazy-img。

import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options


def JD_SOU(SOU_KEY, max_pages=None):
    """Search jd.com for a keyword and print details of every listed product.

    A headless Chrome session opens the JD home page, types ``SOU_KEY`` into
    the search box, and then walks the result pages by clicking the
    "下一页" (next page) link, printing one record per product.

    Args:
        SOU_KEY: Search keyword; converted with ``str()`` before being typed.
        max_pages: Optional upper bound on the number of result pages to
            print. ``None`` (the default) keeps paging until locating or
            clicking the next-page link raises.

    Returns:
        None. Results are written to stdout.

    Any exception raised while navigating is printed rather than propagated,
    and the browser session is always shut down.
    """
    # Local import keeps this function self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    def build_options():
        """Return Chrome options for a headless run with image loading disabled."""
        chrome_options = Options()
        chrome_options.add_argument('window-size=1920x3000')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--hide-scrollbars')
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        chrome_options.add_argument('--headless')
        return chrome_options

    def print_page(browser):
        """Print every product found on the currently loaded result page."""
        items = browser.find_elements(By.CSS_SELECTOR, '.gl-warp li')
        print(len(items))
        for item in items:
            try:
                url = item.find_element(By.CSS_SELECTOR, '.p-name a').get_attribute('href')
                price = item.find_element(By.CSS_SELECTOR, '.p-price strong i').text
                title = item.find_element(By.CSS_SELECTOR, '.p-name em').text
                # Look each element up once and read both properties from it.
                shop_link = item.find_element(By.CSS_SELECTOR, '.p-shop a')
                shop_home = shop_link.get_attribute('href')
                shop_name = shop_link.text
                comment_link = item.find_element(By.CSS_SELECTOR, '.p-commit strong a')
                commit_num = comment_link.text
                commit_url = comment_link.get_attribute('href')
                img = item.find_element(By.CSS_SELECTOR, '.p-img img')
                img_url = img.get_attribute('src')
                if not img_url:
                    # Lazy-loaded image: fall back to the custom attribute.
                    # get_attribute() returns None when it is absent, so
                    # guard before concatenating the scheme.
                    lazy = img.get_attribute('data-lazy-img') or ''
                    img_url = 'https:' + lazy if lazy.startswith('//') else lazy
                yh = item.find_element(By.CSS_SELECTOR, '.p-icons').text.split('\n')
                print("""
                商品主页:%s
                商品价格:%s
                商品标题:%s
                商家主页:%s
                商家名称:%s
                评论数量:%s
                评论地址:%s
                商品图片:%s
                优惠标签:%s
                """ % (url, price, title, shop_home, shop_name, commit_num, commit_url, img_url, yh))
            except WebDriverException:
                # Best effort: an <li> that lacks one of the expected
                # sub-elements (or went stale) is skipped, not fatal.
                continue

    browser = webdriver.Chrome(options=build_options())
    browser.implicitly_wait(10)
    try:
        browser.get('https://www.jd.com/')
        s_input = browser.find_element(By.ID, 'key')
        s_input.send_keys(str(SOU_KEY))
        s_input.send_keys(Keys.ENTER)

        # Iterate instead of recursing so a long result set cannot exhaust
        # the interpreter's recursion limit.
        page = 1
        while True:
            print_page(browser)
            if max_pages is not None and page >= max_pages:
                break
            browser.find_element(By.PARTIAL_LINK_TEXT, '下一页').click()
            time.sleep(0.5)
            page += 1
    except Exception as e:
        # Top-level boundary: report the failure (including the missing
        # next-page link that ends an unbounded crawl) and fall through.
        print(e)
    finally:
        # quit() ends the whole driver session; close() only closes the
        # current window.
        browser.quit()


# Script entry point: run the scraper for a sample search keyword.
JD_SOU('小黑子立牌')

举报

相关推荐

0 条评论