0
点赞
收藏
分享

微信扫一扫

python - JD spider

小飞侠熙熙 2022-10-28 阅读 209


# !/usr/bin/env python
# coding=utf-8

import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from pyquery import PyQuery as pq

from main.config import *
import pymongo

# Module-level MongoDB handles shared by save_to_mongo(): connect to MONGO_URL
# and select the MONGO_DB database.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# Browser
# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# Set the browser window size
# browser.set_window_size(1400, 900)
# A single Chrome instance is created at import time and reused by every
# scraping function below.
browser = webdriver.Chrome()
wait = WebDriverWait(browser, 10) # shared explicit wait; until() blocks until its condition holds (10 s timeout)

# detail_browser = webdriver.Chrome() # browser for visiting product detail pages
def search(max_retries=3):
    """Open jd.com, search for ``KEYWORD`` and scrape the first result page.

    Each attempt loads the home page, types ``KEYWORD`` into the ``#key``
    input, clicks the search button, waits for the pager's total-page element
    and then calls ``get_products()`` on the page that is showing.

    Args:
        max_retries: How many extra attempts to make after a
            ``TimeoutException``. Negative values are treated as 0.

    Returns:
        The text of the pager's total-page element (the caller converts it
        to ``int``).

    Raises:
        TimeoutException: If every attempt timed out. The previous
            implementation retried by calling itself without a limit, which
            could only end in ``RecursionError``.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        print("搜索,")
        try:
            # Reload the home page on every attempt so a retry starts clean.
            browser.get("https://www.jd.com")
            # Search input box.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#key"))
            )
            # Submit button.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#search > div > div.form > button")
                )
            )
            search_box.send_keys(KEYWORD)
            submit.click()
            # Element holding the total number of result pages.
            total = wait.until(
                EC.presence_of_element_located(
                    (
                        By.CSS_SELECTOR,
                        "#J_bottomPage > span.p-skip > em:nth-child(1) > b",
                    )
                )
            )
            get_products()
            return total.text
        except TimeoutException:
            # Give up once the retry budget is spent instead of looping forever.
            if attempt == attempts:
                raise

def next_page(page_number, max_retries=3):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's skip box, clicks the pager's
    confirm link, waits until the highlighted pager entry (``a.curr``)
    contains the page number, then calls ``get_products()``.

    Args:
        page_number: Page to navigate to; formatted with ``%d`` and sent to
            the skip box as ``str(page_number)``.
        max_retries: How many extra attempts to make after a
            ``TimeoutException``. Negative values are treated as 0.

    Raises:
        TimeoutException: If every attempt timed out. The previous
            implementation retried by calling itself without a limit, which
            could only end in ``RecursionError``.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        print("第%d页," % page_number)
        try:
            # Page-number input box in the bottom pager.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
                )
            )
            # Confirm link next to the input box.
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
                )
            )
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # text_to_be_present_in_element(locator, text) checks the text of
            # the located element; text_to_be_present_in_element_value checks
            # its value instead. Here: wait until the current-page marker
            # shows the requested page number.
            wait.until(
                EC.text_to_be_present_in_element(
                    (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr"),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            # Give up once the retry budget is spent instead of looping forever.
            if attempt == attempts:
                raise
# A price with exactly two decimals, e.g. "2999.00"; compiled once, not per item.
_PRICE_PATTERN = re.compile(r"(\d+\.\d{2})")


def get_products():
    """Parse the current result page and store every listed product.

    Waits for at least one ``#J_goodsList .gl-item`` element, parses
    ``browser.page_source`` with PyQuery and passes one dict per item to
    ``save_to_mongo()``. Each dict has the keys ``price``, ``detail``,
    ``shop`` and ``commit``, taken from the item's ``.p-price``, ``.p-name``,
    ``.p-shop`` and ``.p-commit`` elements.

    ``price`` is the first ``digits.dd`` match in the price text, or ``None``
    when the text contains no such number. Previously a missing match raised
    ``AttributeError`` and aborted the whole crawl.

    Raises:
        TimeoutException: If no product item appears before the shared wait
            times out.
    """
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "#J_goodsList .gl-item"))
    )
    doc = pq(browser.page_source)
    for item in doc("#J_goodsList .gl-item").items():
        price_match = _PRICE_PATTERN.search(item.find(".p-price").text())
        product = {
            "price": price_match.group(1) if price_match else None,
            "detail": item.find(".p-name").text(),
            "shop": item.find(".p-shop").text(),
            "commit": item.find(".p-commit").text(),
        }
        save_to_mongo(product)

def save_to_mongo(res):
    """Insert one document into the ``MONGO_TABLE`` collection of ``db``.

    Uses ``Collection.insert_one``; the legacy ``Collection.insert`` was
    deprecated in PyMongo 3 and removed in PyMongo 4, where calling it raises
    and every save would be reported as an error.

    Args:
        res: The document (a dict) to store.

    Storing is best-effort: any exception is reported on stdout together with
    the document, and is not re-raised, so one failed insert does not stop the
    crawl.
    """
    try:
        db[MONGO_TABLE].insert_one(res)
    except Exception as exc:
        # Include the exception so failures can be diagnosed.
        print("error!", res, exc)
    else:
        print("ok!", res)
# def get_product(url):
# try:
# # 访问jd
# detail_browser.get("https:" + url)
# wait.until(
# EC.presence_of_element_located((By.CSS_SELECTOR, "body > div:nth-child(9) > div"))
# )
# html = detail_browser.page_source
# doc = pq(html)
# item = doc("body > div:nth-child(9) > div > div.itemInfo-wrap")
# product = {
# "title" : item.find(".sku-name").text(),
# "price" : item.find(".price").text(),
# "shop" : item.find("#summary-service .hl_red").text(),
# "location" : item.find("#summary-service").text()
# }
# print(product)
# return product
# except TimeoutException:
# return get_product()

def main():
    """Run the crawl: search, then walk result pages 2..total.

    ``search()`` scrapes the first page and returns the total page count as
    text. Any exception stops the crawl and is reported on stdout. The
    browser is always shut down afterwards.
    """
    try:
        total = int(search())
        print(total)
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        # Top-level boundary: report what went wrong instead of hiding it.
        print("error,", exc)
    finally:
        # quit() ends the WebDriver session and closes every window;
        # close() only closes the current window.
        browser.quit()


if __name__ == "__main__":
    main()


# Settings read by the spider code above.
# NOTE(review): these look like the contents of main/config.py, which the
# script pulls in via `from main.config import *` — confirm.
# MONGO_URL is passed to pymongo.MongoClient; MONGO_DB selects the database
# on that client; MONGO_TABLE selects the collection that products go into.
MONGO_URL = 'localhost'
MONGO_DB = 'jd'
MONGO_TABLE = 'product'

# Do not load images; enable the disk cache
# SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# Search term typed into the JD search box ('电视' means "television").
KEYWORD = '电视'



举报

相关推荐

0 条评论