# Scrape Douban Books (豆瓣读书) data with Python
import requests #进行请求
from lxml import etree #解析数据
#发送请求 获取数据
def get_html(url, timeout=10):
    """Request a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Defaults to 10.

    Returns:
        The response body as ``str`` (``Response.text``). The HTTP status
        code is not inspected, so an error page is returned like any other.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (this includes ``Timeout`` when the server
            does not answer within ``timeout`` seconds).
    """
    # Browser-like headers so the request presents itself as Chrome
    # arriving from the Douban home page.
    db_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/96.0.4664.110 Safari/537.36'
        ),
        'referer': 'https://www.douban.com/',
    }
    # An explicit timeout keeps a stalled connection from blocking forever.
    db_novel = requests.get(url=url, headers=db_headers, timeout=timeout)
    return db_novel.text
# 定义函数:解析数据(提取需要的数据)
def parse_html(db_html):
    """Extract publication info, ratings and titles from a listing page.

    Args:
        db_html: HTML text of the page to parse.

    Returns:
        A ``(basic_info, rating_title)`` tuple.

        ``basic_info`` holds one list per ``div.pub`` text node found under
        ``ul.subject-list``: the text with newlines removed, stripped, and
        split on ``"/"``.

        ``rating_title`` holds one ``(rating, title)`` tuple per book entry
        that carries a titled link. ``rating`` is the text of that entry's
        ``rating_nums`` span, or ``None`` when the entry has no such span.
        Empty or unparsable input yields ``([], [])``.
    """
    # Nothing to parse: avoid handing an empty document to the HTML parser.
    if not db_html or not db_html.strip():
        return [], []

    tree = etree.HTML(db_html)
    # Defensive: bail out if the parser produced no document root.
    if tree is None:
        return [], []

    pub_texts = tree.xpath(
        '//ul[@class="subject-list"]/li/div[@class="info"]'
        '/div[@class="pub"]/text()'
    )
    basic_info = [
        text.replace("\n", "").strip().split("/") for text in pub_texts
    ]

    # Pair rating and title per entry. Zipping two page-wide lists shifts
    # every later rating onto the wrong title as soon as one entry has no
    # rating span.
    rating_title = []
    entries = tree.xpath('//ul[@class="subject-list"]/li/div[@class="info"]')
    for entry in entries:
        titles = entry.xpath('.//a/@title')
        if not titles:
            continue
        ratings = entry.xpath(
            'div[@class="star clearfix"]/span[@class="rating_nums"]/text()'
        )
        rating = ratings[0] if ratings else None
        rating_title.append((rating, titles[0]))

    return basic_info, rating_title
#观察网址的变化,以便获取多页的数据
# Results gathered across all pages. The per-page names assigned inside the
# loop are overwritten on every iteration, so without these accumulators
# only the last page's data would remain once the loop ends.
all_basic_info = []
all_rating_title = []

# Pages are selected through the "start" query parameter; this walks the
# offsets 0, 20, 40, 60 and 80.
for page in range(0, 100, 20):
    basic_url = "https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={}&type=T".format(page)
    db_html = get_html(basic_url)
    # NOTE: the spelling "rating_tile" is kept because it is a module-level
    # name that code elsewhere in the file may already reference.
    basic_info, rating_tile = parse_html(db_html)
    all_basic_info.extend(basic_info)
    all_rating_title.extend(rating_tile)