import re
from time import sleep

import requests
from bs4 import BeautifulSoup

url = 'https://www.hexuexiao.cn/meinv/guzhuang/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}
resp = requests.get(url, headers=headers)
html = resp.text
soup = BeautifulSoup(html, 'lxml')
urls_a = soup.find_all('a')
for url_a in urls_a:
    pattern = url_a.get('href', '')  # .get() avoids a KeyError for <a> tags without href
    if "list-3.html" in pattern:
        print(pattern)  # print pagination links for reference
    # keep only links of the form /xxx/yyy/ (album home pages)
    matches = re.findall(r'/[a-z]+/[a-z]+/$', pattern)
    if not matches:
        continue
    home_url = 'https://www.hexuexiao.cn' + matches[0]
    resp = requests.get(home_url, headers=headers)
    html = resp.text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    # for img in urls:
    #     img_name = img['title']
    #     img_src = img['src']
    #     # thumbnails end in 300.jpg; swap the suffix for source.jpg to get the full-size file
    #     idx = img_src.rfind("300.jpg")
    #     img_url = img_src[:idx] + img_src[idx:].replace("300.jpg", "source.jpg")
    #     img_photo = requests.get(img_url, headers=headers).content
    #     with open(img_name + '.jpg', 'wb') as f:
    #         f.write(img_photo)
    #     print("Finished downloading image: " + img_name)
    #     sleep(1)  # be polite between downloads
Regular expressions, BS4, XPath, and CSS are probably the four selectors most commonly used in Python. I had used CSS and regex before, though only superficially; today the focus was mainly on bs4. From a beginner's point of view, bs4 seems easier to pick up, but the downside is that what it pulls back tends to be plentiful and messy, so it needs to be paired with regex. Either way, all of these need more practice.
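As a rough illustration of pairing bs4 with regex and CSS selectors, here is a minimal sketch; the HTML snippet is made up for illustration and only mirrors the tags used in the script above, it is not taken from the real site:

import re
from bs4 import BeautifulSoup

html = '''
<div class="list">
  <a href="/meinv/guzhuang/">album home</a>
  <a href="/meinv/guzhuang/list-3.html">page 3</a>
  <a>no href here</a>
  <img title="pic1" src="/uploads/pic1.300.jpg">
</div>
'''
soup = BeautifulSoup(html, 'lxml')

# find_all accepts a compiled regex for an attribute value,
# so the filtering happens inside bs4 itself
album_links = soup.find_all('a', href=re.compile(r'^/[a-z]+/[a-z]+/$'))
print([a['href'] for a in album_links])   # ['/meinv/guzhuang/']

# the same idea with a CSS "ends with" attribute selector
page_links = soup.select('a[href$=".html"]')
print([a['href'] for a in page_links])    # ['/meinv/guzhuang/list-3.html']

Passing a compiled regex (or a CSS attribute selector) straight into find_all/select narrows the result set up front, which avoids the findall-then-clean-up-the-string dance in the script above.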