爬取整个网站图片

妖妖妈

关注

阅读 81

2022-11-10


网站:http://pic.netbian.com

爬取整个网站图片_二级

#-*- coding:utf-8 -*-
import urllib2
import re,sys,os

# Python 2-only hack: change the interpreter's default string encoding to
# UTF-8. reload(sys) is called first so that sys.setdefaultencoding is
# available again before it is invoked.
# NOTE(review): presumably needed because the prompts further down call
# .encode() on byte-string literals, which implicitly decodes them with the
# default encoding first -- confirm before removing.
reload(sys)
sys.setdefaultencoding("utf-8")
#http://pic.netbian.com

import requests

# Running counter used by down_img to build numbered file names; starts at 1.
num = 1

# HTTP headers attached to every request issued by this script.
headers = {
    'Referer': 'http://pic.netbian.com/e/search/result/?searchid=1224',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.149 Safari/537.36'
    ),
}
def down_img(url,root):
    """Download every wallpaper linked from one listing page into Pic/<root>/.

    The listing page at ``url`` is scanned for links to ``/tupian/*.html``
    detail pages; each detail page is scanned for the first ``/uploads...jpg``
    path, and that image is saved as ``Pic/<root>/<num>.jpg``.

    Args:
        url: Address of the listing page to scan.
        root: Name of the sub-directory under ``Pic/`` that receives the files.

    Side effects:
        Creates ``Pic/<root>`` if missing, writes image files, prints each
        target file name, and advances the module-level counter ``num`` once
        per detail link (whether or not a file was written), so a link's
        position always maps to the same file name across runs.
    """
    global num
    site = "http://pic.netbian.com"
    # Raw strings with escaped dots: "." must match a literal dot only.
    detail_link_re = re.compile(r'a href="(/tupian/\w+\.html)"')
    image_path_re = re.compile(r'/uploads.*?\.jpg')

    listing_html = requests.get(url, headers=headers).text
    detail_paths = detail_link_re.findall(listing_html)

    target_dir = "Pic/"+root
    if not os.path.exists(target_dir):
        # makedirs also creates the parent "Pic" directory when needed.
        os.makedirs(target_dir)

    for detail_path in detail_paths:
        new_name = target_dir+"/"+"%d" % num +".jpg"
        print(new_name)
        num += 1

        # Check before touching the network: an existing file needs neither
        # the detail page nor the image to be fetched again.
        if os.path.exists(new_name):
            continue

        # The scraped paths already start with "/", so join without adding one.
        detail_html = requests.get(site+detail_path, headers=headers).text
        found = image_path_re.search(detail_html)
        if found is None:
            # Detail page without a recognisable image path: skip it rather
            # than abort the whole run.
            continue

        request = urllib2.Request(url=site+found.group(0), headers=headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            response.close()

        # Write only after the body was read completely, so a failed transfer
        # cannot leave a truncated file that later runs would skip.
        with open(new_name, "wb") as f:
            f.write(data)

# Category names as they appear in the site's URLs. The numbered menu shown
# to the user lists them in this same order, starting at 1.
type_pic=["4kfengjing","4kyouxi","4kmeinv","4kdongman","4kyingshi","4kmingxing",
          "4kqiche","4kdongwu","4krenwu","4kmeishi","4kzongjiao","4kbeijing"]

# sys.getfilesystemencoding() can return None on Python 2; fall back to UTF-8
# so that encoding the prompts cannot fail with a TypeError.
console_encoding = sys.getfilesystemencoding() or "utf-8"

try:
    type_img=int(raw_input("请输入选择下载的类型:\n1.风景\n2.游戏\n3.美女\n4.动漫\n5.影视\n6.明星\n7.汽车\n8.动物\n9.人物\n10.美食\n11.宗教\n12.背景\n".encode(console_encoding)))
except ValueError:
    sys.exit("Invalid choice: enter a number between 1 and %d." % len(type_pic))
# Reject 0 and negatives explicitly: type_pic[type_img-1] would otherwise
# wrap around and silently pick a different category.
if not 1 <= type_img <= len(type_pic):
    sys.exit("Invalid choice: enter a number between 1 and %d." % len(type_pic))

try:
    page=int(raw_input("请输入下载页数:页数>=1\n".encode(console_encoding)))
except ValueError:
    sys.exit("Invalid page count: enter a whole number >= 1.")
if page < 1:
    sys.exit("Invalid page count: enter a whole number >= 1.")

category = type_pic[type_img-1]
for i in range(1,page+1):  # number of listing pages to fetch; adjust as needed
    # The first listing page is index.html; later pages are index_<n>.html.
    if i == 1:
        url = "http://pic.netbian.com/%s/index.html" % category
    else:
        url = "http://pic.netbian.com/%s/index_%d.html" % (category, i)
    print(url)
    down_img(url,category)

print("finish!\n")

爬取整个网站图片_html_02


爬取整个网站图片_.net_03

说明:首先选择下载类型,然后输入下载页数(page>=1),即可进行下载。
首次运行会在同级目录建立Pic目录,然后根据下载类型建立二级目录。
同一类型中已下载的图片不会重复下载。每页图片数量为21张。


精彩评论(0)

0 0 举报