0
点赞
收藏
分享

微信扫一扫

爬虫接口数据抓取,下载图片

你带来了我的快乐 2022-11-20 阅读 134


# -*- coding: utf-8 -*-

import requests
from urllib.parse import quote
import threading
# Bounded semaphore gating concurrent image downloads; value=1 makes the
# downloads effectively serial (acquired in main(), released per worker).
threading_lock = threading.BoundedSemaphore(value=1)

# NOTE(review): this module-level `kw` is never read below —
# get_pages_from_label() URL-quotes its own label argument.  Dead code;
# candidate for removal.
kw = '校花'
kw = quote(kw)


#https://www.duitang.com/napi/blog/list/by_search/?kw=%E6%A0%A1%E8%8A%B1&start=48&_=1501895019883

# 单个url所有返回数据
def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A timeout is supplied so a stalled server cannot hang the scraper
    indefinitely (the original call had none).
    """
    response = requests.get(url, timeout=30)
    # Decode the raw bytes explicitly rather than trusting
    # response.encoding, which requests guesses from headers;
    # the duitang API serves UTF-8.
    return response.content.decode('utf-8')

#获取单个页面所有的图片url
def findall_in_page(page, startpart, endpart):
    """Return every substring of *page* enclosed between startpart and endpart.

    Scans left to right; each search resumes after the previous match's
    closing delimiter, so matches never overlap.

    Fixes over the original: the start position is located once per
    iteration (the original called page.find twice), and an unterminated
    match (endpart missing) now ends the scan cleanly instead of
    appending a bogus page[start:-1] slice and corrupting the next search.
    """
    all_strings = []
    end = 0
    while True:
        start = page.find(startpart, end)
        if start == -1:
            break
        start += len(startpart)
        end = page.find(endpart, start)
        if end == -1:
            # Opening delimiter without a closing one: drop the fragment.
            break
        all_strings.append(page[start:end])
    return all_strings

#通过url获取所有图片url
def pic_urls_from_page(pages):
    """Collect every image URL embedded in the given response pages.

    Each element of *pages* is raw JSON text from the duitang search API;
    image links live in `"path":"..."` fields, which findall_in_page()
    extracts.  Returns one flat list across all pages.
    """
    return [
        url
        for raw_page in pages
        for url in findall_in_page(raw_page, 'path":"', '"')
    ]


#获取所有url
def get_pages_from_label(label):
    """Download all search-result pages for *label* and return their bodies.

    The label is URL-quoted here, so callers pass the raw (e.g. Chinese)
    keyword.  Result offsets step by 100 from 0 up to (not including) 3600.
    """
    encoded = quote(label)
    template = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}&limit=1000'
    pages = []
    start = 0
    while start < 3600:
        pages.append(get_page(template.format(encoded, start)))
        start += 100
    return pages


def download_pics(img_url, n):
    """Download one image to duitang_interface_img_dir/images/<n>.<ext>.

    Runs in a worker thread.  The module-level semaphore acquired by
    main() is always released — even when the request or the file write
    fails — so a failed download can no longer deadlock the whole run
    (the original released only on the success path).
    """
    try:
        r = requests.get(img_url, timeout=30)
        # Crude extension guess: everything after the last dot in the URL.
        ext = img_url.split('.')[-1]
        path = 'duitang_interface_img_dir/images/' + str(n) + '.' + ext
        with open(path, 'wb') as f:
            f.write(r.content)
    finally:
        threading_lock.release()


def main():
    """Entry point: scrape all search pages, then download images via threads."""
    print('start')
    all_pages = get_pages_from_label('校花')
    print('获取所有页面信息完毕')
    all_img_urls = pic_urls_from_page(all_pages)
    print('获取所有图片url完毕')
    print('开始下载')
    threads = []
    for n, img_url in enumerate(all_img_urls, start=1):
        print('正在下载第', n, '张')
        # The semaphore bounds in-flight downloads; each worker releases
        # it in download_pics() when finished.
        threading_lock.acquire()
        t = threading.Thread(target=download_pics, args=(img_url, n))
        t.start()
        threads.append(t)
    # Wait for every worker before declaring completion — the original
    # printed "all done" while downloads were still running.
    for t in threads:
        t.join()
    print('all done')

# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

举报

相关推荐

0 条评论