#!/usr/bin/env python
#-*- coding: utf-8 -*-
import requests
import re
import os
def photolists(html):
    """Extract (title, album_url) pairs for every gallery on a listing page.

    Two regexes scan the raw HTML independently — one for the album links,
    one for the album titles — and the results are paired positionally.
    Returns a list of (title, url) tuples (possibly empty).
    """
    album_links = re.findall(r'<li><a href="(.*?)" target="_blank"><img', html)
    album_titles = re.findall(r'alt=\'(.*?)\' width=\'236\' height=\'354\' />', html)
    return [pair for pair in zip(album_titles, album_links)]
def photourl(url, headers):
    """Collect [image_url, image_name] pairs for every page of one gallery.

    url     -- the gallery's landing-page URL (no trailing page number)
    headers -- request headers to send (the site rejects requests without
               a browser-like User-Agent / Referer)

    Returns a list of [image_url, name] pairs; stops early at the first
    page that fails to fetch or parse.
    """
    response = requests.get(url=url, headers=headers)
    html = response.text
    # Highest page number of this gallery, scraped from the pager widget.
    maxpage = re.findall(r'>…</span><a href=\'.*?\'><span>(\d+)</span></a><a href=\'.*?\'><span>下一页', html)
    # Single-page galleries have no pager; fall back to one page instead of
    # crashing with IndexError on maxpage[0].
    last_page = int(maxpage[0]) if maxpage else 1
    photos = []
    for page in range(1, last_page + 1):
        page_url = url + '/' + str(page)  # URL of page N of the gallery
        try:
            html = requests.get(url=page_url, headers=headers).text
            image_urls = re.findall(r'<img class="blur" src="(.*?)" alt=', html)
            # Derive a file name from the URL: .../06/23/01.jpg -> '01'
            image_name = image_urls[0].split(".")[-2].split('/')[-1]
            photos.append([image_urls[0], image_name])
        except (requests.RequestException, IndexError):
            # Narrowed from a bare `except:`; stop at the first broken or
            # missing page rather than silently hiding unrelated bugs.
            break
    return photos
# Main program
def main():
    """Scrape the mzitu front page and download the first gallery's first image.

    The trailing `break`s deliberately limit the run to one image of one
    gallery (kept from the original — this script is in a debug/testing state).
    """
    # Front-page URL
    url = 'https://www.mzitu.com/'
    # The site blocks requests that lack a browser-like User-Agent and Referer.
    headers = {
        'Referer': 'https://www.mzitu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    }
    response = requests.get(url, headers=headers)
    photos = photolists(response.text)  # (title, album_url) pairs on this page
    for title, album_url in photos:  # iterate galleries directly, not by index
        # Optionally create one folder per gallery:
        # if not os.path.exists(r'D:\{}'.format(title)):
        #     os.mkdir(r'D:\{}'.format(title))
        phdatas = photourl(album_url, headers)  # image urls inside the gallery
        for image_url, image_name in phdatas:
            print(image_url)
            print(image_name)
            # The image host validates the Referer; with the generic front-page
            # Referer it serves an anti-hotlink placeholder, which is why saved
            # .jpg files looked "corrupted". Point Referer at the gallery page.
            img_headers = dict(headers, Referer=album_url)
            res = requests.get(url=image_url, headers=img_headers)
            with open(image_name + '.jpg', 'wb') as f:
                f.write(res.content)
            break  # debug: only the first image of the gallery
        break  # debug: only the first gallery
# Program entry point
if __name__ == '__main__':
    main()
# NOTE(review): stray pasted chat text converted to a comment so the file
# parses. It reported: "switched to the suggested request headers, but the
# downloaded images are corrupted" — consistent with the host's anti-hotlink
# Referer check serving a placeholder instead of the real image.