
OP | Posted on 2020-6-13 15:43:19
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import re
import os


def photolists(html):
    """Extract the gallery URLs and titles from one index page."""
    photo_urls = re.findall(r'<li><a href="(.*?)" target="_blank"><img', html)          # gallery URLs
    photo_titles = re.findall(r'alt=\'(.*?)\' width=\'236\' height=\'354\' />', html)   # gallery titles
    return list(zip(photo_titles, photo_urls))


def photourl(url, headers):
    """Collect the image URL (and a file name) from every page of one gallery."""
    response = requests.get(url=url, headers=headers)
    html = response.text
    # Highest page number of the gallery
    maxpage = re.findall(r'>…</span><a href=\'.*?\'><span>(\d+)</span></a><a href=\'.*?\'><span>下一页', html)
    last_page = int(maxpage[0]) if maxpage else 1
    photo = []
    for page in range(1, last_page + 1):
        try:
            page_url = url + '/' + str(page)        # page holding a single image
            response = requests.get(url=page_url, headers=headers)
            html = response.text
            photodata = re.findall(r'<img class="blur" src="(.*?)" alt=', html)
            photo_name = photodata[0].split(".")[-2].split('/')[-1]
            photo.append([photodata[0], photo_name])
        except (requests.RequestException, IndexError):
            break
    return photo


def main():
    # Home page URL
    url = 'https://www.mzitu.com/'
    # Request headers
    headers = {
        'Referer': 'https://www.mzitu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    }
    response = requests.get(url, headers=headers)
    photos = photolists(response.text)              # all gallery URLs and titles on the current page
    for i in range(len(photos)):                    # open each gallery
        # Create a folder per gallery (disabled)
        # if not os.path.exists(r'D:\{}'.format(photos[i][0])):
        #     os.mkdir(r'D:\{}'.format(photos[i][0]))
        phdatas = photourl(photos[i][1], headers)   # URL of every image in the gallery
        # Download the images
        for photo in phdatas:
            print(photo[0])
            print(photo[1])
            res = requests.get(url=photo[0], headers=headers)
            with open(photo[1] + '.jpg', 'wb') as f:
                f.write(res.content)
            break   # only the first image, for testing
        break       # only the first gallery, for testing


# Entry point
if __name__ == '__main__':
    main()
I hadn't noticed that detail... I switched to the request headers you suggested, but the downloaded images are reported as corrupted...
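One possible cause (a guess, not confirmed in this thread): the site may only serve the real JPEG when the Referer header points at the photo page the image is embedded in, and hand back a placeholder or broken file when the Referer is the homepage. Below is a minimal sketch of a per-image download under that assumption; download_photo() and the photo_page_url argument are hypothetical additions, not part of the script above.

    import requests

    def download_photo(img_url, photo_page_url, base_headers):
        """Hypothetical helper: fetch one image with the Referer set to the page it came from."""
        headers = dict(base_headers)
        headers['Referer'] = photo_page_url      # assumption: per-image Referer is required
        res = requests.get(img_url, headers=headers, timeout=10)
        res.raise_for_status()                   # fail loudly instead of writing a broken file
        with open(img_url.split('/')[-1], 'wb') as f:
            f.write(res.content)

In the script above this would mean having photourl() also return page_url for each image, then calling download_photo(photo[0], photo[2], headers) in main() instead of the plain requests.get() with the homepage Referer.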