|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 mynfqc 于 2020-3-18 13:29 编辑
从实用的角度讲,我并不希望无差别爬图,只希望把浏览喜欢的套图存下来
以前在尤果还买过几套图,以后都白嫖了哈哈
- import requests
- from bs4 import BeautifulSoup
- import os
- import time
# Download one photo set from www.mzitu.com.
def crawl_pic_set(dir, headers, set_url):
    """Download every image of the photo set at *set_url* into *dir*.

    Args:
        dir: Target directory for the downloaded images; created if missing.
             (Name kept for caller compatibility even though it shadows the
             builtin ``dir``.)
        headers: HTTP headers dict — the site requires a browser User-Agent
                 and a Referer, otherwise it serves an anti-scraping page.
        set_url: URL of the photo-set index page; page *i* of the set lives
                 at ``set_url + '/' + str(i)``.
    """
    print('正在请求 ' + str(set_url))
    # timeout prevents a dead connection from hanging the script forever.
    html = requests.get(set_url, headers=headers, timeout=30).text
    print('正在解析...')
    soup = BeautifulSoup(html, 'lxml')
    # Create the target directory once up front instead of re-checking it
    # for every single image inside the loop.
    print('检查文件夹是否存在 ' + dir)
    if not os.path.exists(dir):
        print('创建文件夹 ' + dir)
        os.makedirs(dir)
    else:
        print('文件夹已存在')
    for content in soup.select('.content'):
        # The second-to-last <span> of the pagination bar holds the total
        # number of images in the set.
        img_total = int(content.find(class_='pagenavi').select('span')[-2].string)
        print('共' + str(img_total) + '张\n')
        for i in range(1, img_total + 1):
            pic_url = set_url + "/" + str(i)
            print('当前进度 ' + str(i) + '/' + str(img_total) + '\n')
            image_path = dir + '/' + str(i) + '.jpg'
            print('检查文件是否存在 ' + image_path)
            # Guard clause: skip files that already exist and look complete
            # (< 1 KiB is treated as a truncated/failed earlier download).
            if os.path.exists(image_path) and os.path.getsize(image_path) >= 1024:
                print('文件已存在,无需重复下载\n')
                continue
            page = requests.get(pic_url, headers=headers, timeout=30).text
            soup = BeautifulSoup(page, 'lxml')
            # Throttle: the site blocks clients that request too frequently.
            time.sleep(1)
            main_image = soup.find(class_='main-image')
            if main_image is None:
                # Probably an anti-scraping page; dump it for debugging and
                # move on to the next image rather than crashing.
                print(soup)
                continue
            image_src = main_image.find('img').get('src')
            print('请求图片地址 ' + str(image_src))
            print('文件下载... ')
            image_content = requests.get(image_src, headers=headers, timeout=30).content
            print('下载成功')
            print('存入磁盘...')
            # 'with' closes the file automatically — no explicit close() needed.
            with open(image_path, 'wb') as f:
                f.write(image_content)
            print('存放完成\n')
def main():
    """Script entry point: configure the save directory, the photo-set URL
    and the HTTP headers, then hand off to crawl_pic_set()."""
    home_url = 'https://www.mzitu.com'
    # Directory where the photo set is saved
    dir = 'E:/妹子图/尤果 杜小雨'
    # URL of the photo set to download
    set_url = "https://www.mzitu.com/114919"
    # The site checks both User-Agent and Referer before serving images.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 Edg/80.0.361.62',
        'Referer': home_url,
    }
    print('图片保存地址 ' + str(dir))
    crawl_pic_set(dir, headers, set_url)


if __name__ == "__main__":
    main()
复制代码 |
|