|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
# Download the pictures shown on listing pages 1-13 of qiushibaike.com's
# "imgrank" section into a local folder, one file per picture.
import os
import re

import requests

# Folder that receives the downloaded pictures (created if missing).
SAVE_DIR = './糗事百科pic'

# Seconds to wait on any single HTTP request before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Browser-style User-Agent header sent with every request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

# Listing-page URL template; %d is filled with the page number.
url = 'https://www.qiushibaike.com/imgrank/page/%d/'

# The markup the pattern below is written against, for reference:
#
#   <div class="thumb">
#   <a href="/article/123349419" target="_blank">
#   <img src="//pic.qiushibaike.com/system/pictures/12334/123349419/medium/BOL40YKC24QRKRBI.jpg" alt="糗事#123349419" class="illustration" width="100%" height="auto">
#   </a>
#   </div>
#
# re.S lets '.' match newlines, because the <div> and its <img> sit on
# different lines. Compiled once here since it does not change per page.
ex = '<div class="thumb">.*?<img src="(.*?)" alt='
img_pattern = re.compile(ex, re.S)

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(SAVE_DIR, exist_ok=True)

for page in range(1, 14):  # pages 1 through 13
    page_url = url % page
    try:
        html_text = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Treat 4xx/5xx as a failure instead of parsing an error page.
        html_text.raise_for_status()
    except requests.RequestException as exc:
        print('第%d页请求失败: %s' % (page, exc))
        continue

    for src_url in img_pattern.findall(html_text.text):
        # The page uses protocol-relative URLs ("//host/path"); add the scheme.
        src_url = 'https:' + src_url
        # The last path segment of the URL is used as the local file name.
        file_name = src_url.split('/')[-1]
        try:
            img_response = requests.get(src_url, headers=headers, timeout=REQUEST_TIMEOUT)
            # Never write an HTTP error body to disk as if it were a picture.
            img_response.raise_for_status()
        except requests.RequestException as exc:
            print(file_name, '下载失败:', exc)
            continue

        with open(os.path.join(SAVE_DIR, file_name), 'wb') as f:
            f.write(img_response.content)
        print(file_name, '下载完成!')

    print('第%d页完成.' % page)

print('All Over!!')
复制代码 |
|