# Scrape every image from Qiushibaike (糗事百科) using a regular expression; the site currently has 13 pages.
import os
import re

import requests
# Directory that downloaded images are written to.
SAVE_DIR = './糗事百科pic'

# Browser-like User-Agent header sent with every request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}

# Listing-page URL template; %d is replaced by the page number.
url = 'https://www.qiushibaike.com/imgrank/page/%d/'

# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Markup this pattern is written against (kept here for reference):
#   <div class="thumb">
#   <a href="/article/123349419" target="_blank">
#   <img src="//pic.qiushibaike.com/system/pictures/12334/123349419/medium/BOL40YKC24QRKRBI.jpg" alt="糗事#123349419" class="illustration" width="100%" height="auto">
#   </a>
#   </div>
# re.S lets '.' match the newlines between the <div> and the <img> tag.
IMG_PATTERN = re.compile(r'<div class="thumb">.*?<img src="(.*?)" alt=', re.S)


def extract_image_urls(html):
    """Return the image URLs found in the thumbnail blocks of *html*.

    Protocol-relative sources (``//host/path``) are completed with
    ``https:``; sources that already carry a scheme are returned unchanged.

    Args:
        html: Text of one listing page.

    Returns:
        A list of URL strings in document order (empty if nothing matches).
    """
    return [
        'https:' + src if src.startswith('//') else src
        for src in IMG_PATTERN.findall(html)
    ]


def download_image(src_url, save_dir=SAVE_DIR):
    """Download one image and store it under *save_dir*.

    Args:
        src_url: Absolute URL of the image.
        save_dir: Directory the file is written into.

    Returns:
        The file name used, i.e. the last path segment of *src_url*.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status (so error pages are never saved as images).
    """
    response = requests.get(src_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    file_name = src_url.split('/')[-1]
    with open(os.path.join(save_dir, file_name), 'wb') as f:
        f.write(response.content)
    return file_name


def main(first_page=1, last_page=13):
    """Download the images from listing pages *first_page*..*last_page*.

    A page or image whose request fails is reported and skipped, so a single
    failure does not abort the rest of the crawl.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(SAVE_DIR, exist_ok=True)

    for page in range(first_page, last_page + 1):
        try:
            response = requests.get(
                url % page, headers=headers, timeout=REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print('Page %d failed: %s' % (page, exc))
            continue

        for src_url in extract_image_urls(response.text):
            try:
                file_name = download_image(src_url)
            except requests.RequestException as exc:
                print(src_url, 'failed:', exc)
                continue
            print(file_name, '下载完成!')

        print('第%d页完成.' % page)

    print('All Over!!')


if __name__ == '__main__':
    main()
# Pages:
# [1]