import urllib.request as r
import os
import re
class Download_mm:
    """Scrape image files from consecutive jandan.net/ooxx gallery pages.

    Walks backwards through the paginated gallery ("Older Comments" link),
    collects every ``<img src="...">`` address on each page, and saves the
    image files into a local folder.
    """

    def __init__(self, pages=10):
        # Entry page of the gallery; updated in place as we page backwards.
        self.url = 'http://jandan.net/ooxx/'
        # Browser-like User-Agent so the server does not reject the request.
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/80.0.3987.122 Safari/537.36'
        }
        self.page = pages      # default number of pages to fetch
        self.img_addrs = []    # image URLs found on the current page

    def url_open(self, url):
        """Fetch *url* and return the raw response body as bytes.

        BUG FIX: the original ignored the ``url`` argument and always
        requested ``self.url``, so every "image" written by save_image was
        really the HTML of the listing page — which is why the saved files
        could not be opened as images.
        """
        req = r.Request(url=url, headers=self.headers)
        res = r.urlopen(req)
        return res.read()

    def get_next_page_url(self):
        """Advance ``self.url`` to the next (older) gallery page."""
        html = self.url_open(self.url).decode('utf-8')
        p = re.compile('<a title="Older Comments" href="(.*?)"')
        # The href is protocol-relative, so prepend the scheme.
        self.url = 'http:' + str(re.findall(p, html)[0])

    def find_image(self):
        """Collect every image URL on the current page into self.img_addrs."""
        html = self.url_open(self.url).decode('utf-8')
        p = re.compile('<img src="(.*?)"')
        # The src attributes are protocol-relative, so prepend the scheme.
        self.img_addrs = ['http:' + addr for addr in re.findall(p, html)]
        return self.img_addrs

    def save_image(self, folder, img_addrs):
        """Download each URL in *img_addrs* into the current directory.

        ``folder`` is accepted for interface compatibility; ``download``
        has already chdir'd into it.
        """
        for each in img_addrs:
            filename = each.split('/')[-1]
            # Fetch first so a failed request does not leave an empty file open.
            img = self.url_open(each)
            with open(filename, 'wb') as f:
                f.write(img)

    def download(self, folder='OOXX', pages=10):
        """Download *pages* gallery pages of images into *folder*.

        BUG FIXES: the original used os.mkdir (crashes when the folder
        already exists) and then overwrote *pages* with an interactive
        input() prompt, which made the parameter dead and the method
        unusable from non-interactive code.
        """
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        # For each page: find all image addresses, save them, move to the
        # next (older) page.
        while pages:
            self.find_image()
            self.save_image(folder, self.img_addrs)
            self.get_next_page_url()
            pages -= 1
if __name__ == '__main__':
    # Run the scraper with its default settings when executed as a script.
    downloader = Download_mm()
    downloader.download()
# NOTE: When I used this, some of the downloaded images could not be opened.
# I cannot test right now because the server is refusing my requests, so
# please review the code and fix whatever is wrong.