|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import urllib.request
- import os
- import re
# Open a URL and return the raw (undecoded) response body.
def urlopen(url):
    """Fetch *url* and return the response body as undecoded bytes.

    A desktop-browser ``User-Agent`` header is attached to the request.
    The URL is printed to stdout after the body has been read, as a
    simple progress trace.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The complete response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``urllib.error.HTTPError`` for HTTP error statuses).
    """
    req = urllib.request.Request(url)
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    )
    # Open the Request object, not the bare URL: passing the URL string
    # would silently discard the User-Agent header configured above.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    print(url)
    return html
# Get the newest page number.
def getpage(url):
    """Return the page number marked as current on the page at *url*.

    The page is fetched with ``urlopen``, decoded as UTF-8 and searched
    for the first ``class="current-comment-page">[N]`` marker.

    Args:
        url: URL of the listing page to inspect.

    Returns:
        The digits ``N`` of the first marker, as a ``str``; callers
        convert it to ``int`` themselves.

    Raises:
        IndexError: If the page contains no such marker.
    """
    html = urlopen(url).decode('utf-8')
    # Only the first marker is needed, so search instead of collecting
    # every occurrence.
    match = re.search(r'class="current-comment-page">\[(\d+)\]', html)
    if match is None:
        # Keep the exception type of the former ``a[0]`` lookup, but say
        # what actually went wrong.
        raise IndexError(
            "no current-comment-page marker found at {}".format(url)
        )
    return match.group(1)
# Get the image addresses; returns the list of image addresses on the page.
def findimg(page_url):
    """Return the sinaimg image addresses found on the page at *page_url*.

    The page is fetched with ``urlopen`` and decoded as UTF-8.

    Args:
        page_url: URL of the page to scan.

    Returns:
        A list of protocol-relative addresses (each starts with ``//``)
        ending in ``.jpg`` or ``.gif``, in order of appearance. The list
        is empty when nothing matches.
    """
    html = urlopen(page_url).decode('utf-8')
    matches = re.findall(
        r'img src="(//\S{3}\.sinaimg\.cn/(\w){5,10}/(\w+)\.(jpg|gif))',
        html,
    )
    # The pattern has several capture groups, so every match is a tuple;
    # the first group spans the whole address.
    return [groups[0] for groups in matches]
# Save the images.
def save_imgs(dirs, imgs):
    """Download every address in *imgs* into the current working directory.

    Files are named after their position in *imgs* ("0.jpg", "1.gif",
    ...); the extension is the last three characters of the address.

    Args:
        dirs: Unused by this function; kept so existing callers keep
            working. Files are written relative to the current working
            directory.
        imgs: Protocol-relative image addresses; ``"http:"`` is
            prepended to each before it is fetched with ``urlopen``.
    """
    for index, img in enumerate(imgs):
        # Download first, then open the file, so a failed request does
        # not leave an empty file behind.
        data = urlopen("http:" + img)
        with open(str(index) + "." + img[-3:], "wb") as f:
            f.write(data)
# Main routine.
def download_mm(dirs='ooxx', page=10):
    """Download the images from the newest pages of http://jandan.net/ooxx.

    Args:
        dirs: Directory the images are saved into; created if missing.
        page: Maximum number of pages to fetch, counting back from the
            newest page.
    """
    os.makedirs(dirs, exist_ok=True)

    # save_imgs writes relative to the working directory, so switch into
    # the target directory for the duration of the download and restore
    # the previous one afterwards instead of leaking the change.
    previous_cwd = os.getcwd()
    os.chdir(dirs)
    try:
        url = "http://jandan.net/ooxx"
        newest = int(getpage(url))

        reslist = []
        # Begin with the newest page itself (the counter used to be
        # decremented before its first use, which skipped that page) and
        # stop at page 1 rather than requesting page 0 or negative pages.
        for pagenum in range(newest, max(newest - page, 0), -1):
            page_url = url + "/page-" + str(pagenum) + "#comments"
            reslist.extend(findimg(page_url))

        save_imgs(dirs, reslist)
    finally:
        os.chdir(previous_cwd)
# Script entry point: run the downloader with its default arguments.
if __name__ == "__main__":
    download_mm()
复制代码
大佬给批改批改,还没学到的也可以一起交流学习 |
|