|
发表于 2017-1-12 11:12:06
|
显示全部楼层
抱歉，我很久不上论坛了，那个代码也是初学的时候写的，已经找不到了。后来我做了一些改进，把改进后的代码给你吧。原来那个问题好像是因为有几处正则匹配写得不对。
- import urllib.request
- import os
def url_open(url):
    """Fetch *url* and return the raw response body.

    The request carries a desktop Firefox ``User-Agent`` header.

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``) is propagated unchanged.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 5.1; rv:43.0) Gecko/20100101 Firefox/43.0',
    )
    # Close the response deterministically instead of leaving the
    # connection open until the object happens to be garbage-collected.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the current comment-page number found on the page at *url*.

    The page is fetched with ``url_open`` and decoded as UTF-8. The number
    is the text that sits between the ``current-comment-page`` marker
    (plus the three characters following it) and the next ``]``.

    Args:
        url: Address of the listing page.

    Returns:
        The page number as a ``str`` (callers convert it to ``int``).

    Raises:
        ValueError: If the marker or the closing ``]`` is not present,
            instead of silently returning an unrelated slice of the page.
    """
    html = url_open(url).decode('utf-8')
    marker = 'current-comment-page'
    marker_pos = html.find(marker)
    if marker_pos == -1:
        raise ValueError("'current-comment-page' marker not found in page")
    # Same offset as the original 23: the 20-character marker plus the
    # three characters after it (presumably `">[` -- confirm against the
    # live markup).
    start = marker_pos + len(marker) + 3
    end = html.find(']', start)
    if end == -1:
        raise ValueError("closing ']' after page marker not found in page")
    return html[start:end]
def find_imgs(url):
    """Collect the image addresses listed on the page at *url*.

    The page is fetched with ``url_open`` and decoded as UTF-8. Every
    occurrence of the link text ``查看原图`` is located; the address is the
    text starting 25 characters after that marker and running up to the
    next double quote. ``http:`` is prepended to each address, and for
    addresses containing ``.gif`` the ``thumb180`` segment is swapped for
    ``large``.

    Args:
        url: Address of the listing page.

    Returns:
        A list of image URLs (``str``) in page order; empty when the
        marker never occurs.
    """
    html = url_open(url).decode('utf-8')
    marker = '查看原图'
    img_addrs = []
    pos = html.find(marker)
    while pos != -1:
        # The closing quote is searched from +27 so that it is always
        # past the +25 start of the address.
        end = html.find('"', pos + 27)
        if end == -1:
            # No terminating quote: nothing further can be extracted.
            # Stop explicitly rather than rely on find(marker, -1)
            # happening to return -1.
            break
        pic_url = "http:" + html[pos + 25:end]
        if ".gif" in pic_url:
            pic_url = pic_url.replace("thumb180", "large")
        img_addrs.append(pic_url)
        pos = html.find(marker, end)
    return img_addrs
-
def save_imgs(folder, img_addrs):
    """Download each address in *img_addrs* and store it inside *folder*.

    Each file is named after the last ``/``-separated segment of its
    address. An existing file with the same name is overwritten.

    Args:
        folder: Directory the files are written to. Previously this
            argument was ignored and files went to the current directory.
        img_addrs: Iterable of image URLs.

    Returns:
        None.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Fetch before opening the target so a failed download does not
        # leave an empty file behind.
        img = url_open(each)
        with open(os.path.join(folder, filename), 'wb') as f:
            f.write(img)
def download_mm(pages=10, folder="/sdcard/Download/cs/"):
    """Download the images from the newest *pages* listing pages.

    The newest page number is read from ``http://jandan.net/ooxx/`` via
    ``get_page``; then that page and the ones before it are scanned with
    ``find_imgs`` and their images stored with ``save_imgs``.

    Args:
        pages: How many pages to walk back from the newest one. Anything
            ``int()`` accepts, so the text returned by ``input()`` works.
        folder: Destination directory. It is created if missing. The
            default is the path the script originally hard-coded.

    Raises:
        ValueError: If *pages* or the detected page number is not an
            integer.
    """
    # Resolve before chdir so a relative folder still names the same place.
    folder = os.path.abspath(folder)
    os.makedirs(folder, exist_ok=True)
    # Also make it the working directory, as the original did, so code
    # that writes relative paths still lands here.
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    newest_page = int(get_page(url))
    for offset in range(int(pages)):
        # Derive each page from the fixed newest page. The original
        # `page_num -= i` subtracted cumulatively (N, N-1, N-3, N-6, ...)
        # and therefore skipped pages.
        page_num = newest_page - offset
        if page_num < 1:
            # Fewer pages exist than were requested.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        save_imgs(folder, find_imgs(page_url))
if __name__ == '__main__':
    # Prompt for the page count and hand the typed text straight over;
    # download_mm converts it to int itself.
    download_mm(input("下载几页的呀?"))
复制代码
因为之前是在手机上运行的，路径那部分你自己修改一下，遇到其他问题也请自行调整。好像100页之后的网页代码有些变化，所以一次只能少下载几页。 |
|