import urllib.request
import os
import base64
import re
def url_open(url):
    """Fetch *url* with a desktop-browser User-Agent and return the raw body bytes.

    The custom User-Agent avoids the site's basic bot filtering.
    BUG FIX: removed a debug `print(html)` that dumped the entire raw response
    to stdout, and closed the response via a context manager (the original
    leaked the connection object).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36')
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the current comment-page number (as a string of digits) from *url*.

    Scrapes the `current-comment-page` span out of the page HTML.

    Raises:
        ValueError: if the page-number marker is not present. (The original
        looped over the matches and returned an unbound local `t`, which
        raised an opaque UnboundLocalError when no match was found.)
    """
    html = url_open(url).decode('utf-8')
    pattern = r'<span class="current-comment-page">\[(\d+)\]</span>'
    page_list = re.findall(pattern, html)
    if not page_list:
        raise ValueError('current-comment-page marker not found in page HTML')
    # The original loop kept only the final match; preserve that choice.
    return page_list[-1]
def base_num(page_num):
    """Return the base64-encoded page token '20200811-<page_num>' as a str.

    The site addresses comment pages by a base64-encoded date-page string;
    '20200811' is the hard-coded date prefix the site expected at the time.

    BUG FIX: the original did `str(sw).split("'")[-1]`, but
    str(b'xxx') == "b'xxx'" splits into ['b', 'xxx', ''] — index -1 is the
    EMPTY string, so the function always returned ''. Decoding the base64
    bytes directly yields the intended token.
    """
    times = '20200811-' + str(page_num)
    encoded = base64.b64encode(times.encode('utf-8'))
    return encoded.decode('ascii')
def find_pic_url(url):
    """Return a list of .jpg image URLs scraped from the page at *url*.

    BUG FIX: the original had the condition inverted —
    `if not len(b): pic_url.append(b)` appended the (empty) match list as a
    nested element ONLY when nothing was found, and discarded every real
    match. The intended result is simply the list of matched URLs.
    """
    html = url_open(url).decode('utf-8')
    pattern = r'<img src="([^"]+\.jpg)"'
    return re.findall(pattern, html)
def save_pic(folder, pic_url):
    """Download every URL in *pic_url* and save each to the current directory.

    Filenames are taken from the last path segment of each URL.
    NOTE: *folder* is unused (kept for interface compatibility) — the caller
    has already chdir'd into the target folder before calling this.

    BUG FIX: the original called an undefined name `open_url`; the download
    helper in this file is `url_open`, so every call raised NameError.
    """
    for each in pic_url:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            f.write(url_open(each))
def download_mm(folder='mm', pages=10):
    """Download the *pages* most recent comment pages of images into *folder*.

    Creates *folder* (if needed), chdirs into it, reads the current page
    number from the site, then walks backwards one page at a time.

    BUG FIXES:
    - `os.mkdir` raised FileNotFoundError-adjacent errors on reruns when the
      folder already existed; `os.makedirs(..., exist_ok=True)` is idempotent.
    - The original `page_num -= i` decremented CUMULATIVELY (steps of
      0, 1, 2, ... giving pages N, N-1, N-3, N-6, ...); the intent is one
      page back per iteration, i.e. N - i.
    """
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx/'
    # Read the current page number from the landing page.
    page_num = int(get_page(url))
    for i in range(pages):
        current = page_num - i
        # The site addresses pages by a base64-encoded date-page token.
        page_url = url + base_num(current) + '#comments'
        # Collect the concrete image URLs on that page and save them.
        pic_url = find_pic_url(page_url)
        save_pic(folder, pic_url)
if __name__ == '__main__':
    # Script entry point: run with defaults (folder 'mm', 10 pages).
    download_mm()