爬取煎蛋妹子图求助
import urllib.requestimport os
import base64
import re
def url_open(url):
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36')
response = urllib.request.urlopen(req)
html = response.read()
print(html)
return html
def get_page(url):
html = url_open(url).decode('utf-8')
a = r'<span class="current-comment-page">\[(\d+)\]</span>'
page_list = re.findall(a,html)
for each in page_list:
t = each
return t
def base_num(page_num):
times = '20200811-' + str(page_num)
s = times.encode('utf-8')
sw = base64.b64encode(s)
str_sw = str(sw)
string = str_sw.split("'")[-1]
return string
def find_pic_url(url):
html = url_open(url).decode('utf-8')
pic_url = []
a = r'<img src="([^"]+\.jpg)"'
b = re.findall(a,html)
if not len(b):
pic_url.append(b)
return pic_url
def save_pic(folder, pic_url):
for each in pic_url:
filename = each.split('/')[-1]
with open(filename, 'wb') as f:
img = open_url(each)
f.write(img)
def download_mm(folder = 'mm', pages = 10):
#创建一个文件夹
os.mkdir(folder)
os.chdir(folder)
url = 'http://jandan.net/ooxx/'
#获取页码
page_num = int(get_page(url))
for i in range(pages):
page_num -=i
#base64加密
x = base_num(page_num)
#获取页码地址
page_url = url + x + '#comments'
#获取图片具体地址并保存成列表
pic_url = find_pic_url(page_url)
save_pic(folder, pic_url)
if __name__ == '__main__':
download_mm()
我这个代码哪里出了错误 导致下载不下来图片呢
帮你改完了,代码中一个函数名拼错了,还有加密的地方有点错误
以及获取图片 url 时候,你 if 判断导致返回的永远是 pic_url = [] 空列表了
import urllib.request
import os
import base64
import re
def url_open(url):
req = urllib.request.Request(url)
req.add_header('User-Agent',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36')
response = urllib.request.urlopen(req)
html = response.read()
return html
def get_page(url):
html = url_open(url).decode('utf-8')
a = r'<span class="current-comment-page">\[(\d+)\]</span>'
page_list = re.findall(a, html)
for each in page_list:
t = each
return t
def base_num(page_num):
page_num = '20200811-'+ str(page_num)
result = base64.b64encode(page_num.encode())
result = result.decode('utf_8')
return result
def find_pic_url(url):
html = url_open(url).decode('utf-8')
a = r'<img src="([^"]+\.jpg)"'
pic_url = re.findall(a, html)
return pic_url
def save_pic(pic_url):
for each in pic_url:
filename = each.split('/')[-1]
with open(filename, 'wb') as f:
img = url_open('http:'+each)
f.write(img)
def download_mm(folder='mm', pages=10):
# 创建一个文件夹
os.mkdir(folder)
os.chdir(folder)
url = 'http://jandan.net/ooxx/'
# 获取页码
page_num = int(get_page(url))
for i in range(pages):
page_num -= i
# base64加密
x = base_num(page_num)
# 获取页码地址
page_url = url + x + '#comments'
# 获取图片具体地址并保存成列表
pic_url = find_pic_url(page_url)
save_pic(pic_url)
if __name__ == '__main__':
download_mm()
页:
[1]