|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
采用与鱼c课程基本相同的代码,只修改爬图的网址和页码循环的逻辑,爬 jandan.net/zoo 和 jandan.net/pic 都成功了,但是爬 jandan.net/ooxx 遇到问题,可以爬百来张,如果网站上发了新图也可以被爬下来,但是基本每次都在一个相同的位置停止报错:urllib.error.URLError: <urlopen error no host given>
代码如下:
import urllib.request
import os
import base64
import random
def url_open(url):
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0')
# porxies = ['183.146.213.157:80', '39.106.114.143:80', '101.4.136.34:81']
# proxy = random.choice(porxies)
#
# proxy_support = urllib.request.ProxyHandler({'http':proxy})
# opener = urllib.request.build_opener(proxy_support)
# urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
html = response.read()
return html
def get_page(url):
html = url_open(url).decode('utf-8')
a = html.find('current-comment-page') + 23
b = html.find(']', a)
return html[a:b]
def find_imgs(url):
html = url_open(url).decode('utf-8')
img_addrs = []
a = html.find('img src=')
while a != -1:
b = html.find('.jpg', a, a+255)
if b != -1:
img_addrs.append('http:'+html[a+9:b+4])
else:
b = a+9
a = html.find('img src=', b)
return img_addrs
def save_imgs(img_addrs):
for each in img_addrs:
filename = each.split('/')[-1]
with open(filename, 'wb') as f:
img = url_open(each)
f.write(img)
def download_animal(folder='jiandan_ooxx'):
os.mkdir(folder)
os.chdir(folder)
url = 'http://jandan.net/ooxx/'
page_num = int(get_page(url))
while page_num != 0:
page_index = str(base64.b64encode(('20200217-' + str(page_num)).encode('utf-8')), 'utf-8') #对网址进行base64编码
page_url = url + page_index + '#comments'
img_addrs = find_imgs(page_url)
save_imgs(img_addrs)
page_num -= 1
if __name__ == '__main__':
download_animal()
报错:
Traceback (most recent call last):
File "C:/python_pycharm/test.py", line 71, in <module>
download_animal()
File "C:/python_pycharm/test.py", line 66, in download_animal
save_imgs(img_addrs)
File "C:/python_pycharm/test.py", line 52, in save_imgs
img = url_open(each)
File "C:/python_pycharm/test.py", line 18, in url_open
response = urllib.request.urlopen(url)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 522, in open
req = meth(req)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python38\lib\urllib\request.py", line 1243, in do_request_
raise URLError('no host given')
urllib.error.URLError: <urlopen error no host given>
Process finished with exit code 1
|
|