课程爬虫求助
本帖最后由 mashiro666 于 2020-3-24 01:44 编辑import urllib.request
import os
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
def get_page(url):
req = urllib.request.Request(url,headers=head)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
a = html.find('page-numbers current') + 21
b = html.find('<', a)
return html
def find_imgs(url):
req = urllib.request.Request(url, headers=head)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')
img_addrs = []
a = html.find('src=')
while a != -1:
b = html.find('.jpg', a, a+255)
if b != -1:
img_addrs.append(html) #[]左闭右开
else:
b = a + 5
a = html.find('src=', b)
return img_addrs
def save_imgs(folder,img_address):
for each in img_address:
filename = each.split('/')[-1] #用/分割,拿出最后一个
with open(filename, 'wb') as f:
req = urllib.request.Request(each,headers=head)
response = urllib.request.urlopen(req)
img = response.read()
f.write(img)
#文件名mm,爬5页
def download_mzitu(folder='mm',pages=5):
#修改文件地址
#创建文件夹
os.mkdir(folder)
os.chdir(folder)
#网站地址
url = 'https://www.mzitu.com/mm/'
#获得页面的地址
page_num = int(get_page(url))
for i in range(pages):
page_num += 1
page_url =url + 'page/' + str(page_num) + '/'
img_address = find_imgs(page_url)
save_imgs(folder, img_address)
if __name__ == '__main__':
print('test')
download_mzitu(folder='mm',pages=5)
Traceback (most recent call last):
File "E:/python__pycharm/056meizitu.py", line 67, in <module>
download_mzitu(folder='mm',pages=5)
File "E:/python__pycharm/056meizitu.py", line 56, in download_mzitu
page_num = int(get_page(url))
ValueError: invalid literal for int() with base 10: '>1'
我仿照小甲鱼写的代码,但是看不懂我哪里错了,可以帮忙看看哪里错了吗
错误信息狠明显,自己调试
写代码一定要自己会调试 wp231957 发表于 2020-3-24 06:15
错误信息狠明显,自己调试
写代码一定要自己会调试
我用debug调试了一下发现get_page里的a,b有问题,但是我改了还是不能按照页数显示
page_num = int(get_page(url))
这里应该是数值的数量不止一个,所以int无法转换成整形,你可以在这行代码上方提前打印一下‘get_page(url)’,看看是什么,然后再顺着问题找原因 闹闹YYY 发表于 2020-3-25 09:31
page_num = int(get_page(url))
这里应该是数值的数量不止一个,所以int无法转换成整形,你可以在这行代 ...
刚才看了一下,打印出来的是'>1',所以修改一下
#获得页面的地址
page_num = int(get_page(url))
改为:
num = get_page(url)
page_num = int(num)
随便切分一下,之后还有点问题,如果执行过程序,已经生成了文件夹,那么download_mzitu函数中的os.mkdir(folder)就会报错,说文件夹已经存在了,把这里注释之后又提示url 404,就没再看了,url问题自己看一下吧,是不是哪里写错了还是什么原因
页:
[1]