本帖最后由 Twilight.. 于 2021-8-10 03:02 编辑 import urllib.request as ur
import chardet as ch
list_html = []
with open(r'C:\Users\whoo\Desktop\python作业\urls.txt') as f:
detail_url = f.readlines()
#print(detail_url)
for each in detail_url:
# print(each) # 检测爬取哪个网站失败 发现是豆瓣
if 'douban' not in each:
res = ur.urlopen(each).read()
encod = ch.detect(res)['encoding']
html = res.decode(encod)
list_html.append(html)
else:
ur_header = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko)\
Chrome/92.0.4515.131 Safari/537.36\
Edg/92.0.902.67'
}
req = ur.Request(each, headers=ur_header)
res = ur.urlopen(each).read()
encod = ch.detect(res)['encoding']
html = res.decode(encod)
list_html.append(html)
for i in range(len(list_html)):
# if 'douban' not in list_html[i]: # 检验无豆瓣情况
file_name = 'url_' + str(i+1) + '.txt'
with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name, \
'w', encoding='utf-8') as f_url:
f_url.write(list_html[i])
如果不爬取豆瓣可以成功运行,爬取豆瓣就失败了,查阅了相关资料发现是少了header,但是我加入header仍然报错,错误原因似乎还是缺少header,求教大佬这是什么情况,感谢!Traceback (most recent call last):
File "C:/Users/whoo/Desktop/python作业/053.py", line 56, in <module>
res = ur.urlopen(each).read()
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 214, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 523, in open
response = meth(req, response)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 632, in http_response
response = self.parent.error(
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 561, in error
return self._call_chain(*args)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
result = func(*args)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 641, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 418:
又看了一遍你的代码,你传参都传不对,这是基于你代码改的,hearders尽量放前边,这个一般是公用的 import urllib.request as ur
import chardet as ch
list_html = []
detail_url = ["http://www.fishc.com", "http://www.baidu.com",
"https://www.douban.com/", "http://www.zhihu.com", "http://www.taobao.com"]
# print(detail_url)
ur_header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko)\
Chrome/92.0.4515.131 Safari/537.36\
Edg/92.0.902.67'
}
for each in detail_url:
if 'douban' not in each:
res = ur.urlopen(each).read()
encod = ch.detect(res)['encoding']
# html = res.decode(encod) # 获取网站编码
list_html.append(encod)
else:
req = ur.Request(each, headers=ur_header)
res = ur.urlopen(req).read() # 不是传入each
encod = ch.detect(res)['encoding']
# html = res.decode(encod)
list_html.append(encod)
for i in list_html:
print(i)
# for i in range(len(list_html)):
# # if 'douban' not in list_html[i]: # 检验无豆瓣情况
# file_name = 'url_' + str(i+1) + '.txt'
# with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name,
# 'w', encoding='utf-8') as f_url:
# f_url.write(list_html[i])
|