import urllib.request as ur
import chardet as ch

list_html = []
with open(r'C:\Users\whoo\Desktop\python作业\urls.txt') as f:
    detail_url = f.readlines()

# print(detail_url)

for each in detail_url:
    # print(each)  # check which site fails to fetch; it turned out to be douban

    if 'douban' not in each:
        res = ur.urlopen(each).read()
        encod = ch.detect(res)['encoding']
        html = res.decode(encod)
        list_html.append(html)
    else:
        ur_header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/92.0.4515.131 Safari/537.36 '
                          'Edg/92.0.902.67'
        }
        req = ur.Request(each, headers=ur_header)
        res = ur.urlopen(each).read()
        encod = ch.detect(res)['encoding']
        html = res.decode(encod)
        list_html.append(html)

for i in range(len(list_html)):
    # if 'douban' not in list_html[i]:  # test the case without douban
    file_name = 'url_' + str(i+1) + '.txt'
    with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name,
              'w', encoding='utf-8') as f_url:
        f_url.write(list_html[i])
The script runs fine as long as I skip douban, but fails as soon as I try to crawl it. After looking into it, I found the request was missing a header, yet even after adding the header it still raises an error, and the cause still seems to be a missing header. Could someone tell me what is going on? Thanks!
Traceback (most recent call last):
  File "C:/Users/whoo/Desktop/python作业/053.py", line 56, in <module>
    res = ur.urlopen(each).read()
  File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 214, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 523, in open
    response = meth(req, response)
  File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 632, in http_response
    response = self.parent.error(
  File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 561, in error
    return self._call_chain(*args)
  File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
    result = func(*args)
  File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 641, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 418:
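
(For context: HTTP 418 "I'm a teapot" is a joke status code that douban repurposes as an anti-crawler response to requests that do not look like a browser. urlopen() accepts either a URL string or a Request object, and only a Request object carries custom headers, so ur.urlopen(each) sends no User-Agent even though req was built one line earlier. A minimal single-URL sketch of a header-carrying fetch; the URL and the shortened User-Agent here are just placeholders:)

import urllib.request as ur

url = 'https://www.douban.com/'  # example target
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

req = ur.Request(url, headers=header)  # the Request object holds the header
res = ur.urlopen(req).read()           # pass req, not the bare URL string
print(res[:100])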
Best answer:
I went over your code again, and you are not even passing the argument correctly. The code below is based on yours. Put the headers near the top, since they are usually shared across requests.
import urllib.request as ur
import chardet as ch

list_html = []
detail_url = ["http://www.fishc.com", "http://www.baidu.com",
              "https://www.douban.com/", "http://www.zhihu.com", "http://www.taobao.com"]
# print(detail_url)

ur_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36 '
                  'Edg/92.0.902.67'
}

for each in detail_url:
    if 'douban' not in each:
        res = ur.urlopen(each).read()
        encod = ch.detect(res)['encoding']
        # html = res.decode(encod)  # only the site encoding is collected here
        list_html.append(encod)
    else:
        req = ur.Request(each, headers=ur_header)
        res = ur.urlopen(req).read()  # pass the Request object, not each
        encod = ch.detect(res)['encoding']
        # html = res.decode(encod)
        list_html.append(encod)

for i in list_html:
    print(i)

# for i in range(len(list_html)):
#     # if 'douban' not in list_html[i]:  # test the case without douban
#     file_name = 'url_' + str(i+1) + '.txt'
#     with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name,
#               'w', encoding='utf-8') as f_url:
#         f_url.write(list_html[i])