Twilight.. 发表于 2021-8-10 02:59:58

python 53课课后作业第2题求助

本帖最后由 Twilight.. 于 2021-8-10 03:02 编辑

import urllib.request as ur
import chardet as ch

list_html = []

with open(r'C:\Users\whoo\Desktop\python作业\urls.txt') as f:
    detail_url = f.readlines()
   
#print(detail_url)
   
for each in detail_url:
#    print(each)# 检测爬取哪个网站失败 发现是豆瓣
                  
    if 'douban' not in each:
      res = ur.urlopen(each).read()
      encod = ch.detect(res)['encoding']
      html = res.decode(encod)
      list_html.append(html)

    else:
      ur_header = {
                  'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
                  AppleWebKit/537.36 (KHTML, like Gecko)\
                  Chrome/92.0.4515.131 Safari/537.36\
                  Edg/92.0.902.67'
                   }
      req = ur.Request(each, headers=ur_header)
      res = ur.urlopen(each).read()
      encod = ch.detect(res)['encoding']
      html = res.decode(encod)
      list_html.append(html)
      
for i in range(len(list_html)):
#    if 'douban' not in list_html:# 检验无豆瓣情况
      file_name = 'url_' + str(i+1) + '.txt'

      with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name, \
                  'w', encoding='utf-8') as f_url:
            f_url.write(list_html)




如果不爬取豆瓣可以成功运行,爬取豆瓣就失败了,查阅了相关资料发现是少了header,但是我加入header仍然报错,错误原因似乎还是缺少header,求教大佬这是什么情况,感谢!
Traceback (most recent call last):
File "C:/Users/whoo/Desktop/python作业/053.py", line 56, in <module>
    res = ur.urlopen(each).read()
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 214, in urlopen
    return opener.open(url, data, timeout)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 523, in open
    response = meth(req, response)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 632, in http_response
    response = self.parent.error(
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 561, in error
    return self._call_chain(*args)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 494, in _call_chain
    result = func(*args)
File "C:\Users\whoo\AppData\Local\Programs\Python\Python39\lib\urllib\request.py", line 641, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 418:

大马强 发表于 2021-8-10 02:59:59

又看了一遍你的代码,你传参都传不对,这是基于你代码改的,hearders尽量放前边,这个一般是公用的
import urllib.request as ur
import chardet as ch

list_html = []
detail_url = ["http://www.fishc.com", "http://www.baidu.com",
            "https://www.douban.com/", "http://www.zhihu.com", "http://www.taobao.com"]

# print(detail_url)
ur_header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
                  AppleWebKit/537.36 (KHTML, like Gecko)\
                  Chrome/92.0.4515.131 Safari/537.36\
                  Edg/92.0.902.67'
}
for each in detail_url:

    if 'douban' not in each:
      res = ur.urlopen(each).read()
      encod = ch.detect(res)['encoding']
      # html = res.decode(encod) # 获取网站编码
      list_html.append(encod)

    else:

      req = ur.Request(each, headers=ur_header)
      res = ur.urlopen(req).read()# 不是传入each
      encod = ch.detect(res)['encoding']
      # html = res.decode(encod)
      list_html.append(encod)

for i in list_html:

    print(i)
# for i in range(len(list_html)):
#   #    if 'douban' not in list_html:# 检验无豆瓣情况
#   file_name = 'url_' + str(i+1) + '.txt'

#   with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name,
#               'w', encoding='utf-8') as f_url:
#         f_url.write(list_html)

大马强 发表于 2021-8-10 07:42:53

应该是反扒,加上个cookie,
import urllib.request as ur
import chardet as ch

list_html = []
detail_url = ["http://www.fishc.com", "http://www.baidu.com",
            "https://www.douban.com/", "http://www.zhihu.com", "http://www.taobao.com"]
headers = {
    'Cookie': 'bid=CT96GElqqtk; ll="118316"; _vwo_uuid_v2=DEEC02882F097FFCFC1334EDD12A884A5|0c957d156eb10a1b686d47e5345faada; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1606179914%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utmc=30149280; __utma=223695111.1749305921.1603814435.1605861645.1606179914.4; __utmb=223695111.0.10.1606179914; __utmc=223695111; __utmz=223695111.1606179914.4.4.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=30149280.1168884207.1603814435.1606179914.1606179914.4; __utmz=30149280.1606179914.4.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; __utmb=30149280.2.9.1606179914; RT=nu=https%3A%2F%2Fmovie.douban.com%2Fsubject%2F5294851%2F&cl=1606180086310&r=https%3A%2F%2Fmovie.douban.com%2Ftyperank%3Ftype_name%3D%25E5%2589%25A7%25E6%2583%2585%25E7%2589%2587%26type%3D11%26interval_id%3D100%3A90%26action%3D&ul=1606180142344&hd=1606180142436; _pk_id.100001.4cf6=0c01a4e5f6734023.1603814435.3.1606180143.1605862965.', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
# with open(r'./python/其他/豆瓣.txt', "w", encoding="utf-8") as f:
#   detail_url = f.readlines()

# print(detail_url)

for each in detail_url:
    #    print(each)# 检测爬取哪个网站失败 发现是豆瓣
    req = urllib.request.Request(each, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    encode = ch.detect(html)['encoding']
    list_html.append(encode)

for i in list_html:
    print(i)

# for i in range(len(list_html)):
#   print(i)
#      if 'douban' not in list_html:# 检验无豆瓣情况
#   file_name = 'url_' + str(i+1) + '.txt'

#   with open(r'C:\Users\whoo\Desktop\python作业\%s' % file_name,
#               'w', encoding='utf-8') as f_url:
#         f_url.write(list_html)

大马强 发表于 2021-8-10 07:52:17

还有你目标不是为了获取网站编码吗?

King丨小义 发表于 2021-8-10 07:59:28

看了半天,你28行打开的网页错了。怎么能打开each,应该是req.
28行改成res = ur.urlopen(req).read()   就行了

Twilight.. 发表于 2021-8-10 10:34:56

大马强 发表于 2021-8-10 07:50
又看了一遍你的代码,你传参都传不对,这是基于你代码改的,hearders尽量放前边,这个一般是公用的

谢谢大佬,可能是时间太晚了有点糊涂了{:10_266:}
请问修改代码中的headers是大部分网站的通用标头吗

大马强 发表于 2021-8-10 10:38:31

headers一般存放你本机的数据
页: [1]
查看完整版本: python 53课课后作业第2题求助