反爬
# BUG FIX: the original line fused two statements into
# "import requestsfrom bs4 import BeautifulSoup", a SyntaxError.
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Returns the literal string '爬取失败' on any request failure so the
    caller can continue best-effort (same contract as the original).
    """
    try:
        # Minimal browser-like headers: douban rejects the default
        # python-requests User-Agent (418), per the discussion below.
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept': '*/*',
            'Accept-Encoding': 'gzip',
            'Accept-Language': 'zh-Hans-CN',
            # BUG FIX: was "'Cha-Control': 'no-cacha'" — typos for the
            # real Cache-Control header.
            'Cache-Control': 'no-cache',
            # BUG FIX: removed the hard-coded "'Host': 'img3.doubanio.com'".
            # It does not match movie.douban.com and is the likely cause of
            # the 404 this thread is about; requests derives Host from the URL.
        }
        # BUG FIX: added a timeout so a stalled connection cannot hang forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.encoding = r.apparent_encoding
        # BUG FIX: original said "r.raise_for_status" without parentheses,
        # which evaluates the bound method and never calls it.
        r.raise_for_status()
        print(r.status_code)
        # BUG FIX: original said "return r,text" (comma), a NameError on
        # the undefined name "text"; the intent was the response body.
        return r.text
    except requests.RequestException:  # was a bare "except:" — too broad
        return '爬取失败'
def fillTop(html, ulist):
    """Parse the Top250 page HTML and append each ranking (<em> text) to *ulist*.

    BUG FIX: the original did "i.div.div.em.text" unconditionally; any <li>
    without that nesting (navigation, pager, footer items) raised
    AttributeError because a missing tag lookup returns None.
    """
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('li'):
        try:
            ulist.append(item.div.div.em.text)
        except AttributeError:
            # <li> without the expected div/div/em structure — skip it.
            continue
def printMovie(ulist):
    """Print the whole collected ranking list to stdout in one shot."""
    print(ulist)
def main():
    """Scrape the douban Top250 page, collect the rankings, and print them."""
    url = "https://movie.douban.com/top250"
    html = getHTMLText(url)
    unifo = []
    fillTop(html, unifo)
    printMovie(unifo)


# BUG FIX: guard the entry point so importing this module does not
# immediately fire a network request.
if __name__ == "__main__":
    main()
404了。404是请求错误，就像你请求 google 一样，永远都是404。
可能是网络原因 请求超时了 你调整一下请求超时的时间 而且这个TOP250就没有反爬 admintest166 发表于 2020-4-3 14:41
404是请求错误 就像你请求google一样 永远都是404
可能是网络原因 请求超时了 你调整一下请求超时的时间...
它这个直接爬返回的是418，然后我设置了请求头返回的是404。只需加 User-Agent 就可以了。
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url, timeout=10):
    """Fetch *url* with a browser-like User-Agent and return the page text.

    Douban answers 418 to the default requests User-Agent, so a minimal
    'Mozilla/5.0' header is required (as noted in the thread above).
    *timeout* (new, default 10s) bounds the request, per the thread's
    advice about adjusting the request timeout.
    Returns '爬取失败' on any request failure instead of raising.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        # BUG FIX: added timeout= so a stalled connection cannot hang forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        # BUG FIX: raise_for_status() so HTTP errors (404/418) are reported
        # as '爬取失败' instead of being returned as if they were the page.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        print(r.status_code)
        return r.text
    except requests.RequestException:  # narrower than the original "Exception"
        return '爬取失败'
def fillTop(html, ulist):
    """Parse the Top250 page HTML and append each ranking (<em> text) to *ulist*.

    BUG FIX: the original did "i.div.div.em.text" unconditionally; a <li>
    without that exact nesting (navigation, pager, footer items) made the
    attribute chain hit None and raise AttributeError.
    """
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('li'):
        try:
            ulist.append(item.div.div.em.text)
        except AttributeError:
            # Not a movie entry — skip it.
            continue
def printMovie(ulist):
    """Show the gathered movie entries on stdout."""
    print(ulist)
def main():
    """Scrape the douban Top250 page, collect the rankings, and print them."""
    url = "https://movie.douban.com/top250"
    html = getHTMLText(url)
    unifo = []
    fillTop(html, unifo)
    printMovie(unifo)


# BUG FIX: guard the entry point so importing this module does not
# immediately fire a network request.
if __name__ == "__main__":
    main()
页:
[1]