Failed to scrape Douban TOP250
This is the original code. It runs without reporting any error, but no file comes out, and I can't tell where it went wrong.

import requests
import bs4
import random
import openpyxl
def open_url(url):
    '''
    ip_poor = ['61.19.145.66:8080', '105.27.237.27:80', '110.36.239.234:8080',
               '110.39.187.50:49850', '159.192.141.89:8080', '162.243.244.206:80',
               '184.149.34.86:51529', '181.118.167.104:80', '3.6.99.110:80']
    proxies = {'http': random.choice(ip_poor)}
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    #html = request.get(url=url, headers=headers, proxies=proxies)
    html = request.get(url, headers=headers)
    return html
def find_info(html):
    content = bs4.BeautifulSoup(html.text, 'html.parser')
    # Movie names
    movie_name = []
    target_name = content.find_all('div', class_='hd')
    for each_name in target_name:
        movie_name.append(each_name.a.span.text)
    # Ratings
    movie_score = []
    target_score = content.find_all('span', class_='rating5-t')
    for each_score in target_score:
        movie_score.append(each_score.text)
    # Other details
    movie_info = []
    target_info = content.find_all('div', class_='bd')
    for each_info in target_score:
        try:
            movie_info.append(each_info.p.text.split('\n')[0].strip() + each_info.p.text.split('\n')[1].strip())
        except:
            continue
    content = []
    for i in range(len(movie_name)):
        result.append([movie_name[i]] + [movie_score[i]] + [movie_info[i]])
    return contenr
def next_page(html):
    target = bs4.BeautifulSoup(html.text, 'html.parser')
    next_path = target.find_all('span', class_='next').previous_sibling.previous_sibling.text
    return int(next_path)
def save_excel(content):
    wb = openpyxl.Workbook()
    ws = wb.active
    ws['A1'] = '电影名'
    ws['B1'] = '评分'
    ws['C1'] = '电影信息'
    for each in content:
        ws.append(each)
    wb.save(r'D:\Desktop\豆瓣TOP250电影.xlsx')
def main():
    url = 'https://movie.douban.com/top250'
    html = open_url(url)
    next_path = next_path(html)
    content = []
    for i in range(next_path):
        url_next = url + '/?start=' + str(25 * i)
        html = open_url(url_next)
        content.extend(find_info(html))
    save_excel(content)

if __name__ == 'main':
    main()
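For comparison with the script's last two lines: Python's standard entry-point guard checks the string '__main__', with double underscores on both sides. If the comparison never matches, the interpreter just defines the functions and exits silently, producing no error and no file, which is consistent with the symptom described above. A minimal sketch of the usual idiom:

if __name__ == '__main__':   # '__main__', not 'main'
    main()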
This post was last edited by Twilight6 on 2020-7-2 23:32.
Line 16 uses the requests module but is missing an 's':
Wrong: html = request.get(url, headers=headers)
Correct: html = requests.get(url, headers=headers)
Lines 44~46: result comes out of nowhere; you probably wrote content = [] by mistake:
Wrong:
content = []
for i in range(len(movie_name)):
    result.append([movie_name[i]] + [movie_score[i]] + [movie_info[i]])
return contenr
Correct:
result = []
for i in range(len(movie_name)):
    result.append([movie_name[i]] + [movie_score[i]] + [movie_info[i]])
return result
Line 68: next_page was written as next_path:
Wrong: next_path = next_path(html)
Correct: next_path = next_page(html)
暂时就找到这么多错误,是否运行成功我还没看
That's all the errors I've found for now; I haven't checked yet whether it actually runs.

If it still fails after those fixes, a quick way to localize the problem is to run each stage by hand on the first page instead of going straight through main(). A minimal smoke test, assuming the corrected open_url and find_info above:

# Sketch: fetch one page and inspect each stage separately
html = open_url('https://movie.douban.com/top250')
print(html.status_code)      # expect 200; 403/418 means Douban refused the request
rows = find_info(html)
print(len(rows), rows[:2])   # expect 25 rows per page; an empty list points at the parsing step

Twilight6 posted on 2020-7-2 23:30:
Line 16 uses the requests module but is missing an 's'

That was careless of me; looks like I need to be more careful.

Twilight6 posted on 2020-7-2 23:30:
Line 16 uses the requests module but is missing an 's'

I've cleared those errors out, but it still fails to run.

changmind posted on 2020-7-2 23:48:
I've cleared those errors out, but it still fails to run.
Here's an earlier draft I saved; I also typed it out while following 小甲鱼's video. Take it and check yours against it:
import requests
import bs4
import openpyxl
def open_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    res = requests.get(url, headers=headers)
    return res
def find_movies(res):
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Movie names
    movies = []
    targets = soup.find_all("div", class_="hd")
    for each in targets:
        movies.append(each.a.span.text)
    # Ratings
    ranks = []
    targets = soup.find_all("span", class_="rating_num")
    for each in targets:
        ranks.append(each.text)
    # Details
    messages = []
    targets = soup.find_all("div", class_="bd")
    for each in targets:
        try:
            messages.append(each.p.text.split('\n')[0].strip() + each.p.text.split('\n')[1].strip())
        except:
            continue
    result = []
    length = len(movies)
    for i in range(length):
        result.append([movies[i]] + [ranks[i]] + [messages[i]])
    return result
# Find out how many pages there are in total
def find_depth(res):
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # Step back over a whitespace node to the last page-number link before the "next" span
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)
def save_to_excel(result):
    wb = openpyxl.Workbook()
    ws = wb.active
    ws['A1'] = '电影名称'
    ws['B1'] = '评分'
    ws['C1'] = '资料'
    for each in result:
        ws.append(each)
    wb.save('豆瓣TOP250.xlsx')
def main():
    host = "https://movie.douban.com/top250"
    res = open_url(host)
    depth = find_depth(res)
    result = []
    for i in range(depth):
        url = host + '/?start=' + str(25 * i)
        res = open_url(url)
        result.extend(find_movies(res))
    save_to_excel(result)
if __name__ == "__main__":
    main()
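One difference between this draft and the original post is worth calling out: here the scores are read from span elements with class 'rating_num', which hold the numeric rating, whereas the original searched for class 'rating5-t', which appears to be the class of the five-star icon and so matches only some films, leaving movie_score shorter than movie_name. A minimal comparison, assuming soup is a BeautifulSoup object for one TOP250 result page:

# Sketch: compare the two selectors on a parsed result page
print(len(soup.find_all('span', class_='rating_num')))  # one numeric score per film (25 per page)
print(len(soup.find_all('span', class_='rating5-t')))   # only films whose star icon carries this class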