changmind posted on 2020-7-2 23:17:15

Failed to scrape the Douban TOP250

This is the original code. It runs without reporting any error, but no file comes out, and I can't figure out where it went wrong.
import requests
import bs4
import random
import openpyxl

def open_url(url):
    '''
    ip_poor = ['61.19.145.66:8080','105.27.237.27:80','110.36.239.234:8080'\
         '110.39.187.50:49850','159.192.141.89:8080','162.243.244.206:80'\
         '184.149.34.86:51529','181.118.167.104:80','3.6.99.110:80']
    proxies = {'http':random.choice(ip_poor)}
    '''      
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
   
    #html = request.get(url=url, headers=headers, proxies=proxies)
    html = request.get(url,headers=headers)
   
    return html

def find_info(html):
    content = bs4.BeautifulSoup(html.text,'html.parser')

    # movie titles
    movie_name = []
    target_name = content.find_all('div',class_='hd')
    for each_name in target_name:
        movie_name.append(each_name.a.span.text)

    # ratings
    movie_score = []
    target_score = content.find_all('span',class_='rating5-t')
    for each_score in target_score:
        movie_score.append(each_score.text)

    # details
    movie_info = []
    target_info = content.find_all('div',class_='bd')
    for each_info in target_score:
        try:
            movie_info.append(each_info.p.text.split('\n').strip() + each_info.p.text.split('\n').strip())
        except:
            continue

    content = []
    for i in range(len(movie_name)):
        result.append(movie_name + movie_score + movie_info)

    return contenr

def next_page(html):
    target = bs4.BeautifulSoup(html.text,'html.parser')
    next_path = target.find_all('span',class_='next').previous_sibling.previous_sibling.text
    return int(next_path)

def save_excel(content):
    wb = openpyxl.Workbook()
    ws = wb.active
    ws['A1'] = '电影名'
    ws['B1'] = '评分'
    ws['C1'] = '电影信息'
    for each in content:
        ws.append(each)
    wb.save(r'D:\Desktop\豆瓣TOP250电影.xlsx')

def main():
    url = 'https://movie.douban.com/top250'
    html = open_url(url)
    next_path = next_path(html)

    content = []
    for i in range(next_path):
        url_next = url + '/?start=' + str(25 * i)
        html = open_url(url_next)
        content.extend(find_info(html))

    save_excel(content)

if __name__ == 'main':
    main()
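
For reference, a quick standalone check (a minimal sketch, independent of the code above) to confirm the request itself is coming back before debugging the parsing:

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
res = requests.get('https://movie.douban.com/top250', headers=headers)
print(res.status_code)   # 200 means Douban returned the page
print(len(res.text))     # a near-zero length suggests the request was blocked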

Twilight6 posted on 2020-7-2 23:30:10

Last edited by Twilight6 on 2020-7-2 23:32

Line 16: you used the requests module but dropped an s.
Wrong: html = request.get(url,headers=headers)
Correct: html = requests.get(url,headers=headers)

Lines 44–46: result appears out of nowhere; you must have accidentally typed content = [] instead (a minimal reproduction of the error this causes is at the end of this post).
Wrong:
content = []
for i in range(len(movie_name)):
    result.append(movie_name + movie_score + movie_info)
return contenr
Correct:
result = []
for i in range(len(movie_name)):
    result.append(movie_name + movie_score + movie_info)
return result

Line 68: next_page was written as next_path:
Wrong: next_path = next_path(html)
Correct: next_path = next_page(html)

That's all the errors I've found so far; I haven't checked yet whether it actually runs.
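
As a side note, a mixup like content/result is fatal the moment the function runs: appending to a name that was never defined raises a NameError. A minimal standalone reproduction (nothing to do with the crawler itself):

content = []             # the list that actually got created
for i in range(3):
    result.append(i)     # NameError: name 'result' is not defined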


changmind posted on 2020-7-2 23:33:13

Twilight6 posted on 2020-7-2 23:30
Line 16: you used the requests module but dropped an s.

I was just being careless; clearly I need to be more attentive.

changmind posted on 2020-7-2 23:48:16

Twilight6 posted on 2020-7-2 23:30
Line 16: you used the requests module but dropped an s.

I've cleared those errors out, but it still fails to run.

Twilight6 posted on 2020-7-3 00:01:59

changmind posted on 2020-7-2 23:48
I've cleared those errors out, but it still fails to run.

Here's an earlier draft I saved; I also typed it out while following 小甲鱼's video. Take it and compare against yours:

import requests
import bs4
import openpyxl


def open_url(url):

    headers = {
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}

    res = requests.get(url, headers=headers)

    return res


def find_movies(res):
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # movie titles
    movies = []
    targets = soup.find_all("div", class_="hd")
    for each in targets:
        movies.append(each.a.span.text)

    # ratings
    ranks = []
    targets = soup.find_all("span", class_="rating_num")
    for each in targets:
        ranks.append(each.text)

    # details
    messages = []
    targets = soup.find_all("div", class_="bd")
    for each in targets:
        try:
            # the second and third lines of the <p> text hold the staff/year/genre details
            messages.append(each.p.text.split('\n')[1].strip() + each.p.text.split('\n')[2].strip())
        except:
            continue

    result = []
    length = len(movies)
    for i in range(length):
        result.append([movies[i], ranks[i], messages[i]])

    return result


# find out how many pages there are in total
def find_depth(res):
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # the page count sits two siblings back from the "next" link
    # (the first previous_sibling is just the whitespace between tags)
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text

    return int(depth)

def save_to_excel(result):
    wb = openpyxl.Workbook()
    ws = wb.active

    ws['A1'] = '电影名称'
    ws['B1'] = '评分'
    ws['C1'] = '资料'

    for each in result:
        ws.append(each)

    wb.save('豆瓣TOP250.xlsx')


def main():
    host = "https://movie.douban.com/top250"
    res = open_url(host)
    depth = find_depth(res)

    result = []
    for i in range(depth):
        url = host + '/?start=' + str(25 * i)
        res = open_url(url)
        result.extend(find_movies(res))
    save_to_excel(result)


if __name__ == "__main__":
    main()
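
A note on the paging logic: find_depth reads the page number printed just before the "next" button, which is 10 for the TOP250 list (250 movies, 25 per page), so main ends up requesting start=0, 25, ..., 225. A standalone sketch of the URLs that loop generates (the 10 here is assumed; in the real run it comes from find_depth):

host = "https://movie.douban.com/top250"
depth = 10                                   # assumed; find_depth(res) returns this at run time
for i in range(depth):
    print(host + '/?start=' + str(25 * i))   # each page lists 25 movies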