|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 2079501562 于 2021-2-1 12:21 编辑
在小甲鱼老师爬虫教程的视频中,有一个爬取豆瓣TOP250电影排行榜的例子
而现在的豆瓣排行榜还有评价,如肖申克的救赎“希望使人自由”
于是产生了顺便爬取评价的想法,代码如下
- # -*- coding: utf-8 -*-
- """
- Created on Mon Feb 1 09:15:48 2021
- @author: 夏の雪
- """
- import requests
- import bs4
- import os
- class Webpage():
- def __init__(self,page_num):
- self.names = []
- self.stars = []
- self.quote = []
- self.directors = []
- self.result = []
- self.years = []
- headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.36 (KHTML,like GeCKO) Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3'}
- page_url = 'https://movie.douban.com/top250?start='+str(25*(page_num-1))+'&filter='
- self.res = requests.get(page_url,
- headers = headers)
- soup = bs4.BeautifulSoup(self.res.text,'html.parser')
- names = soup.find_all('div',class_='hd')
- stars = soup.find_all('span',class_='rating_num' )
- #quote = soup.find_all('span',class_='inq')
- years = soup.find_all('div',class_='bd')
- directors = years
-
-
- for i in names:
- self.names.append(i.a.span.text)
- for i in stars:
- self.stars.append(i.text)
- '''
- for i in quote:
- if ????:
- self.quote.append('评价暂无')
- else:
- self.quote.append(i.text)
- '''
- for i in years:
- try:
- self.years.append(i.p.text.split('\n')[2].strip().split('\xa0/')[0])
- except:
- continue
- for i in directors:
- try:
- self.directors.append(i.p.text.split('\n')[1].strip().split('\xa0')[0])
- except:
- continue
-
- def save(self):
- for i in range(25):
- #self.result.append(self.names[i] +' 评分:'+ self.stars[i] + ' ' + self.directors[i] + ' "' + self.quote[i] + '"' +' 上映时间:' + self.years[i] + '\n')
- self.result.append(self.names[i] +' 评分:'+ self.stars[i] + ' ' + self.directors[i] + ' 上映时间:' + self.years[i] + '\n')
-
- if __name__ == '__main__':
- '''
- a = Webpage(8)
- a.save()
- for each in a.result:
- print(each)
- '''
- try:
- os.remove('豆瓣评分top250.txt')
- except FileNotFoundError:
- pass
- for i in range(1,11):
- a = Webpage(i)
- a.save()
- with open('豆瓣评分top250.txt',"a",encoding = "utf-8") as f:
- for each in a.result:
- f.write(each)
复制代码
程序中注释掉的部分为无法执行的部分,因为第8页开始有的电影并没有评价 ,导致元组的整个出现错误如下图
- import requests
- from lxml import etree
- def main():
- base_url = 'https://movie.douban.com/top250?'
- headers = {'user-agent': 'firefox'}
- movies = []
- for x in range(0, 250, 25):
- url = base_url + f'start={x}'
- r = requests.get(url, headers=headers)
- html = etree.HTML(r.text)
- lis = html.xpath('//ol[@class="grid_view"]/li')
- for li in lis:
- m_name = li.xpath('.//div[@class="hd"]/a/span[1]/text()')[0]
- m_rank = li.xpath('.//div[@class="pic"]/em/text()')[0]
- m_score = li.xpath('.//div[@class="star"]/span[2]/text()')[0]
- m_comment = li.xpath('./div/div[2]/div[2]/p[2]/span/text()') # 获取评价
- if not m_comment: # 如果为空
- m_comment = '没有评价' # 写入没有评价
- else:
- m_comment = m_comment[0] # 如果有值,那么取出内容(因为xpath取出后是列表)
- movies.append([m_name, m_rank, m_score, m_comment])
- for i in movies:
- print(i)
- print('=' * 100)
- if __name__ == '__main__':
- main()
复制代码
|
|