|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 atrago 于 2022-3-8 18:32 编辑
操作环境:Jupyter Notebook
目的:
1、获取豆瓣网站Top250电影排名数据。
2、根据所得数据获得电影评分分布情况和电影年代分布情况。
实现过程:
1、豆瓣高分电影Top250数据获取并分析评分分布情况和年代分布情况
2、利用bs4、requests模块爬取豆瓣电影排行榜数据,对获取的数据利用Numpy模块、Pandas模块进行数据清洗,利用Matplotlib模块进行可视化
3、成功得到电影评分分布情况和电影年代分布情况可视化图表
困难及解决方案:
1、获取数据后对数据格式排列没有清洗的概念,不知道如何将两个表格数据进行合并;经查询,可利用Pandas模块中的merge函数。
2、可视化标签设置错误;查阅matplotlib.pyplot相关阅读指导进行改正。
改进计划:
1、代码标签有部分乱码现象,需要改正
2、将函数封装成模块
Code:- import requests
- import bs4
- import re
- import numpy as ny
- import pandas as pd
- import matplotlib.pyplot as plt
- import matplotlib as mpl
- import os
- from pandas.core.frame import DataFrame
- %matplotlib inline
- # Jupyter magic above: render figures inline in the notebook output.
- import matplotlib.style as stl
- stl.use('ggplot') # apply matplotlib's bundled 'ggplot' style sheet
- mpl.rcParams['font.sans-serif']=['SimHei'] # set the default sans-serif font to SimHei (a Hei-style CJK typeface)
def opens(host, timeout=10):
    """Fetch a URL with a browser-like User-Agent header.

    Args:
        host: URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The ``requests.Response`` for a successful (non-error) request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection problems or timeout.
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.76'}
    res = requests.get(host, headers=headers, timeout=timeout)
    # Fail here with the real HTTP status instead of letting the HTML parsers
    # downstream choke on an error page.
    res.raise_for_status()
    return res
def find_depth(res):
    """Return the total number of result pages shown by the paginator.

    The page count is read from the element sitting immediately before
    ``<span class="next">``.

    Args:
        res: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: The ``next`` span or the element before it is missing
            (e.g. an error page was returned), or its text is not an integer.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_marker = soup.find('span', class_='next')
    if next_marker is None:
        raise ValueError("pagination marker <span class='next'> not found")

    # Ask for the nearest preceding *tag*: hopping over raw previous_sibling
    # nodes only works when exactly one whitespace text node sits in between.
    last_page = next_marker.find_previous_sibling(True)
    if last_page is None:
        raise ValueError("no page-number element before <span class='next'>")
    return int(last_page.text)
def find_movie(res):
    """Extract title, rating and release year for every movie on one page.

    Each movie is anchored on its ``div.hd`` element; the rating and the
    description are looked up relative to that element so the three fields
    always belong to the same movie.

    Args:
        res: Response-like object whose ``text`` attribute holds the HTML.

    Returns:
        A list of ``[title, rating, year]`` lists in document order. ``rating``
        is the text of the ``span.rating_num`` element and ``year`` is the
        first four-digit run in the description; both are strings, or ``None``
        when the page does not provide them.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    year_pattern = re.compile(r'\d{4}')

    result = []
    for header in soup.find_all('div', class_='hd'):
        title = header.a.span.text

        # NOTE(review): assumes each movie's rating/description follow its own
        # div.hd in document order -- confirm against the live page markup.
        rating_tag = header.find_next('span', class_='rating_num')
        rating = '%s' % rating_tag.text if rating_tag is not None else None

        year = None
        details = header.find_next('div', class_='bd')
        if details is not None and details.p is not None:
            lines = details.p.text.split('\n')
            # The original code read lines[1] and lines[2]; presumably
            # lines[2] is the "year / country / genre" line -- verify.
            # Fall back to the whole paragraph when the layout is shorter.
            year_line = lines[2] if len(lines) > 2 else details.p.text
            found = year_pattern.search(year_line)
            if found:
                year = found.group()

        # One row per movie: title, rating, year.
        result.append([title, rating, year])
    return result
-
def fun4(df):
    """Placeholder that accepts a value and does nothing.

    Args:
        df: Unused.

    Returns:
        None.
    """
    pass
-
def _build_movie_frame(rows):
    """Turn ``[title, rating, year]`` rows into a DataFrame with numeric columns."""
    pot = pd.DataFrame(rows, columns=['name', 'rank', 'year'])
    # Ratings/years arrive as strings (or None); invalid values become NaN.
    pot['rank'] = pd.to_numeric(pot['rank'], errors='coerce')
    pot['year'] = pd.to_numeric(pot['year'], errors='coerce')
    return pot


def _plot_rating_distribution(pot):
    """Draw figure 1: number of movies per distinct rating value."""
    counts = pot.groupby('rank')['name'].count().sort_index()
    positions = range(len(counts))

    plt.figure(num=1, figsize=(10, 5))
    # Tick labels come from the data, so any number of distinct ratings works.
    plt.bar(positions, counts.values,
            tick_label=[f'{rating:.1f}' for rating in counts.index])
    plt.title('电影评分分布情况')
    for pos, amount in zip(positions, counts.values):
        plt.text(pos - 0.1, amount, str(amount), fontsize=12,
                 verticalalignment='center')
    plt.tight_layout()


def _plot_decade_distribution(pot):
    """Draw figure 2: number of movies per ten-year bin."""
    years = pot['year'].dropna()

    # Default edges are 1910..2010; widen them when the data falls outside so
    # no movie is silently dropped. pd.cut bins are right-closed: (lo, hi].
    first_edge, last_edge = 1910, 2010
    if not years.empty:
        first_edge = min(first_edge, (int(years.min()) - 1) // 10 * 10)
        last_edge = max(last_edge, (int(years.max()) + 9) // 10 * 10)
    bins = ny.arange(first_edge, last_edge + 10, 10)
    labels = [f'{low}-{high}' for low, high in zip(bins[:-1], bins[1:])]

    binned = pd.cut(years, bins=bins, labels=labels)
    # value_counts orders by frequency; reindex so each bar matches its label.
    counts = binned.value_counts().reindex(labels, fill_value=0)

    plt.figure(num=2, figsize=(8, 3))
    plt.bar(labels, counts.values)
    plt.title('电影年代分布情况')
    for pos, amount in enumerate(counts.values):
        plt.text(pos - 0.1, amount, str(amount), fontsize=12,
                 verticalalignment='center')
    plt.tight_layout()


def main():
    """Scrape the Douban Top 250 list and plot rating and decade distributions.

    Downloads every result page, saves the scraped rows to ``onex.txt`` (one
    tab-separated movie per line) and draws two bar charts: movies per rating
    and movies per decade.
    """
    host = "https://movie.douban.com/top250"
    res = opens(host)
    depth = find_depth(res)

    result = []
    for i in range(depth):
        url = host + '/?start=' + str(25*i)
        res = opens(url)
        result.extend(find_movie(res))

    # Keep a plain-text copy of the scraped rows. Each row is a list, so it
    # must be joined into a string before writing.
    with open("onex.txt", "w", encoding="utf-8") as f:
        for each in result:
            f.write('\t'.join(str(field) for field in each) + '\n')

    # Build the table straight from memory instead of re-parsing the text
    # file, and without assuming exactly 250 rows.
    pot = _build_movie_frame(result)
    _plot_rating_distribution(pot)
    _plot_decade_distribution(pot)
# Run the scraper only when executed as a script / notebook cell.
if __name__ == '__main__':
    main()
复制代码
附件如下:
|
|