豆瓣Top250数据可视化
本帖最后由 atrago 于 2022-3-8 18:32 编辑
操作环境:Jupyter Notebook
目的:
1、获取豆瓣网站Top250电影排名数据。
2、根据所得数据获得电影排名分布情况和电影年代分布情况。
实现过程:
1、豆瓣高分电影Top250数据获取并分析评分分布情况和年代分布情况
2、利用bs4、requests模块爬取豆瓣电影排行榜数据,对获取的数据利用NumPy模块、Pandas模块进行数据清洗,利用Matplotlib模块进行可视化
3、成功得到电影评分分布情况和电影年代分布情况可视化图表
困难及解决方案:
1、获取数据后对数据格式排列没有清洗的概念,不知道如何将两个表格数据进行合并;经查询,可利用Pandas模块中的merge函数。
2、可视化标签设置错误;查阅matplotlib.pyplot相关阅读指导进行改正。
改进计划:
1、代码标签有部分乱码现象,需要改正
2、将函数封装成模块
Code:
import requests
import bs4
import re
import numpy as ny
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from pandas.core.frame import DataFrame
%matplotlib inline
# IPython/Jupyter magic above: render figures inline in the notebook.
# NOTE(review): this line is not valid syntax in a plain .py script.
import matplotlib.style as stl
stl.use('ggplot') # apply the bundled 'ggplot' style to all plots
mpl.rcParams['font.sans-serif']=['SimHei']# default sans-serif font; SimHei is a Hei-style (黑体) typeface, presumably chosen so the Chinese plot titles render
def opens(host):  # fetch a page
    """Download ``host`` with a browser-like User-Agent and return the response.

    Args:
        host: URL to request.

    Returns:
        The ``requests.Response`` of the GET request.

    Raises:
        requests.RequestException: if the connection fails, the request
            times out, or the server answers with an error status code.
    """
    # A desktop-browser User-Agent is sent with every request (presumably
    # because the site rejects the default client identification).
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.76'}
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(host, headers=headers, timeout=10)
    # Fail here, with the HTTP status in the message, rather than later
    # while parsing an error page.
    res.raise_for_status()
    return res
def find_depth(res):  # total number of listing pages
    """Return the number of pages in the paginated listing.

    Args:
        res: response object whose ``text`` attribute holds the page HTML.

    Returns:
        The page count as an ``int``.

    Raises:
        ValueError: if the page contains no ``<span class="next">`` element,
            or the text found before it is not an integer.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_span = soup.find('span', class_='next')
    if next_span is None:
        # Typically an error / anti-crawler page instead of the listing.
        raise ValueError('pagination marker <span class="next"> not found')
    # Two hops back from the "next" marker: the nearest sibling is presumably
    # a whitespace text node, the one before it holds the last page number.
    depth = next_span.previous_sibling.previous_sibling.text
    return int(depth)
def find_movie(res):  # extract the movie table from one listing page
    """Parse one listing page into ``[title, rating, year]`` rows.

    Args:
        res: response object whose ``text`` attribute holds the page HTML.

    Returns:
        A list with one ``[title, rating, year]`` list per movie. ``title``
        and ``rating`` are the strings found in the page; ``year`` is a
        four-digit string, or ``None`` when no year could be located.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Movie title: first <span> inside the link of every <div class="hd">.
    title = [node.a.span.text for node in soup.find_all('div', class_='hd')]

    # Rating, kept as the string shown on the page.
    ranks = ['%s' % node.text for node in soup.find_all('span', class_='rating_num')]

    # Release year, taken from the <p> of every <div class="bd">.
    tags_d = []
    for node in soup.find_all('div', class_='bd'):
        try:
            lines = node.p.text.split('\n')
            # NOTE(review): the subscripts were lost in the posted code;
            # lines 1 and 2 of the paragraph are presumably the cast line
            # and the "year / country / genre" line - confirm on a live page.
            info = lines[1].strip() + lines[2].strip()
        except (AttributeError, IndexError):
            # <div class="bd"> without a <p>, or with too few lines: skip it,
            # as the original did.
            continue
        # Look only at the current movie (the original joined every movie
        # seen so far) and only after the last '...' truncation marker, so
        # digits in the cast line are ignored.
        found = re.search(r'\d{4}', info.rpartition('...')[2])
        tags_d.append(found.group() if found else None)

    # zip() stops at the shortest list, so a skipped <div class="bd"> can no
    # longer cause an IndexError while assembling the rows.
    return [list(row) for row in zip(title, ranks, tags_d)]
def fun4(df):
    """Placeholder with no behaviour yet: accepts ``df`` and returns ``None``."""
    pass
def _rows_to_frame(rows):
    """Build the name/rank/year table from ``[title, rating, year]`` rows."""
    names, ratings, years = [], [], []
    for row in rows:
        # Pull the numbers out with a regex so surrounding text (labels,
        # non-breaking spaces, ...) does not break the conversion.
        rating_match = re.search(r'\d+(?:\.\d+)?', str(row[1]))
        year_match = re.search(r'\d{4}', str(row[2]))
        names.append(row[0])
        ratings.append(float(rating_match.group()) if rating_match else None)
        # Invalid or missing years become None and are ignored when binning.
        years.append(int(year_match.group()) if year_match else None)
    dic = {'name': names, 'rank': ratings, 'year': years}
    return DataFrame(dic)


def _plot_rating_distribution(frame):
    """Draw figure 1: number of movies per distinct rating."""
    counts = frame.groupby('rank')['name'].count().sort_index()
    if counts.empty:
        return
    plt.figure(num=1, figsize=(10, 5))
    # Tick labels come from the ratings actually present; a fixed linspace
    # breaks as soon as the number of distinct ratings is not exactly 15.
    plt.bar(range(len(counts)), counts.to_numpy(),
            tick_label=[str(rating) for rating in counts.index])
    plt.title('电影评分分布情况')
    for pos, count in enumerate(counts):
        plt.text(pos - 0.1, count, str(count), fontsize=12, verticalalignment='center')
    plt.tight_layout()


def _plot_decade_distribution(frame):
    """Draw figure 2: number of movies per ten-year interval."""
    years = pd.to_numeric(frame['year'], errors='coerce').dropna()
    if years.empty:
        return
    # Keep the original 1910-2010 range but widen it when the data needs it,
    # so older or newer films are not silently dropped. Intervals are
    # right-closed (pd.cut default), hence the "- 1" and the ceiling.
    low = min(1910, (int(years.min()) - 1) // 10 * 10)
    high = max(2010, -(-int(years.max()) // 10) * 10)
    bins = ny.arange(low, high + 1, 10)
    labels = [str(lo) + '-' + str(hi) for lo, hi in zip(bins[:-1], bins[1:])]
    groups = pd.cut(years, bins=bins, labels=labels)
    # value_counts() sorts by frequency by default, which would pair the
    # counts with the wrong (chronological) labels; force label order.
    counts = groups.value_counts(sort=False).reindex(labels, fill_value=0)
    plt.figure(num=2, figsize=(8, 3))
    plt.bar(labels, counts.to_numpy())
    plt.title('电影年代分布情况')
    for pos, count in enumerate(counts):
        plt.text(pos - 0.1, count, str(count), fontsize=12, verticalalignment='center')
    plt.tight_layout()


def main():
    """Crawl the Douban Top250 listing and plot rating and decade distributions.

    Steps: fetch every listing page, dump the scraped rows to ``onex.txt``
    (one tab-separated line per movie), build a name/rank/year table and
    draw two bar charts. The table is built from the scraped rows directly
    instead of being re-parsed from the dump file.
    """
    host = "https://movie.douban.com/top250"
    res = opens(host)
    depth = find_depth(res)
    result = []
    for i in range(depth):
        # The page offset is passed in the "start" query parameter,
        # advancing by 25 per page.
        url = host + '/?start=' + str(25*i)
        res = opens(url)
        result.extend(find_movie(res))

    # Keep a plain-text copy of the raw rows. Each row is a list, so it has
    # to be joined into a line before writing.
    with open("onex.txt", "w", encoding="utf-8") as f:
        for each in result:
            f.write('\t'.join(str(field) for field in each) + '\n')

    pot = _rows_to_frame(result)
    _plot_rating_distribution(pot)
    _plot_decade_distribution(pot)
# Run the crawl-and-plot pipeline only when executed as a script / notebook cell.
if __name__ == '__main__':
    main()
附件如下:
页:
[1]