爬取网易云音乐精彩评论

ljj19971222 · 发表于 2019-9-28 08:56:55

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

根据小甲鱼的流程，打开网页，勾选 disable cache 之后选取慢速加载页面。这里我只有 fast 3G 和 slow 3G 两个选项，都是秒出页面，而且出来的页面是播放窗口，而不是刚才的评论界面。不过还是能找到精彩评论的那个文件，可是我双击那个文件之后，打开的网页地址并没有根据不同的歌曲而变化，始终是 https://music.163.com/weapi/v1/resource/comments/get 这样一个网址，请问这是为什么？

ljj19971222 · 发表于 2019-9-28 08:57:44

有人吗，不知道怎么发图片啊

ykn大神6 · 发表于 2019-9-28 10:14:27

不知道等级够不够

_谪仙 · 发表于 2019-9-28 12:46:00

网址不是这个，抓包时把preserve log勾选上

Stubborn · 发表于 2019-9-28 17:15:58

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# @Time : 2019/3/1 3:20
# @Author : Stubbron
# @Email : 1263270345@qq.com
#参考链接：https://blog.csdn.net/qq_40721694/article/details/82356815
#目标链接：https://music.163.com/song?id=460043703
#未加密的API:http://music.163.com/api/v1/resource/comments/R_SO_4_460043703
import requests,codecs
import json
import time
from lxml import etree
from wordcloud import WordCloud
import numpy as np
from PIL import Image
import os
import matplotlib.pyplot as plt
class Config(object):
    """Resolve on-disk locations for the JSON files this script reads and writes."""

    @staticmethod
    def get_station_file_path(name):
        """Return the absolute path of ``<name>.json`` beside this source file.

        :param name: base file name without extension (converted with ``str``)
        :return: absolute path string inside the directory containing this file
        """
        # Resolve symlinks first so the file always lands next to the real script.
        current_dir = os.path.dirname(os.path.realpath(__file__))
        # os.path.join inserts the path separator that plain string
        # concatenation omitted (which produced "<dir><name>.json").
        return os.path.join(current_dir, str(name) + ".json")
class GetComments(object):
    """Download the comments of a NetEase Cloud Music song and draw a word cloud.

    Comments are fetched page by page from the unencrypted
    ``/api/v1/resource/comments`` endpoint and cached on disk as a JSON list
    at the path returned by ``Config.get_station_file_path``.
    """

    PAGE_SIZE = 20  # comments requested per API call (the ``limit`` parameter)
    REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks forever

    def __init__(self):
        """Prepare the request headers and a reusable HTTP session."""
        self.headers = {
            'Referer': 'http://music.163.com/',
            'Host': 'music.163.com',
            'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            'Accept-Encoding': "gzip, deflate",
            'Content-Type': "application/x-www-form-urlencoded",
            'Origin': 'https://music.163.com',
            'Connection': "keep-alive",
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }
        self.session = requests.Session()

    @staticmethod
    def _page_count(total, page_size=20):
        """Return how many pages of ``page_size`` items are needed for ``total`` items."""
        # Integer ceiling division: always an int, so the result is safe for range().
        return (total + page_size - 1) // page_size

    def get_json(self, song_id, offset):
        """Fetch one page of comments.

        :param song_id: song id
        :param offset: index of the first comment of the requested page
        :return: dict decoded from the JSON response body
        """
        url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_%s?limit=%s&offset=%s' % (
            song_id, self.PAGE_SIZE, offset)
        response = self.session.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return json.loads(response.content)

    def structure_url(self, song_id, song_name):
        """Return every comment of a song, using the on-disk cache when present.

        The total comment count is read from the first page, then the
        remaining pages are fetched one by one. The collected comments are
        written to the cache file before being returned.

        :param song_id: song id
        :param song_name: song name, used as the cache file name
        :return: list of comment strings with newlines removed
        """
        cache_path = Config.get_station_file_path(song_name)
        if os.path.exists(cache_path):
            print('正在读取 %s 的缓存评论' % song_name)
            with open(cache_path, "r", encoding="UTF-8") as f:
                # json.load takes no ``encoding`` argument on Python 3.9+;
                # decoding is already handled by open().
                return json.load(f)

        comments_list = []
        first_page = self.get_json(song_id, 0)
        comments_num = int(first_page['total'])  # total number of comments
        page = self._page_count(comments_num, self.PAGE_SIZE)
        print("总计%s条评论" % comments_num)
        for i in range(page):
            # Page 0 was already downloaded to read the total; reuse it.
            if i == 0:
                json_dict = first_page
            else:
                json_dict = self.get_json(song_id, i * self.PAGE_SIZE)
            comments_list.extend(
                item['content'].replace("\n", "") for item in json_dict['comments'])
            print('第 %s 页获取完成.' % i)
        with open(cache_path, "w", encoding="UTF-8") as f:
            # Keep the cache human-readable instead of \uXXXX escapes.
            json.dump(comments_list, f, ensure_ascii=False)
        return comments_list

    def wordcloud(self, name):
        """Render the cached comments of ``name`` as a word cloud and show it.

        Reads the mask image ``timg.jpg`` and the font ``FZSTK.TTF`` by
        relative path, and the comment cache written by ``structure_url``.

        :param name: song name whose cache file is read
        """
        with Image.open("timg.jpg") as img:
            mask = np.array(img)  # image as an array, used as the cloud mask
        with open(Config.get_station_file_path(name), "r", encoding="UTF-8") as f:
            comments = json.load(f)
        text = "".join(comments)
        # Build the word cloud
        cloud = WordCloud(
            random_state=4,
            scale=8,  # output scale; larger is sharper but slower
            background_color='white',  # background colour
            max_words=3000,  # maximum number of words to draw
            mask=mask,  # mask image array
            font_path=r'FZSTK.TTF',  # font file; Chinese is not displayed without one
        ).generate(text)
        # Display the image
        plt.imshow(cloud)
        plt.axis('off')  # hide the x/y axes
        plt.title("ciyun_one")  # word cloud title
        plt.show()  # show the word cloud
if __name__ == '__main__':
    # Target song. The names below stay module-level globals on purpose.
    songs_id = "460043703"
    songs_name = "Perfect"
    # Song page URL; remember to strip the '#' from the address copied from the browser.
    singer_url = 'https://music.163.com/song?id=460043703'

    spider = GetComments()

    # Step 1: collect the comments (or read them from the cache).
    print('正在收集 %s 的评论' % songs_name)
    json_dict = spider.structure_url(songs_id, songs_name)

    # Step 2: draw the word cloud from the cached comments.
    print('准备生成 %s 的词云' % songs_name)
    spider.wordcloud(songs_name)

复制代码

三月份的代码，不知道失效了没有

账号		自动登录	找回密码
密码			立即注册