|
发表于 2019-9-28 17:15:58
|
显示全部楼层
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- # @Time : 2019/3/1 3:20
- # @Author : Stubbron
- # @Email : 1263270345@qq.com
- #参考链接:https://blog.csdn.net/qq_40721694/article/details/82356815
- #目标链接:https://music.163.com/song?id=460043703
- #未加密的API:http://music.163.com/api/v1/resource/comments/R_SO_4_460043703
- import requests,codecs
- import json
- import time
- from lxml import etree
- from wordcloud import WordCloud
- import numpy as np
- from PIL import Image
- import os
- import matplotlib.pyplot as plt
class Config(object):
    """Resolve on-disk locations for cached comment files."""

    @staticmethod
    def get_station_file_path(name):
        """Return the absolute path of the JSON cache file for ``name``.

        The file lives in the directory that contains this script and is
        named ``<name>.json``.

        :param name: cache key (e.g. a song name); converted with ``str()``.
        :return: absolute path string ending in ``<name>.json``.
        """
        current_dir = os.path.dirname(os.path.realpath(__file__))
        # os.path.join supplies the path separator; plain string concatenation
        # glued the file name directly onto the directory name.
        return os.path.join(current_dir, str(name) + ".json")
class GetComments(object):
    """Fetch the comments of a NetEase Cloud Music song and draw a word cloud.

    Comments are downloaded page by page from the unencrypted comments API,
    cached as a JSON list next to this script (see ``Config``), and rendered
    with ``wordcloud`` / ``matplotlib``.
    """

    PAGE_SIZE = 20        # comments requested per API call (the ``limit`` query parameter)
    REQUEST_TIMEOUT = 10  # seconds before an HTTP request is abandoned

    def __init__(self):
        """Prepare the browser-like request headers and a reusable HTTP session."""
        self.headers = {
            'Referer': 'http://music.163.com/',
            'Host': 'music.163.com',
            'Accept-Language': "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            'Accept-Encoding': "gzip, deflate",
            'Content-Type': "application/x-www-form-urlencoded",
            'Origin': 'https://music.163.com',
            'Connection': "keep-alive",
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
                          ' (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
        self.session = requests.session()

    @staticmethod
    def _page_count(total, page_size):
        """Return how many pages of ``page_size`` items cover ``total`` items."""
        # Integer ceiling division: the result must be an int for range().
        return (total + page_size - 1) // page_size

    def get_json(self, song_id, offset):
        """Fetch one page of comments.

        :param song_id: song id
        :param offset: comment offset of the first item on the page
        :return: the decoded JSON response as a dict
        """
        url = ('http://music.163.com/api/v1/resource/comments/R_SO_4_%s'
               '?limit=%s&offset=%s' % (song_id, self.PAGE_SIZE, offset))
        # Without a timeout a stalled connection would block forever.
        response = self.session.get(url=url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        return json.loads(response.content)

    def structure_url(self, song_id, song_name):
        """Return all comment texts of a song, using the on-disk cache if present.

        When no cache file exists, the first page is requested to learn the
        total number of comments, every page is then collected, and the
        resulting list is written to the cache file.

        :param song_id: song id
        :param song_name: song name; used as the cache file name
        :return: list of comment strings with newlines removed
        """
        cache_path = Config.get_station_file_path(song_name)
        if os.path.exists(cache_path):
            print('正在读取 %s 的缓存评论' % song_name)
            with open(cache_path, "r", encoding="UTF-8") as f:
                # json.load needs no ``encoding`` argument (rejected on 3.9+);
                # the file object already decodes UTF-8.
                return json.load(f)

        first_page = self.get_json(song_id, 0)
        comments_num = int(first_page['total'])  # total number of comments
        page = self._page_count(comments_num, self.PAGE_SIZE)
        print("总计%s条评论" % comments_num)

        comments_list = []
        for i in range(page):
            # Page 0 was already downloaded to read the total; reuse it.
            if i == 0:
                json_dict = first_page
            else:
                json_dict = self.get_json(song_id, i * self.PAGE_SIZE)
            # Only the comment text is kept, with newlines stripped. An earlier
            # draft also sketched reading item['likedCount'] and item['time']
            # but left that disabled.
            comments_list.extend(
                item['content'].replace("\n", "")
                for item in json_dict['comments']
            )
            print('第 %s 页获取完成.' % i)

        with open(cache_path, "w", encoding="UTF-8") as f:
            json.dump(comments_list, f)
        return comments_list

    def wordcloud(self, name):
        """Render the cached comments of ``name`` as a word cloud and show it.

        Reads the mask image ``timg.jpg`` and the font ``FZSTK.TTF`` via
        relative paths, so both are resolved against the current working
        directory. The cache file for ``name`` must already exist.

        :param name: song name whose cache file is loaded
        :return: None
        """
        # Close the image file as soon as its pixels are copied into the array.
        with Image.open("timg.jpg") as image:
            mask = np.array(image)
        with open(Config.get_station_file_path(name), "r", encoding="UTF-8") as f:
            result = json.load(f)
        text = "".join(result)

        # Build the word cloud.
        cloud = WordCloud(
            random_state=4,
            scale=8,  # sharpness; larger values are slower
            background_color='white',  # background colour
            max_words=3000,  # maximum number of words drawn
            mask=mask,  # shape image
            font_path=r'FZSTK.TTF',  # font file; Chinese is not rendered without one
        ).generate(text)

        # Display the image.
        plt.imshow(cloud)
        plt.axis('off')  # hide the x/y axes
        plt.title("ciyun_one")  # figure title
        plt.show()
if __name__ == '__main__':
    # Script entry point: download (or load from cache) the comments of one
    # song, then draw its word cloud.
    songs_name, songs_id = "Perfect", "460043703"
    # Song page URL; remember to drop the '#' that the browser address contains.
    singer_url = 'https://music.163.com/song?id=460043703'

    spider = GetComments()

    print('正在收集 %s 的评论' % songs_name)
    json_dict = spider.structure_url(songs_id, songs_name)

    print('准备生成 %s 的词云' % songs_name)
    spider.wordcloud(songs_name)
复制代码
三月份的代码,不知道失效了没有 |
|