|

楼主 |
发表于 2021-12-4 22:15:59
|
显示全部楼层
- import time
- import json
- import requests
- import pymysql
- from pyquery import PyQuery as pq
def getNum(headers):
    """Scrape the like, coin and favourite counters from the video page.

    Downloads the hard-coded Bilibili video page, parses it with PyQuery and
    stores the text of the ``.like``, ``.coin`` and ``.collect`` elements in
    the module-level globals ``like``, ``coin`` and ``collect``.

    Args:
        headers: HTTP headers passed to ``requests.get``.

    Raises:
        requests.RequestException: if the page cannot be fetched, including
            when the request times out.
    """
    def get_num(clsName, doc):
        """Return the text of the element(s) selected by *clsName*."""
        sign = doc(clsName)
        # Strip nested <i> and <canvas> elements before reading the text.
        sign.find("i").remove()
        sign.find("canvas").remove()
        return sign.text()

    global like, coin, collect
    url = "https://www.bilibili.com/video/BV1s34y1R7u9?spm_id_from=333.851.b_7265636f6d6d656e64.3"
    # Without a timeout an unresponsive server would block the script
    # forever; 50 seconds matches the timeout used by get_html.
    r = requests.get(url, timeout=50, headers=headers)
    doc = pq(r.text)
    like = get_num(".like", doc)
    coin = get_num(".coin", doc)
    collect = get_num(".collect", doc)
# Fetch the comment data of a page.
def get_html(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` and a 50 second
    timeout, and the body is decoded as UTF-8.

    Args:
        url: Address to fetch.

    Returns:
        The response text, or ``None`` when the request fails or the server
        answers with an HTTP error status (a message is printed in that case).
    """
    try:
        r = requests.get(url, timeout=50, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "no data"; Ctrl+C and
        # programming errors are allowed to propagate.
        print("请求发生错误")
        return None
    r.encoding = 'utf-8'
    return r.text
def get_comments(url):
    """Fetch one page of comments from *url* and return them as dicts.

    The response is expected to be JSON with the comment list under
    ``data.replies``. The first time the module-level ``commentsNum`` is
    still falsy it is filled from ``data.cursor.all_count`` when that field
    is present.

    Args:
        url: Comment API address to query.

    Returns:
        A list with one dict per comment, keyed by ``'用户名'`` (user name),
        ``'评论内容'`` (comment text) and ``'rpid'``; or ``None`` when the
        page could not be fetched, is not valid JSON, or carries no
        ``data``/``replies`` payload.
    """
    global commentsNum
    html = get_html(url)
    if not html:
        return None
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError. Return instead of falling
        # through, because `data` would be unbound below.
        print("JSON数据转化错误")
        return None

    payload = data.get('data') if isinstance(data, dict) else None
    if not isinstance(payload, dict):
        return None

    if not commentsNum:
        cursor = payload.get('cursor')
        if isinstance(cursor, dict):
            commentsNum = cursor.get('all_count', commentsNum)

    replies = payload.get('replies')
    if replies is None:
        # A null reply list means there is nothing on this page.
        return None

    return [
        {
            '用户名': comment['member']['uname'],
            '评论内容': comment['content']['message'],
            'rpid': comment['rpid'],
        }
        for comment in replies
    ]
def WriteToFile(comments):
    """Append comments to '评论.txt' and their replies to '子评论.txt'.

    For every top-level comment one line is appended to '评论.txt'. Its reply
    pages are then fetched one after another through ``get_comments`` and
    appended to '子评论.txt' until a page yields no replies or fails.

    Args:
        comments: Iterable of dicts with the keys ``'用户名'``, ``'评论内容'``
            and ``'rpid'``.
    """
    line_format = '姓名:{}\t 评论内容:{}\t rpid: {} \n'
    with open('评论.txt', 'a+', encoding='utf-8') as f:
        for index, comment in enumerate(comments, start=1):
            try:
                f.write(line_format.format(
                    comment['用户名'], comment['评论内容'], comment['rpid']))
            except (OSError, KeyError):
                # Best effort: report the problem and move on to the next
                # comment without crawling this one's replies.
                print("写文件时发生错误")
                continue

            page = 1
            while True:
                url = "https://api.bilibili.com/x/v2/reply/reply?&jsonp=jsonp&pn=" + str(page) + \
                      "&type=1&oid=807015981&ps=10&root=" + str(comment['rpid']) + \
                      "&_=1638614197025"
                try:
                    childComments = get_comments(url)
                    if not childComments:
                        # No (more) replies: stop instead of requesting
                        # further pages forever.
                        break
                    # Open the reply file once per page rather than once per
                    # reply.
                    with open('子评论.txt', 'a+', encoding='utf-8') as f1:
                        for childComment in childComments:
                            f1.write(line_format.format(
                                childComment['用户名'],
                                childComment['评论内容'],
                                childComment['rpid']))
                except Exception:
                    # Any failure while fetching or saving a reply page ends
                    # the reply crawl for this comment (best effort).
                    break
                print("\t正在爬取第", index, "条评论", "\t第", page, "子页")
                page += 1
                # Lower the risk of an IP ban: rest one second every two
                # pages.
                if page % 2 == 0:
                    time.sleep(1)
    print('当前页面保存完成')
def main():
    """Crawl the video's counters and comments and save them to text files.

    Calls ``getNum`` to fill the like/coin/favourite globals, then walks up
    to ``endPage`` pages of the comment API, handing each page to
    ``WriteToFile``. Crawling stops at the first empty or failing page.
    Finally the totals are appended to '(点赞 投币 收藏 评论)总数.txt'.
    """
    getNum(headers)
    endPage = 5  # number of comment pages to crawl
    page = 1
    while page <= endPage:
        url = "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=" + str(
            page) + "&type=1&oid=807015981&mode=3&plat=1&_=1634475863039"
        try:
            comments = get_comments(url)
            if not comments:
                # Nothing more to fetch.
                break
            print("正在爬取第", page, "页")
            WriteToFile(comments)
        except Exception as exc:
            # Stop crawling but still write the totals below. Report the
            # error instead of hiding it; Ctrl+C is no longer swallowed.
            print("爬取第", page, "页时发生错误:", exc)
            break
        page = page + 1
        # Lower the risk of an IP ban: rest five seconds every ten pages.
        if page % 10 == 0:
            time.sleep(5)
    with open("(点赞 投币 收藏 评论)总数.txt", 'a+', encoding='utf-8') as f:
        f.write("点赞数:{}\t 投币数:{}\t 收藏数:{}\t 评论数:{}\n".format(like, coin, collect, commentsNum))
# Module-level state shared by the functions in this file. It is defined
# unconditionally (not only when run as a script) so that importing the
# module and calling its functions does not fail with NameError.
like = 0  # total number of likes
coin = 0  # total number of coins
collect = 0  # total number of favourites
commentsNum = 0  # total number of comments
# HTTP headers sent with every request.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

if __name__ == '__main__':
    main()
复制代码 |
|