import time
import json
import requests
import pymysql
from pyquery import PyQuery as pq
def getNum(headers, url=None, timeout=50):
    """Scrape the like, coin and collect counters from a video page.

    The scraped values are stored in the module-level globals ``like``,
    ``coin`` and ``collect`` as the text found on the page; nothing is
    returned.

    Args:
        headers: HTTP headers sent with the request.
        url: Video page to scrape. When omitted, the video this script was
            written for is used.
        timeout: Seconds to wait for the server, so that a stalled
            connection cannot block the script indefinitely.

    Raises:
        requests.RequestException: If the page cannot be fetched in time.
    """
    global like, coin, collect

    def counter_text(selector, doc):
        # Drop the nested <i> and <canvas> elements before reading the text.
        node = doc(selector)
        node.find("i").remove()
        node.find("canvas").remove()
        return node.text()

    if url is None:
        url = "https://www.bilibili.com/video/BV1s34y1R7u9?spm_id_from=333.851.b_7265636f6d6d656e64.3"
    response = requests.get(url, headers=headers, timeout=timeout)
    doc = pq(response.text)
    like = counter_text(".like", doc)
    coin = counter_text(".coin", doc)
    collect = counter_text(".collect", doc)
# Fetch the comment data of the page
def get_html(url):
    """Download *url* and return the response body as text.

    The request is sent with the module-level ``headers`` and a 50 second
    timeout; the body is decoded as UTF-8.

    Args:
        url: Address to fetch.

    Returns:
        The response text, or ``None`` when the request failed or the
        server answered with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=50, headers=headers)
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are expected here; anything else
        # (including Ctrl-C) must not be silently swallowed.
        print("请求发生错误")
        return None
    response.encoding = 'utf-8'
    return response.text
def get_comments(url):
    """Fetch one page of comments from a reply API URL.

    Args:
        url: Endpoint whose JSON response holds the comments under
            ``data.replies``.

    Returns:
        A list with one dict per comment, each holding the keys
        ``'用户名'``, ``'评论内容'`` and ``'rpid'``. ``None`` is returned when
        the page could not be downloaded, is not valid JSON, or contains no
        ``data.replies`` list.

    Side effects:
        While the global ``commentsNum`` is still falsy, it is set from
        ``data.cursor.all_count`` if the response provides that field.
    """
    global commentsNum
    html = get_html(url)
    if not html:
        return None
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        # Stop here: without a parsed document there is nothing to read.
        print("JSON数据转化错误")
        return None

    payload = data.get('data') if isinstance(data, dict) else None
    if not isinstance(payload, dict):
        return None
    replies = payload.get('replies')
    if replies is None:
        return None

    if not commentsNum:
        # Not every response carries a cursor; keep the old value then.
        cursor = payload.get('cursor') or {}
        commentsNum = cursor.get('all_count', commentsNum)

    return [
        {
            '用户名': reply['member']['uname'],
            '评论内容': reply['content']['message'],
            'rpid': reply['rpid'],
        }
        for reply in replies
    ]
def WriteToFile(comments, oid=807015981):
    """Append comments to ``评论.txt`` and their replies to ``子评论.txt``.

    Each entry of *comments* is written as one line. Its replies are then
    fetched page by page through ``get_comments`` and appended to the
    second file until a page yields nothing.

    Args:
        comments: Iterable of dicts with the keys ``'用户名'``,
            ``'评论内容'`` and ``'rpid'``.
        oid: Value of the ``oid`` query parameter in the reply API URL.
    """
    line_format = '姓名:{}\t 评论内容:{}\t rpid: {} \n'

    def format_line(entry):
        return line_format.format(entry['用户名'], entry['评论内容'], entry['rpid'])

    with open('评论.txt', 'a+', encoding='utf-8') as f:
        for index, comment in enumerate(comments, start=1):
            try:
                f.write(format_line(comment))
                page = 1
                while True:
                    url = ("https://api.bilibili.com/x/v2/reply/reply?&jsonp=jsonp&pn=" + str(page)
                           + "&type=1&oid=" + str(oid) + "&ps=10&root=" + str(comment['rpid'])
                           + "&_=1638614197025")
                    try:
                        childComments = get_comments(url)
                    except Exception:
                        # A failed page ends the replies of this comment only.
                        break
                    if not childComments:
                        # Explicit stop: an empty page must not loop forever.
                        break
                    with open('子评论.txt', 'a+', encoding='utf-8') as f1:
                        f1.writelines(format_line(child) for child in childComments)
                    print("\t正在爬取第", index, "条评论", "\t第", page, "子页")
                    page += 1
                    # Lower the risk of an IP ban: pause 1 second every 2 pages.
                    if page % 2 == 0:
                        time.sleep(1)
            except Exception:
                # Best effort per comment; Ctrl-C is no longer swallowed.
                print("写文件时发生错误")
    print('当前页面保存完成')
def main():
    """Crawl the video counters and comment pages, then record the totals.

    Calls ``getNum`` with the module-level ``headers``, walks up to
    ``endPage`` comment pages (saving each through ``WriteToFile``) and
    finally appends the globals ``like``, ``coin``, ``collect`` and
    ``commentsNum`` to ``(点赞 投币 收藏 评论)总数.txt``.
    """
    getNum(headers)
    endPage = 5  # number of comment pages to crawl
    for page in range(1, endPage + 1):
        url = ("https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=" + str(page)
               + "&type=1&oid=807015981&mode=3&plat=1&_=1634475863039")
        try:
            comments = get_comments(url)
            if not comments:
                # Nothing (more) to read: stop paging.
                break
            print("正在爬取第", page, "页")
            WriteToFile(comments)
        except Exception as err:
            # Stop crawling but still write the totals gathered so far.
            print("爬取评论时发生错误:", err)
            break
        # Lower the risk of an IP ban: pause 5 seconds whenever the next
        # page number is a multiple of 10.
        if (page + 1) % 10 == 0:
            time.sleep(5)
    with open("(点赞 投币 收藏 评论)总数.txt", 'a+', encoding='utf-8') as f:
        f.write("点赞数:{}\t 投币数:{}\t 收藏数:{}\t 评论数:{}\n".format(like, coin, collect, commentsNum))
# Shared crawl state. It is defined at import time (not only when run as a
# script) so the functions that read these globals also work after a plain
# ``import`` of this module.
like = 0  # total likes
coin = 0  # total coins
collect = 0  # total favourites
commentsNum = 0  # total comments

# HTTP request headers (desktop Chrome user agent).
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

if __name__ == '__main__':
    main()
|