爬取B站评论,下面代码只能爬取一级评论,有没有大佬指导一下如何爬取交互回复信息。
import csv
import re
import time

import requests
# HTTP request headers sent with every request made by this script.
header = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
}

# Comment API URL template. The two `{}` placeholders are filled, in order,
# with the `next` value (page number) and the `oid` value (video aid).
original_url = 'https://api.bilibili.com/x/v2/reply/main?jsonp=jsonp&next={}&type=1&oid={}&mode=3'
# Convert a Unix timestamp into a formatted date string.
def get_time(ctime, fmt="%Y.%m.%d"):
    """Format the Unix timestamp *ctime* as a local-time date string.

    Args:
        ctime: Seconds since the epoch.
        fmt: ``time.strftime`` format; defaults to ``"%Y.%m.%d"``.

    Returns:
        The formatted date as a ``str``.
    """
    # strftime already returns a str, so no extra conversion is needed.
    return time.strftime(fmt, time.localtime(ctime))
# Look up the aid of a video from its BV id.
def get_oid(bvid):
    """Return the numeric ``aid`` found in the page of video *bvid*.

    The video page is downloaded and the first ``"aid":<digits>`` pair in
    its text is extracted.

    Args:
        bvid: The video's BV id, appended to the video page URL.

    Returns:
        The aid as a string of digits.

    Raises:
        requests.RequestException: If the page request fails or times out.
        ValueError: If no ``"aid":<digits>`` pair is present in the page.
    """
    video_url = 'https://www.bilibili.com/video/' + bvid
    # A timeout keeps an unresponsive server from hanging the script forever.
    page = requests.get(video_url, headers=header, timeout=10).text
    # Capture the digits after the key; the previous pattern matched only
    # the key itself, so the function returned the literal text '"aid":'.
    match = re.search(r'"aid":\s*(\d+)', page)
    if match is None:
        raise ValueError('未能在视频页面中找到aid: ' + bvid)
    return match.group(1)
# Crawl comments page by page, writing each page to the CSV file as it arrives.
def _write_comment_rows(csv_writer, comments):
    """Write one row per top-level comment, followed by its embedded replies.

    Returns the number of top-level comments written (reply rows are not
    counted).
    """
    written = 0
    for comment in comments or []:
        # Collapse all whitespace so every comment stays on one CSV line.
        message = re.sub(r'\s+', '', comment['content']['message'])
        csv_writer.writerow([
            get_time(comment['ctime']),
            str(comment['like']),
            comment['member']['uname'],
            message,
        ])
        written += 1
        # Guard against a missing or null `replies` value.
        # NOTE(review): this only writes the replies embedded in the
        # top-level comment object; whether that is the full thread is not
        # visible from this file - confirm against the API.
        for reply in comment.get('replies') or []:
            reply_message = re.sub(r'\s+', '', reply['content']['message'])
            csv_writer.writerow([
                "REPLY",
                str(reply['like']),
                reply['member']['uname'],
                reply_message,
            ])
    return written


def online_save(bvid, *, delay=5):
    """Download the comments of video *bvid* into ``<bvid>_评论.csv``.

    The first page is fetched to read the total comment count. Further pages
    are then requested until that count is reached, a page comes back empty,
    or a request or parse error occurs. Rows are written as each page
    arrives, so a failure part-way through keeps what was already saved.

    Args:
        bvid: The video's BV id.
        delay: Seconds to wait before each follow-up page request.

    Raises:
        requests.RequestException, ValueError, KeyError, TypeError: If the
            aid lookup or the first page cannot be fetched or parsed. Errors
            on later pages are reported and end the crawl instead.
    """
    oid = get_oid(bvid)
    page = 1
    data = requests.get(original_url.format(page, oid), headers=header, timeout=10).json()
    count = int(data['data']['cursor']['all_count'])
    fname = bvid + '_评论.csv'
    all_count = 0
    # utf_8_sig writes a BOM so spreadsheet tools detect the encoding.
    with open(fname, 'w', newline='', encoding='utf_8_sig') as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["时间", "点赞", "用户名", "评论"])
        all_count += _write_comment_rows(csv_writer, data['data']['replies'])
        print('总评论数:{},当前评论数:{},爬取Page{}完毕。'.format(count, all_count, page))
        while all_count < count:
            # Pause between requests to avoid hammering the API.
            time.sleep(delay)
            page += 1
            try:
                response = requests.get(original_url.format(page, oid), headers=header, timeout=10)
                comments = response.json()['data']['replies']
                if not comments:
                    # An empty page never advances all_count; stop here
                    # rather than requesting pages forever.
                    break
                all_count += _write_comment_rows(csv_writer, comments)
            except (requests.RequestException, ValueError, KeyError, TypeError) as err:
                # Keep the rows already written, but say why the crawl ended.
                print('爬取Page{}失败,停止爬取:{!r}'.format(page, err))
                break
            print('总评论数:{},当前评论数:{},爬取Page{}完毕。'.format(count, all_count, page))
if __name__ == '__main__':
    # Prompt for the BV id; strip stray whitespace from a pasted value so it
    # does not end up in the request URL or the output file name.
    bvid = input('输入视频Bvid:').strip()
    online_save(bvid)
    print('完成!')
@不二如是
页:
[1]