B站的评论爬取

1248762042 · 发表于 2021-11-28 12:49:32

视频评论爬取不出来

1248762042 · 发表于 2021-12-4 22:15:59

import time
import json
import requests
import pymysql
from pyquery import PyQuery as pq
def getNum(
    headers,
    url="https://www.bilibili.com/video/BV1s34y1R7u9?spm_id_from=333.851.b_7265636f6d6d656e64.3",
    timeout=50,
):
    """Scrape the like / coin / favourite counters of a video page.

    The results are stored in the module globals ``like``, ``coin`` and
    ``collect``; nothing is returned.

    Args:
        headers: Mapping passed to ``requests.get`` as the request headers.
        url: Video page to fetch. Defaults to the video the script was
            written for.
        timeout: Seconds passed to ``requests.get``. The default of 50 matches
            the timeout ``get_html`` uses, so a stalled connection raises
            instead of hanging forever.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """

    def get_num(clsName, doc):
        """Return the text of the nodes matching *clsName*, minus nested <i>/<canvas>."""
        node = doc(clsName)
        for tag in ("i", "canvas"):
            node.find(tag).remove()
        return node.text()

    global like, coin, collect
    r = requests.get(url, headers=headers, timeout=timeout)
    doc = pq(r.text)
    # NOTE(review): presumably the counters are present in the served HTML;
    # empty strings here would mean the page markup differs - confirm.
    like = get_num(".like", doc)
    coin = get_num(".coin", doc)
    collect = get_num(".collect", doc)
# Fetch the raw comment data for a page
def get_html(url):
    """Download *url* and return the response body as text.

    The request uses the module-level ``headers`` and a 50 second timeout, and
    the body is decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The response text, or ``None`` (after printing a message) when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=50, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here, so Ctrl-C and
        # programming errors are no longer silently reported as a
        # "request error".
        print("请求发生错误")
        return None
    r.encoding = 'utf-8'
    return r.text
def get_comments(url):
    """Fetch one page of the comment API and extract its replies.

    On the first page that carries ``data.cursor.all_count`` the value is
    stored in the module global ``commentsNum`` (only while that global is
    still falsy).

    Args:
        url: Comment API address, downloaded through ``get_html``.

    Returns:
        A list of dicts with the keys ``'用户名'``, ``'评论内容'`` and
        ``'rpid'``, one per reply on the page. ``None`` when the page could
        not be downloaded, is not valid JSON, or has no ``data.replies``
        list; both callers in this file treat ``None`` as "stop paging".
    """
    global commentsNum
    html = get_html(url)
    if not html:
        return None
    try:
        data = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print("JSON数据转化错误")
        # Without this return the code below would hit an unbound `data`.
        return None

    payload = data.get('data') if isinstance(data, dict) else None
    if not isinstance(payload, dict):
        return None
    replies = payload.get('replies')
    if replies is None:
        return None

    if not commentsNum:
        # Not every payload has a cursor section; fall back to 0 instead of
        # raising KeyError.
        cursor = payload.get('cursor') or {}
        commentsNum = cursor.get('all_count', 0)

    return [
        {
            '用户名': reply['member']['uname'],
            '评论内容': reply['content']['message'],
            'rpid': reply['rpid'],
        }
        for reply in replies
    ]
def WriteToFile(comments):
    """Append top-level comments and all of their replies to text files.

    Each comment is appended to ``评论.txt``. Its replies are then fetched
    page by page through ``get_comments`` and appended to ``子评论.txt``.
    Paging for a comment stops at the first page that fails to load or
    contains no replies.

    Args:
        comments: Iterable of dicts with the keys ``'用户名'``,
            ``'评论内容'`` and ``'rpid'`` (the shape ``get_comments``
            returns).
    """
    line = '姓名：{}\t 评论内容：{}\t rpid: {} \n'
    with open('评论.txt', 'a+', encoding='utf-8') as f:
        for index, comment in enumerate(comments, start=1):
            try:
                f.write(line.format(comment['用户名'], comment['评论内容'], comment['rpid']))
                page = 1
                while True:
                    url = (
                        "https://api.bilibili.com/x/v2/reply/reply?&jsonp=jsonp&pn=" + str(page)
                        + "&type=1&oid=807015981&ps=10&root=" + str(comment['rpid'])
                        + "&_=1638614197025"
                    )
                    try:
                        childComments = get_comments(url)
                    except Exception:
                        # A failed or malformed page ends paging for this comment.
                        break
                    if not childComments:
                        # None or an empty page: no more replies. Previously an
                        # empty list kept this loop running forever.
                        break
                    # Open the reply file once per page instead of once per reply.
                    with open('子评论.txt', 'a+', encoding='utf-8') as f1:
                        f1.writelines(
                            line.format(child['用户名'], child['评论内容'], child['rpid'])
                            for child in childComments
                        )
                    print("\t正在爬取第", index, "条评论", "\t第", page, "子页")
                    page += 1
                    # Lower the risk of an IP ban: pause 1 second every 2 pages.
                    if page % 2 == 0:
                        time.sleep(1)
            except Exception:
                # `Exception` rather than a bare except so Ctrl-C still stops the run.
                print("写文件时发生错误")
    print('当前页面保存完成')
def main():
    """Crawl up to five pages of comments, then append the totals to a file.

    Paging stops early at the first page that yields no comments or raises.
    """
    getNum(headers)
    last_page = 5  # number of comment pages to crawl
    page = 1
    while page <= last_page:
        url = "".join([
            "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=",
            str(page),
            "&type=1&oid=807015981&mode=3&plat=1&_=1634475863039",
        ])
        try:
            comments = get_comments(url)
            if not comments:
                raise Exception
            print("正在爬取第", page, "页")
            WriteToFile(comments)
            page += 1
            # Lower the risk of an IP ban: pause 5 seconds every 10 pages.
            if page % 10 == 0:
                time.sleep(5)
        except:
            # Any failure, including the deliberate raise above, ends the crawl.
            break
    summary = "点赞数:{}\t 投币数:{}\t 收藏数:{}\t 评论数:{}\n".format(like, coin, collect, commentsNum)
    with open("(点赞投币收藏评论)总数.txt", 'a+', encoding='utf-8') as f:
        f.write(summary)
if __name__ == '__main__':
    # Totals that getNum() and get_comments() overwrite through `global`.
    like = coin = collect = 0  # likes, coins, favourites
    commentsNum = 0  # total number of comments
    # Browser-like headers sent with every request made by this script.
    headers = {
        'accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        ),
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        ),
    }
    main()

复制代码

1248762042 · 发表于 2021-11-28 12:56:20

用 Selenium 爬取时，爬取不出评论部分的 HTML 代码

suchocolate · 发表于 2021-11-28 13:22:35

你的代码呢

1248762042 · 发表于 2021-11-28 16:07:21

suchocolate 发表于 2021-11-28 13:22
你的代码呢

import requests
from requests.exceptions import RequestException
import pymysql
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_num(clsName, doc):
    """Return the text of the nodes matching *clsName* in *doc*.

    Nested ``<i>`` and ``<canvas>`` elements are removed from the selection
    first, so only the remaining text is returned.
    """
    node = doc(clsName)
    for tag in ("i", "canvas"):
        node.find(tag).remove()
    return node.text()
# Open the video page in Chrome, dump the comment container and print the
# like / coin / favourite counters.
browser = webdriver.Chrome()
try:
    browser.get('https://www.bilibili.com/video/BV17P4y1V7BX?spm_id_from=333.851.b_7265636f6d6d656e64.1')
    wait = WebDriverWait(browser, 10)
    # NOTE(review): this only waits for the container element itself. The
    # reported output was an empty <div class="comment-list "/>, so the
    # comments are presumably inserted later (e.g. after scrolling) - confirm
    # and wait for a child element instead.
    div = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".comment-list")))
    doc1 = pq(browser.page_source)
    div = doc1(".comment-list")
    print(div)
    like = get_num(".like", doc1)
    coin = get_num(".coin", doc1)
    collect = get_num(".collect", doc1)
    print(like, coin, collect)
finally:
    # Always shut the driver down, even when the wait times out; quit() ends
    # the whole session rather than just closing the current window.
    browser.quit()
def main():
    """Entry-point stub; it currently does nothing and returns None."""
    return None


if __name__ == '__main__':
    main()

复制代码

1248762042 · 发表于 2021-11-28 16:08:02

1248762042 发表于 2021-11-28 16:07

<div class="comment-list "/>
结果只有一个这个

复制代码

路神 · 发表于 2021-11-29 09:11:34

直接请求接口省事

1248762042 · 发表于 2021-12-4 22:15:25

import time
import json
import requests
import pymysql
from pyquery import PyQuery as pq
def getNum(
    headers,
    url="https://www.bilibili.com/video/BV1s34y1R7u9?spm_id_from=333.851.b_7265636f6d6d656e64.3",
    timeout=50,
):
    """Scrape the like / coin / favourite counters of a video page.

    The results are stored in the module globals ``like``, ``coin`` and
    ``collect``; nothing is returned.

    Args:
        headers: Mapping passed to ``requests.get`` as the request headers.
        url: Video page to fetch. Defaults to the video the script was
            written for.
        timeout: Seconds passed to ``requests.get``. The default of 50 matches
            the timeout ``get_html`` uses, so a stalled connection raises
            instead of hanging forever.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """

    def get_num(clsName, doc):
        """Return the text of the nodes matching *clsName*, minus nested <i>/<canvas>."""
        node = doc(clsName)
        for tag in ("i", "canvas"):
            node.find(tag).remove()
        return node.text()

    global like, coin, collect
    r = requests.get(url, headers=headers, timeout=timeout)
    doc = pq(r.text)
    # NOTE(review): presumably the counters are present in the served HTML;
    # empty strings here would mean the page markup differs - confirm.
    like = get_num(".like", doc)
    coin = get_num(".coin", doc)
    collect = get_num(".collect", doc)
# Fetch the raw comment data for a page
def get_html(url):
    """Download *url* and return the response body as text.

    The request uses the module-level ``headers`` and a 50 second timeout, and
    the body is decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The response text, or ``None`` (after printing a message) when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=50, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are handled here, so Ctrl-C and
        # programming errors are no longer silently reported as a
        # "request error".
        print("请求发生错误")
        return None
    r.encoding = 'utf-8'
    return r.text
def get_comments(url):
    """Fetch one page of the comment API and extract its replies.

    On the first page that carries ``data.cursor.all_count`` the value is
    stored in the module global ``commentsNum`` (only while that global is
    still falsy).

    Args:
        url: Comment API address, downloaded through ``get_html``.

    Returns:
        A list of dicts with the keys ``'用户名'``, ``'评论内容'`` and
        ``'rpid'``, one per reply on the page. ``None`` when the page could
        not be downloaded, is not valid JSON, or has no ``data.replies``
        list; both callers in this file treat ``None`` as "stop paging".
    """
    global commentsNum
    html = get_html(url)
    if not html:
        return None
    try:
        data = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print("JSON数据转化错误")
        # Without this return the code below would hit an unbound `data`.
        return None

    payload = data.get('data') if isinstance(data, dict) else None
    if not isinstance(payload, dict):
        return None
    replies = payload.get('replies')
    if replies is None:
        return None

    if not commentsNum:
        # Not every payload has a cursor section; fall back to 0 instead of
        # raising KeyError.
        cursor = payload.get('cursor') or {}
        commentsNum = cursor.get('all_count', 0)

    return [
        {
            '用户名': reply['member']['uname'],
            '评论内容': reply['content']['message'],
            'rpid': reply['rpid'],
        }
        for reply in replies
    ]
def WriteToFile(comments):
    """Append top-level comments and all of their replies to text files.

    Each comment is appended to ``评论.txt``. Its replies are then fetched
    page by page through ``get_comments`` and appended to ``子评论.txt``.
    Paging for a comment stops at the first page that fails to load or
    contains no replies.

    Args:
        comments: Iterable of dicts with the keys ``'用户名'``,
            ``'评论内容'`` and ``'rpid'`` (the shape ``get_comments``
            returns).
    """
    line = '姓名：{}\t 评论内容：{}\t rpid: {} \n'
    with open('评论.txt', 'a+', encoding='utf-8') as f:
        for index, comment in enumerate(comments, start=1):
            try:
                f.write(line.format(comment['用户名'], comment['评论内容'], comment['rpid']))
                page = 1
                while True:
                    url = (
                        "https://api.bilibili.com/x/v2/reply/reply?&jsonp=jsonp&pn=" + str(page)
                        + "&type=1&oid=807015981&ps=10&root=" + str(comment['rpid'])
                        + "&_=1638614197025"
                    )
                    try:
                        childComments = get_comments(url)
                    except Exception:
                        # A failed or malformed page ends paging for this comment.
                        break
                    if not childComments:
                        # None or an empty page: no more replies. Previously an
                        # empty list kept this loop running forever.
                        break
                    # Open the reply file once per page instead of once per reply.
                    with open('子评论.txt', 'a+', encoding='utf-8') as f1:
                        f1.writelines(
                            line.format(child['用户名'], child['评论内容'], child['rpid'])
                            for child in childComments
                        )
                    print("\t正在爬取第", index, "条评论", "\t第", page, "子页")
                    page += 1
                    # Lower the risk of an IP ban: pause 1 second every 2 pages.
                    if page % 2 == 0:
                        time.sleep(1)
            except Exception:
                # `Exception` rather than a bare except so Ctrl-C still stops the run.
                print("写文件时发生错误")
    print('当前页面保存完成')
def main():
    """Crawl up to five pages of comments, then append the totals to a file.

    Paging stops early at the first page that yields no comments or raises.
    """
    getNum(headers)
    last_page = 5  # number of comment pages to crawl
    page = 1
    while page <= last_page:
        url = "".join([
            "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=",
            str(page),
            "&type=1&oid=807015981&mode=3&plat=1&_=1634475863039",
        ])
        try:
            comments = get_comments(url)
            if not comments:
                raise Exception
            print("正在爬取第", page, "页")
            WriteToFile(comments)
            page += 1
            # Lower the risk of an IP ban: pause 5 seconds every 10 pages.
            if page % 10 == 0:
                time.sleep(5)
        except:
            # Any failure, including the deliberate raise above, ends the crawl.
            break
    summary = "点赞数:{}\t 投币数:{}\t 收藏数:{}\t 评论数:{}\n".format(like, coin, collect, commentsNum)
    with open("(点赞投币收藏评论)总数.txt", 'a+', encoding='utf-8') as f:
        f.write(summary)
if __name__ == '__main__':
    # Totals that getNum() and get_comments() overwrite through `global`.
    like = coin = collect = 0  # likes, coins, favourites
    commentsNum = 0  # total number of comments
    # Browser-like headers sent with every request made by this script.
    headers = {
        'accept': (
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        ),
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        ),
    }
    main()

复制代码

账号		自动登录	找回密码
密码			立即注册

B站的评论爬取

浏览过的版块