Scraping Bilibili Comments
I can't scrape the video comments; nothing comes out.

import time
import json
import requests
import pymysql
from pyquery import PyQuery as pq
def getNum(headers):
    # Fetch the like, coin, and favorite counts from the video page
    def get_num(clsName, doc):
        sign = doc(clsName)
        sign.find("i").remove()
        sign.find("canvas").remove()
        return sign.text()
    global like, coin, collect
    url = "https://www.bilibili.com/video/BV1s34y1R7u9?spm_id_from=333.851.b_7265636f6d6d656e64.3"
    r = requests.get(url, headers=headers)
    doc = pq(r.text)
    like = get_num(".like", doc)
    coin = get_num(".coin", doc)
    collect = get_num(".collect", doc)
# Fetch the raw comment data for one page
def get_html(url):
    try:
        r = requests.get(url, timeout=50, headers=headers)
        r.raise_for_status()
        r.encoding = 'utf-8'
    except requests.RequestException:
        print("Request failed")
        return None
    else:
        return r.text
def get_comments(url):
    global commentsNum
    comments = []
    html = get_html(url)
    if not html:
        return None
    try:
        data = json.loads(html)
    except json.JSONDecodeError:
        print("Failed to decode JSON")
        return None
    num = len(data['data']['replies'])  # number of comment entries on this page
    if not commentsNum:
        commentsNum = data['data']['cursor']['all_count']
    i = 0
    while i < num:
        comment = data['data']['replies'][i]  # the i-th reply on this page
        InfoDict = {}  # holds one comment's fields
        InfoDict['username'] = comment['member']['uname']
        InfoDict['content'] = comment['content']['message']
        InfoDict['rpid'] = comment['rpid']
        comments.append(InfoDict)
        i += 1
    return comments
def WriteToFile(comments):
    index = 0
    with open('comments.txt', 'a+', encoding='utf-8') as f:
        for comment in comments:
            try:
                i = 1
                error = 0
                page = 1
                index += 1
                f.write('Username: {}\t Content: {}\t rpid: {}\n'
                        .format(comment['username'], comment['content'], comment['rpid']))
                # Page through this comment's sub-replies until a request fails
                while error == 0:
                    url = "https://api.bilibili.com/x/v2/reply/reply?&jsonp=jsonp&pn=" + str(i) + \
                          "&type=1&oid=807015981&ps=10&root=" + str(comment['rpid']) + \
                          "&_=1638614197025"
                    try:
                        childComments = get_comments(url)
                        for childComment in childComments:
                            with open('child_comments.txt', 'a+', encoding='utf-8') as f1:
                                f1.write('Username: {}\t Content: {}\t rpid: {}\n'
                                         .format(childComment['username'], childComment['content'],
                                                 childComment['rpid']))
                        i += 1
                        print("\tScraping comment {}, sub-page {}".format(index, page))
                        page = page + 1
                        # Sleep 1 second every 2 pages to lower the risk of an IP ban
                        if page % 2 == 0:
                            time.sleep(1)
                    except Exception:
                        error = 1
            except Exception:
                print("Error while writing to file")
    print('Current page saved')
def main():
    getNum(headers)
    error = 0
    page = 1
    endPage = 5  # number of comment pages to scrape
    while error == 0 and page <= endPage:
        url = "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=" + str(
            page) + "&type=1&oid=807015981&mode=3&plat=1&_=1634475863039"
        try:
            comments = get_comments(url)
            if not comments:
                raise Exception
            print("Scraping page", page)
            WriteToFile(comments)
            page = page + 1
            # Sleep 5 seconds every 10 pages to lower the risk of an IP ban
            if page % 10 == 0:
                time.sleep(5)
        except Exception:
            error = 1
    with open('totals.txt', 'a+', encoding='utf-8') as f:
        f.write("Likes: {}\t Coins: {}\t Favorites: {}\t Comments: {}\n".format(like, coin, collect, commentsNum))
if __name__ == '__main__':
    like = 0         # total likes
    coin = 0         # total coins
    collect = 0      # total favorites
    commentsNum = 0  # total comment count
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    main()
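As an aside, the script hardcodes oid=807015981 for video BV1s34y1R7u9. To scrape a different video, the oid can usually be looked up from the BV id. A minimal sketch, assuming Bilibili's public view endpoint and its aid field still behave this way:

import requests

def bv_to_oid(bvid, headers):
    # For type=1 (video) comment queries, the video's aid is the oid
    # that the reply API expects.
    r = requests.get('https://api.bilibili.com/x/web-interface/view',
                     params={'bvid': bvid}, headers=headers, timeout=10)
    r.raise_for_status()
    return r.json()['data']['aid']

print(bv_to_oid('BV1s34y1R7u9', {'user-agent': 'Mozilla/5.0'}))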
When I scrape with Selenium instead, the HTML of the comment section never shows up.

suchocolate posted on 2021-11-28 13:22:
你的代码呢
import requests
from requests.exceptions import RequestException
import pymysql
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_num(clsName, doc):
    sign = doc(clsName)
    sign.find("i").remove()
    sign.find("canvas").remove()
    return sign.text()

browser = webdriver.Chrome()
browser.get('https://www.bilibili.com/video/BV17P4y1V7BX?spm_id_from=333.851.b_7265636f6d6d656e64.1')
wait = WebDriverWait(browser, 10)
div = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".comment-list")))
doc1 = pq(browser.page_source)
div = doc1(".comment-list")
print(div)
like = get_num(".like", doc1)
coin = get_num(".coin", doc1)
collect = get_num(".collect", doc1)
print(like, coin, collect)
browser.close()
def main():
    pass

if __name__ == '__main__':
    main()

1248762042 posted on 2021-11-28 16:07:
<div class="comment-list "/>
The result is just this one empty element.
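The empty container is consistent with the comment list being rendered client-side only after it scrolls into view: presence_of_element_located returns as soon as the still-empty .comment-list div exists in the DOM. A minimal sketch of one possible fix, scrolling down first and then waiting for an actual comment node; the ".comment-list .list-item" selector is an assumption about Bilibili's markup at the time and may have changed:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.bilibili.com/video/BV17P4y1V7BX')
# Scroll to the bottom so the lazy-loaded comment component starts fetching.
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
wait = WebDriverWait(browser, 20)
# Wait for a rendered comment item, not just the empty container
# (".comment-list .list-item" is an assumed selector; inspect the live page).
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".comment-list .list-item")))
html = browser.page_source  # now contains the populated comment markup
browser.quit()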
Requesting the API directly, as in the script above, is simpler.
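For reference, a minimal sketch of that approach, reusing the endpoint and parameters the script above builds by string concatenation (oid 807015981 is this thread's example video):

import requests

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
url = 'https://api.bilibili.com/x/v2/reply/main'
params = {'type': 1, 'oid': 807015981, 'mode': 3, 'next': 1}  # next=1: first page
data = requests.get(url, params=params, headers=headers, timeout=10).json()
for reply in data['data']['replies']:
    print(reply['member']['uname'], '->', reply['content']['message'])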