|
楼主 |
发表于 2022-5-1 01:09:41
|
显示全部楼层
嘻嘻,我根据您帮我找到的 response,刚写了一个导出排行榜的脚本!写的过程中各种卡壳,JSON 导出来的文件也好难读懂哦。其实我是不是用正则表达式会更快找到我需要的数据啊?
import requests
import json
import re
import openpyxl as xl
import time
def get_input():
    """Prompt for the search keyword and the number of result pages.

    The page count is requested again until the user enters an integer in
    the inclusive range 1..28 that the prompt advertises.

    Returns:
        tuple[str, int]: ``(keywords, pages)``.
    """
    keywords = input("请输入关键词:")
    prompt = "请输入要爬取得页数(1~28):"
    while True:
        try:
            pages = int(input(prompt))
        except ValueError:
            # Non-numeric entry: ask again instead of crashing.
            pages = None
        # Inclusive upper bound: the previous range(1, 28) check rejected 28
        # even though the prompt offers it.
        if pages is not None and 1 <= pages <= 28:
            return keywords, pages
        prompt = "请输入正确的页数:"
def get_res(url, timeout=10):
    """Fetch one page of search results and extract the rows to export.

    Args:
        url: Full search-API URL for a single result page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[list]: One ``[title, play, favorites, author, arcurl,
        description]`` row per entry found in the last group of
        ``data.result`` in the JSON response.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError: If the JSON does not have the expected layout.
    """
    headers = {
        # The key was misspelled 'uesr-agent', so the browser User-Agent
        # below was never actually sent as the User-Agent header.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    # A timeout keeps a stalled connection from hanging the whole run.
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    payload = json.loads(res.text)  # parse the JSON body
    # Navigate to the list of entries: last group of data.result.
    entries = payload['data']['result'][-1]['data']
    result = []
    for entry in entries:
        # Strip the keyword-highlight markup left inside the title.
        title = entry['title'].replace('<em class="keyword">', '').replace('</em>', '')
        # Row order: title, play count, favorites, uploader, URL, description.
        result.append([
            title,
            entry['play'],
            entry['favorites'],
            entry['author'],
            entry['arcurl'],
            entry['description'],
        ])
    return result
def save_excel(result, keywords):
    """Write the collected rows to ``B站<keywords>排行版.xlsx``.

    The file is saved relative to the current working directory.

    Args:
        result: Iterable of rows, each ``[title, play, favorites, author,
            url, description]``.
        keywords: Search keyword; embedded in the output file name after
            characters that are invalid in file names are replaced by ``_``.
    """
    wb = xl.Workbook()
    wb.guess_types = True  # original author's note: 1 means number, 0 means text
    ws = wb.active
    # Third column holds the 'favorites' field, so label it 收藏 (favorites)
    # rather than 点赞 (likes).
    ws.append(['标题', '播放量', '收藏', 'UP主', 'URL', '简介'])
    for row in result:
        ws.append(row)

    # Layout
    ws.column_dimensions['A'].width = 80
    ws.column_dimensions['E'].width = 45
    ws.freeze_panes = 'F1'
    # Build the header font once and reuse it for every header cell.
    header_font = xl.styles.Font(size=16, bold=True, color='FF0000')
    for header_row in ws['A1':'F1']:
        for cell in header_row:
            cell.font = header_font

    # Path separators and other reserved characters in the keyword would make
    # the save fail (or write somewhere unexpected); replace them.
    safe_keywords = keywords.translate(
        str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    )
    wb.save(f'B站{safe_keywords}排行版.xlsx')
def main():
    """Ask for a keyword and page count, fetch every page, export to Excel."""
    from urllib.parse import quote

    keywords, pages = get_input()
    # Percent-encode the keyword so characters such as '&', '#', '+' or
    # spaces cannot corrupt the query string.
    encoded_keywords = quote(keywords, safe='')
    result_pages = []
    for page in range(1, pages + 1):
        url = (
            'https://api.bilibili.com/x/web-interface/search/all/v2'
            '?__refresh__=true&_extra=&context='
            f'&page={page}&page_size=42&order=&duration='
            '&from_source=&from_spmid=333.337&platform=pc'
            '&highlight=1&single_column=0'
            f'&keyword={encoded_keywords}&preload=true&com2co=true'
        )
        result_pages.extend(get_res(url))
    # The raw keyword (not the encoded one) is used for the file name.
    save_excel(result_pages, keywords)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|