b站视频下载 爬虫
单线程版本
# 单线程版本
import requests,re,json
from pathlib import Path
from ffmpy3 import FFmpeg
from tqdm import tqdm
# HTTP headers passed to every request issued by this script.
headers = {
    # The cookie value is placeholder text ("fill in your own here").
    "Cookie": "SESSDATA=这里填自己的",
    # One user-agent string, split over adjacent literals for readability.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.41"
    ),
    "Referer": "https://www.bilibili.com/",
}
def get_video_info(video_url: str) -> dict:
    """Collect the stream links of every part ("p") of a bilibili video.

    The number of parts is read from the ``cur-page`` span of the video page.
    Each part's page is then fetched and the JSON embedded in its
    ``window.__playinfo__`` and ``window.__INITIAL_STATE__`` scripts is parsed.

    Args:
        video_url: Link to the video page. Its ``p=`` query parameter is
            rewritten (or appended when absent) to address each part in turn.

    Returns:
        A dict mapping each part's title to a dict with the keys
        ``video_download_url`` and ``audio_download_url``. Parts that share a
        title overwrite each other.

    Raises:
        ValueError: If a page does not contain the expected embedded JSON, or
            the page counter cannot be parsed as ``current/total``.
    """
    first_page = requests.get(video_url, headers=headers)
    # Multi-part videos render a counter such as <span class="cur-page">(1/12)</span>.
    pages = re.findall(r'<span class="cur-page">\((.*?)\)</span>', first_page.text)
    total_page = 1
    if pages:
        # re.findall returns a list of captures; the first one holds "current/total".
        cur_page, _, last_page = pages[0].partition('/')
        total_page = int(last_page)
        print(f'当前为多p视频总页数为{total_page},当前页数为{cur_page}')
    else:
        print('当前为单p视频')

    video_info = {}
    for p in range(1, total_page + 1):
        if 'p=' in video_url:
            page_url = re.sub('p=.*', f'p={p}', video_url)
        else:
            # Without an existing p= parameter the substitution would be a no-op
            # and every iteration would fetch the same part.
            separator = '&' if '?' in video_url else '?'
            page_url = f'{video_url}{separator}p={p}'
        page_html = requests.get(page_url, headers=headers).text

        playinfo = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', page_html)  # download info
        state = re.findall(r'<script>window.__INITIAL_STATE__=(.*?);\(function\(\)', page_html)  # basic info
        if not playinfo or not state:
            raise ValueError(f'no embedded player data found in {page_url}')
        # json.loads needs the captured string, not the list returned by findall.
        video_down_json = json.loads(playinfo[0])
        video_base_json = json.loads(state[0])

        # Assumes videoData.pages is a list ordered by part number - TODO confirm.
        video_title = video_base_json['videoData']['pages'][p - 1]['part']
        # Assumes dash.video / dash.audio are lists of streams; the first entry
        # of each is used - TODO confirm which quality comes first.
        dash = video_down_json['data']['dash']
        video_info[video_title] = dict(
            video_download_url=dash['video'][0]['baseUrl'],
            audio_download_url=dash['audio'][0]['baseUrl'],
        )
    return video_info
def _stream_to_file(url: str, filename: str, chunk_size: int = 1024) -> None:
    """Stream the body of *url* into *filename* while showing a byte progress bar."""
    # stream=True keeps the body off the heap; the with-block releases the
    # connection even when writing fails half-way.
    with requests.get(url=url, headers=headers, stream=True) as res:
        res.raise_for_status()
        # Content-Length can be absent; tqdm then runs without a known total.
        content_size = int(res.headers.get('Content-Length', 0)) or None
        with open(filename, 'wb') as stream, tqdm(
            total=content_size, desc=filename, ncols=100, unit='byte', unit_scale=True
        ) as pbar:
            for data in res.iter_content(chunk_size=chunk_size):  # read in chunks
                stream.write(data)
                pbar.update(len(data))


def video_download(video_title: str, video_download_url: str, audio_download_url: str):
    """Download the audio track of one video part to ``<title>.mp3``.

    Characters that are not allowed in file names are replaced by ``_`` because
    the title comes from the web page.

    Args:
        video_title: Title of the part; used as the output file stem.
        video_download_url: Link of the video stream. Currently unused, see the
            note at the end of the function.
        audio_download_url: Link of the audio stream.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', video_title)
    _stream_to_file(audio_download_url, f'{safe_title}.mp3')
    # NOTE: only the audio track is saved. Downloading the video stream, merging
    # both files with FFmpeg and deleting the intermediates are disabled; to
    # re-enable them:
    # _stream_to_file(video_download_url, f'{safe_title}.mp4')
    # FFmpeg(inputs={f'{safe_title}.mp4': None, f'{safe_title}.mp3': None},
    #        outputs={f'{safe_title}.mkv': '-codec copy'}).run()
    # Path(f'{safe_title}.mp3').unlink()
    # Path(f'{safe_title}.mp4').unlink()
if __name__ == '__main__':
    # Script entry point: resolve the links of every part of the sample video,
    # download them one after another and print the elapsed download time.
    import time

    url = 'https://www.bilibili.com/video/BV1f3411B7HK?p=1'
    video_info = get_video_info(video_url=url)
    start_time = time.time()
    # The keys of video_info are iterated as part titles, so the links have to
    # be looked up per title rather than on the outer dict.
    for title, links in video_info.items():
        video_download(title, links['video_download_url'], links['audio_download_url'])
    print(f'time={time.time()-start_time}')
异步协程版本
# 单线程异步协程版本
import asyncio
import re
import aiohttp
import aiofiles
import json
import requests
from tqdm.asyncio import tqdm
# pathlib exports the class as ``Path``; there is no lowercase ``path`` name,
# so the previous spelling raised ImportError as soon as the script started.
from pathlib import Path
from ffmpy3 import FFmpeg
# HTTP headers passed to every request issued by this script.
headers = {
    # The cookie value is placeholder text ("fill in your own here").
    "Cookie": "SESSDATA=这里填自己的",
    # One user-agent string, split over adjacent literals for readability.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.41"
    ),
    "Referer": "https://www.bilibili.com/",
}
async def get_video_pages(video_url: str) -> int:
    """Return the number of parts ("p") of a bilibili video.

    The video page is fetched with aiohttp so the coroutine does not block the
    event loop, and the count is read from the page counter, e.g.
    ``<span class="cur-page">(1/12)</span>``.

    Args:
        video_url: Link to the video page.

    Returns:
        The total number of parts, or 1 when the page shows no counter.

    Raises:
        ValueError: If the counter text is not of the form ``current/total``.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url=video_url, headers=headers) as res:
            video_html = await res.text()
    pages = re.findall(r'<span class="cur-page">\((.*?)\)</span>', video_html)  # part counter
    total_page = 1
    if pages:
        # re.findall returns a list of captures; the first one holds "current/total".
        cur_page, _, last_page = pages[0].partition('/')
        total_page = int(last_page)
        print(f'当前为多p视频总页数为{total_page},当前页数为{cur_page}')
    else:
        print('当前为单p视频')
    return total_page
async def video_download(video_url: str, page: int):
    """Download the audio track of one video part to ``<title>.mp3``.

    The part's page is fetched, the JSON embedded in its
    ``window.__playinfo__`` and ``window.__INITIAL_STATE__`` scripts is parsed,
    and the audio stream is written to disk chunk by chunk.

    Args:
        video_url: Link to the page of this part.
        page: 1-based part number; selects the part's title.

    Returns:
        None.

    Raises:
        ValueError: If the page does not contain the expected embedded JSON.
        aiohttp.ClientResponseError: If a request answers with an error status.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url=video_url, headers=headers) as res:
            res.raise_for_status()
            video_html = await res.text()

        playinfo = re.findall(r'<script>window\.__playinfo__=(.*?)</script>', video_html)  # download info
        state = re.findall(r'<script>window.__INITIAL_STATE__=(.*?);\(function\(\)', video_html)  # basic info
        if not playinfo or not state:
            raise ValueError(f'no embedded player data found in {video_url}')
        # json.loads needs the captured string, not the list returned by findall.
        video_down_json = json.loads(playinfo[0])
        video_base_json = json.loads(state[0])

        # Assumes videoData.pages is a list ordered by part number - TODO confirm.
        video_title = video_base_json['videoData']['pages'][page - 1]['part']
        # Assumes dash.audio is a list of streams; the first entry is used -
        # TODO confirm which quality comes first.
        audio_download_url = video_down_json['data']['dash']['audio'][0]['baseUrl']
        # The title comes from the web page, so characters that are not allowed
        # in file names are replaced.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', video_title)

        async with session.get(url=audio_download_url, headers=headers) as res_audio:
            res_audio.raise_for_status()
            async with aiofiles.open(f'{safe_title}.mp3', 'wb') as stream:
                # Write chunk by chunk instead of holding the whole track in memory.
                async for chunk in res_audio.content.iter_chunked(64 * 1024):
                    await stream.write(chunk)
    # NOTE: only the audio track is saved. Downloading the video stream
    # (video_down_json['data']['dash']['video'][0]['baseUrl'] -> '<title>.mp4'),
    # merging both files with FFmpeg into '<title>.mkv' and deleting the
    # intermediates are disabled:
    # ff = FFmpeg(inputs={f'{safe_title}.mp4': None, f'{safe_title}.mp3': None},
    #             outputs={f'{safe_title}.mkv': '-codec copy'})
    # ff.run()
    # Path(f'{safe_title}.mp3').unlink()
    # Path(f'{safe_title}.mp4').unlink()
async def main():
    """Download every part of the sample video concurrently and print the elapsed time."""
    import time

    url = 'https://www.bilibili.com/video/BV1f3411B7HK?p=1'
    # Nothing else runs at this point, so the coroutine is awaited directly
    # instead of being wrapped in a task first.
    total_pages = await get_video_pages(video_url=url)

    tasks = []
    for p in range(1, total_pages + 1):
        if 'p=' in url:
            video_url = re.sub('p=.*', f'p={p}', url)
        else:
            # Without an existing p= parameter the substitution would be a no-op
            # and every task would fetch the same part.
            separator = '&' if '?' in url else '?'
            video_url = f'{url}{separator}p={p}'
        tasks.append(video_download(video_url, p))

    start = time.time()
    # The bar advances once per finished part, so it is labelled in parts
    # rather than in (scaled) bytes.
    with tqdm(asyncio.as_completed(tasks), total=len(tasks), unit='part') as pbar:
        for finished in pbar:
            await finished
    print(f'time={time.time() - start}')
if __name__ == '__main__':
    # Run main() to completion on a freshly created event loop that is also
    # installed as the current loop.
    # NOTE(review): the loop is never closed - presumably deliberate; confirm
    # before switching to asyncio.run().
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.run_until_complete(main())
厉害,可以请楼主帮忙看看这个问题吗?爬虫代码有问题,请各位大佬指点: https://fishc.com.cn/thread-207084-1-1.html (出处: 鱼C论坛)
页:
[1]