爬取到的视频数据仅有 1 KB
from multiprocessing.dummy import Pool

import requests
from lxml import etree
# Category page whose <li> entries list the videos to download.
LIST_URL = 'https://www.pearvideo.com/category_8'
# JSON endpoint that returns the (placeholder) media address of one video.
STATUS_URL = 'https://www.pearvideo.com/videoStatus.jsp'
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Characters that must not appear in a file name on common file systems.
FORBIDDEN_FILE_CHARS = '\\/:*?"<>|\r\n\t'


def extract_cont_id(video_href):
    """Return the content id from a list-page href such as ``video_1716868``.

    Everything after the last underscore is returned; an href without an
    underscore is returned stripped but otherwise unchanged.
    """
    return video_href.strip().rpartition('_')[2]


def build_real_video_url(src_url, cont_id):
    """Turn the ``srcUrl`` reported by the status endpoint into a usable URL.

    The reported address carries an all-digit placeholder in front of the
    first ``-`` of the file name; the playable address has ``cont-<id>`` in
    that position. Addresses that do not have that shape are returned
    unchanged.
    """
    directory, slash, file_name = src_url.rpartition('/')
    placeholder, dash, remainder = file_name.partition('-')
    if not dash or not placeholder.isdigit():
        return src_url
    return directory + slash + 'cont-' + cont_id + dash + remainder


def safe_file_name(title):
    """Return *title* with characters that are illegal in file names replaced.

    Falls back to ``'video'`` when nothing is left after stripping.
    """
    cleaned = ''.join(
        '_' if ch in FORBIDDEN_FILE_CHARS else ch for ch in title.strip()
    )
    return cleaned or 'video'


def spider(video_imfo):
    """Download one video into the current directory.

    Args:
        video_imfo: dict with ``'video_id'`` (the list-page href, e.g.
            ``video_1716868``) and ``'video_name'`` (the title used as the
            file name).

    Network and response-format errors are reported on stdout and the video
    is skipped, so one bad entry does not abort the whole pool.
    """
    name = video_imfo['video_name']
    cont_id = extract_cont_id(video_imfo['video_id'])
    print(name, '————正在下载————')

    headers = {
        # The status endpoint is queried with the video page as Referer.
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,
        "user-agent": USER_AGENT,
    }
    param = {
        'contId': cont_id,
        'mrd': '0.5326134291313338',
    }

    try:
        status = requests.get(
            url=STATUS_URL, params=param, headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        status.raise_for_status()
        src_url = status.json()['videoInfo']['videos']['srcUrl']
        video_url = build_real_video_url(src_url, cont_id)
        print(video_url)
        video = requests.get(
            url=video_url, headers=headers, timeout=REQUEST_TIMEOUT,
        )
        video.raise_for_status()
    except (requests.RequestException, KeyError, TypeError, ValueError) as err:
        print(name, '————下载失败————', err)
        return

    file_name = './' + safe_file_name(name) + '.mp4'
    with open(file_name, 'wb') as fp:
        fp.write(video.content)
    print(name + '.mp4' + '————下载好了————')


def fetch_video_infos():
    """Scrape the category page and return one info dict per listed video.

    Each dict has the keys ``'video_id'`` (href of the entry) and
    ``'video_name'`` (its title). Entries missing either value are skipped.
    """
    page = requests.get(
        url=LIST_URL, headers={"user-agent": USER_AGENT},
        timeout=REQUEST_TIMEOUT,
    )
    page.raise_for_status()
    tree = etree.HTML(page.text)

    video_imfos = []
    for li in tree.xpath('//*[@id="listvideoListUl"]/li'):
        # xpath() returns lists; only the first match of each is wanted.
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div/text()')
        if not hrefs or not titles:
            continue
        video_imfos.append({
            'video_id': hrefs[0],
            'video_name': titles[0].strip(),
        })
    return video_imfos


def main():
    """Fetch the video list and download every entry with four threads."""
    video_imfos = fetch_video_infos()
    pool = Pool(4)
    try:
        pool.map(spider, video_imfos)
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    main()
链接已经打不开了 bonst 发表于 2021-1-18 13:58
链接已经打不开了
对,我其实很好奇,这种怎么办?我是爬取的梨视频的信息
https://www.pearvideo.com/category_8 你是要获取下拉页面的全部视频的url吗 这个网站请求视频是206响应,你需要在headers中添加Range请求头,告诉服务器你需要请求哪一部分信息,你需要下载整个视频,那Range的值就是bytes=0-。
header = {
'Range': 'bytes=0-'
}
因为 mp4 地址被加密了,是 js 加密。
bonst 发表于 2021-1-18 14:12
你是要获取下拉页面的全部视频的url吗
对,我想尝试获取下 YunGuo 发表于 2021-1-18 15:37
这个网站请求视频是206响应,你需要在headers中添加Range请求头,告诉服务器你需要请求哪一部分信息,你需 ...
添加了,但是获取到的还不是真实的视频信息
https://video.pearvideo.com/mp4/adshort/20210118/1610969142552-15578665_adpkg-ad_hd.mp4
这个是存在headers上的地址,这个视频的地址是无效的,
通过审查元素可以获取到视频的地址为 https://video.pearvideo.com/mp4/adshort/20210118/cont-1716833-15578665_adpkg-ad_hd.mp4
可以看出,真实地址中的cont-1716833被替换为1610969142552,
所以为获取有效地址,需要将response中的1610969142552进行替换,但是现在不清楚怎么获取 cont-1716833 的值,
本帖最后由 YunGuo 于 2021-1-19 15:20 编辑
学知识拯救世界 发表于 2021-1-18 19:40
https://video.pearvideo.com/mp4/adshort/20210118/1610969142552-15578665_adpkg-ad_hd.mp4"
这个时存在 ...
这么说吧,比如从接口中获取的视频地址是这个:
https://video.pearvideo.com/mp4/adshort/20210118/1611040271440-15578857_adpkg-ad_hd.mp4
而真实请求地址是这个:
https://video.pearvideo.com/mp4/adshort/20210118/cont-1716868-15578857_adpkg-ad_hd.mp4
你只需要把1611040271440替换成cont-1716868,而1716868其实就是这个视频的id而已,id从列表页中就能获取到。
YunGuo 发表于 2021-1-19 15:17
这么说吧,比如从接口中获取的视频地址是这个:
https://video.pearvideo.com/mp4/adshort/20210118/1 ...
对
from lxml import etree
import requests
from multiprocessing.dummy import Pool
import re
# Category page whose <li> entries list the videos to download.
LIST_URL = 'https://www.pearvideo.com/category_8'
# JSON endpoint that returns the (placeholder) media address of one video.
STATUS_URL = 'https://www.pearvideo.com/videoStatus.jsp'
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
# Seconds to wait for any single HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Trailing digits of a list-page href such as "video_1716868".
CONT_ID_PATTERN = re.compile(r'(\d+)\s*$')
# All-digit placeholder (9+ digits) that opens the file name of a srcUrl.
PLACEHOLDER_PATTERN = re.compile(r'(?<=/)\d{9,}(?=-[^/]*$)')
# Characters that must not appear in a file name on common file systems.
FORBIDDEN_FILE_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def extract_cont_id(video_href):
    """Return the trailing digits of a list-page href like ``video_1716868``.

    An href that does not end in digits is returned stripped but otherwise
    unchanged.
    """
    match = CONT_ID_PATTERN.search(video_href)
    return match.group(1) if match else video_href.strip()


def build_real_video_url(src_url, cont_id):
    """Replace the numeric placeholder in *src_url* with ``cont-<cont_id>``.

    The status endpoint reports e.g. ``.../1611040271440-15578857_x.mp4``
    while the playable file is ``.../cont-1716868-15578857_x.mp4``. A URL
    without such a placeholder is returned unchanged.
    """
    # A callable replacement keeps cont_id from being read as a regex template.
    return PLACEHOLDER_PATTERN.sub(lambda _m: 'cont-' + cont_id, src_url, count=1)


def safe_file_name(title):
    """Return *title* with characters that are illegal in file names replaced.

    Falls back to ``'video'`` when nothing is left after stripping.
    """
    return FORBIDDEN_FILE_CHARS.sub('_', title.strip()) or 'video'


def spider(video_imfo):
    """Download one video into the current directory.

    Args:
        video_imfo: dict with ``'video_id'`` (the list-page href, e.g.
            ``video_1716868``) and ``'video_name'`` (the title used as the
            file name).

    Failures are reported on stdout instead of being silently swallowed, and
    the video is skipped so the remaining downloads continue.
    """
    name = video_imfo['video_name']
    cont_id = extract_cont_id(video_imfo['video_id'])
    print(name, '————正在下载————')

    headers = {
        # Ask for the whole file; the media server answers with 206.
        'Range': 'bytes=0-',
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,
        "user-agent": USER_AGENT,
    }
    param = {
        'contId': cont_id,
        'mrd': '0.5326134291313338',
    }

    try:
        status = requests.get(
            url=STATUS_URL, params=param, headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
        status.raise_for_status()
        src_url = status.json()['videoInfo']['videos']['srcUrl']
        true_url = build_real_video_url(src_url, cont_id)
        video = requests.get(
            url=true_url, headers=headers, timeout=REQUEST_TIMEOUT,
        )
        video.raise_for_status()
        file_name = './' + safe_file_name(name) + '.mp4'
        with open(file_name, 'wb') as fp:
            fp.write(video.content)
    except (requests.RequestException, KeyError, TypeError, ValueError,
            OSError) as err:
        print(name, '————下载失败————', err)
        return
    print(name + '.mp4' + '————下载好了————')


def fetch_video_infos():
    """Scrape the category page and return one info dict per listed video.

    Each dict has the keys ``'video_id'`` (href of the entry) and
    ``'video_name'`` (its title). Entries missing either value are skipped.
    """
    page = requests.get(
        url=LIST_URL, headers={"user-agent": USER_AGENT},
        timeout=REQUEST_TIMEOUT,
    )
    page.raise_for_status()
    tree = etree.HTML(page.text)

    video_imfos = []
    for li in tree.xpath('//*[@id="listvideoListUl"]/li'):
        # xpath() returns lists; only the first match of each is wanted.
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div/text()')
        if not hrefs or not titles:
            continue
        video_imfos.append({
            'video_id': hrefs[0],
            'video_name': titles[0].strip(),
        })
    return video_imfos


def main():
    """Fetch the video list and download every entry with four threads."""
    video_imfos = fetch_video_infos()
    pool = Pool(4)
    try:
        pool.map(spider, video_imfos)
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    main()
学知识拯救世界 发表于 2021-1-20 17:33
from lxml import etree
import requests
from multiprocessing.dummy import Pool
最后写的代码,可以实现功能了
页:
[1]