有大佬看看 哪里出错了吗? 我
import requestsfrom lxml import etree
from multiprocessing.dummy import Pool
import random
import os
if not os.path.exists('./video'):
os.mkdir('./video')
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50'
}
#对下述url发出请求 解析出梨视频的详情页url 与 视频的名称
url = 'https://www.pearvideo.com/'
page_text = requests.get(url=url , headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="vervideo-tlist-bd recommend-btbg clearfix"]/ul/li')
urls = []
for li in li_list:
detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')
new_name = li.xpath('./div/a/div/div[@class="vervideo-name"]/text()') + 'mp4.'
#对详情页的url发起请求
detail_page_text = requests.get(url=detail_url, headers=headers, ).text
new_tree = etree.HTML(detail_page_text)
name = new_tree.xpath('//*[@id="detailsbd"]/div/div/div/div/h1/text()')
id = str(li.xpath('./div/a/@href')).split('_')
ajax_url = 'https://www.pearvideo.com/videoStatus.jsp?'
parames ={
'contId': id,
'mrd': str(random.random())#随机数这样子处理
}
ajax_headers = {
"User-Agent":'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 Edg/88.0.705.63',
'Referer': 'https://www.pearvideo.com/video_' + id
}
new_page_json = requests.get(url=ajax_url, params=parames, headers=ajax_headers).json()
video_url =new_page_json["videoInfo"]['videos']["srcUrl"]
#print(video_url)
# 此处视频地址做了加密即ajax中得到的地址需要加上cont-,并且修改一段数字为id才是真地址
# 真地址:"https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4"
# 伪地址:"https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4"
video_true_data = ''
s_list = str(video_url).split('/')
for i in range(0, len(s_list)):
if i < len(s_list)-1:
video_true_data += s_list + '/'
else:
ss_list = s_list.split('-')
for j in range(0, len(ss_list)):
if j == 0:
video_true_data +='cont-' +id + '-'
elif j == len(ss_list)-1:
video_true_data += ss_list
else:
video_true_data += ss_list + '-'
#print(video_true_data)
dict = {
'name': name,
'video': video_true_data
}
urls.append(dict)
print(dict)
def get_video_data(urls):
url_ = urls['video']
print(urls['name'] ,'正在下载!......')
file_path = './video/' + urls['name'] + '.mp4'
video_data_shiping = requests.get(url=url_ , headers=headers).content
with open(file_path, 'wb') as fp:
fp.write(video_data_shiping)
print(urls['name'], '下载成功!!!!')
pool = Pool(4)
pool.map(get_video_data, urls)
pool.close()
pool.join()
#爬取梨视频 热门视频
我感觉视频类网站,反爬措施都挺严厉的
网上教程未必能与时俱进 https://blog.csdn.net/weixin_51211600/article/details/109289024
去看看吧,也是下载梨视频的,应该会有帮助 ./是什么路径 for li in li_list:
detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')
new_name = li.xpath('./div/a/div/div[@class="vervideo-name"]/text()') + '.mp4' #保存的文件名错了,改为.mp4
#对详情页的url发起请求 v.ki 发表于 2021-2-7 09:24
./是什么路径
当前文件所在目录,一般Windows PowerShell支持这么写,但更推荐用反斜杠.\,cmd会报错,Python不知道可不可以 v.ki 发表于 2021-2-7 09:24
./是什么路径
一个点。是指代当前路径 mp4的命名问题,有非法字符。
页:
[1]