|
楼主 |
发表于 2021-1-20 17:33:23
|
显示全部楼层
from lxml import etree
import requests
from multiprocessing.dummy import Pool
import re
def build_true_url(video_url, image_url):
    """Rebuild the downloadable video URL from the status-API fields.

    The first run of nine or more digits in ``video_url`` (presumably a
    timestamp placeholder -- confirm against the live API) is replaced by
    ``cont-<id>``, where ``<id>`` is the digit run that follows ``cont-`` in
    ``image_url``.  Everything from the hyphen after that digit run onwards
    is kept unchanged.

    Args:
        video_url: The ``srcUrl`` value returned by the status endpoint.
        image_url: The ``video_image`` value returned by the status endpoint.

    Returns:
        The rewritten video URL as a string.

    Raises:
        ValueError: If either URL does not have the expected shape.
    """
    stamp = re.search(r'\d{9,}', video_url)
    cont = re.search(r'cont-(\d+)', image_url)
    if stamp is None or cont is None:
        raise ValueError('unexpected url format: %r / %r' % (video_url, image_url))
    # Search for the hyphen only after the digit run so that a hyphen earlier
    # in the URL (for example in the host name) cannot be picked up by mistake.
    dash = video_url.find('-', stamp.start())
    if dash == -1:
        raise ValueError('no "-" after the digit run in %r' % (video_url,))
    return video_url[:stamp.start()] + 'cont-' + cont.group(1) + video_url[dash:]


def spider(video_imfo):
    """Download one video and save it as ``./<video_name>.mp4``.

    Args:
        video_imfo: Dict with ``'video_id'`` (the link target scraped from the
            list page, e.g. ``'video_1234567'``) and ``'video_name'`` (title).

    Returns:
        None.  Progress and failures are reported with ``print``; a failure
        on one video never propagates, so the other pool workers keep going.
    """
    video_name = video_imfo['video_name']
    # Take everything after the last underscore instead of a fixed 7-character
    # slice, so ids of any length are handled.
    cont_id = video_imfo['video_id'].rsplit('_', 1)[-1]
    print(video_name,'————正在下载————')
    url = 'https://www.pearvideo.com/videoStatus.jsp'
    headers = {
        'Range': 'bytes=0-',
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    }
    param = {
        'contId': cont_id,
        'mrd': '0.5326134291313338',
    }
    try:
        status = requests.get(url=url, params=param, headers=headers, timeout=30)
        status.raise_for_status()
        info = status.json()['videoInfo']
        true_url = build_true_url(info['videos']['srcUrl'], info['video_image'])
        video = requests.get(url=true_url, headers=headers, timeout=60)
        # Without this check an HTTP error page would be saved as an .mp4.
        video.raise_for_status()
        # Titles may contain characters that are not allowed in file names.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', video_name)
        with open('./' + safe_name + '.mp4', 'wb') as fp:
            fp.write(video.content)
    except (requests.RequestException, KeyError, TypeError, ValueError, OSError) as exc:
        # Best effort: report the failure instead of silently swallowing it.
        print(video_name, '————下载失败————', exc)
        return
    print(video_name + '.mp4' + '————下载好了————')
def main():
    """Scrape the category page and download every listed video.

    Fetches ``https://www.pearvideo.com/category_8``, collects the link target
    and title of each entry under ``#listvideoListUl``, and hands the entries
    to ``spider`` through a pool of four workers (``multiprocessing.dummy``
    pools are thread based).
    """
    # Collect the id (link target) and the name of each video on the page.
    url = 'https://www.pearvideo.com/category_8'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    }
    page = requests.get(url=url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    page.raise_for_status()
    tree = etree.HTML(page.text)

    video_imfos = []
    for li in tree.xpath('//*[@id="listvideoListUl"]/li'):
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        # Skip list items that lack a link or a title rather than raising
        # IndexError on the first odd entry.
        if not hrefs or not titles:
            continue
        video_imfos.append({
            'video_id': hrefs[0],
            'video_name': titles[0],
        })

    pool = Pool(4)
    try:
        pool.map(spider, video_imfos)
    finally:
        # Always release the workers, even if map() raised.
        pool.close()
        pool.join()


if __name__ == "__main__":
    main()
|
|