Python: scraping 500 TV series from Tencent Video
I've been learning Python for about a month now, so any pointers and corrections are much appreciated.
The implementation is as follows.
Source code:
import requests
import os
import time
from bs4 import BeautifulSoup

def open_url(url):
    # A mobile User-Agent so Tencent serves the lighter mobile pages.
    header = {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'}
    response = requests.get(url, headers=header)
    return response
def find_videos(url):
    time.sleep(2)  # small pause so we don't hammer the listing pages
    html = open_url(url).text
    soup = BeautifulSoup(html, 'lxml')
    videos_url = []
    for each in soup.select('.list_item'):
        # Grab each show's link from the listing page.
        video1_url = each.select('a')[0]['href']
        # Rewrite the desktop cover URL into its mobile equivalent.
        a = video1_url.replace('https://v.qq.com/x/cover/', '')
        b = a.replace('.html', '')
        video_url = 'https://m.v.qq.com/play.html?cid=' + b
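        # For example, a (hypothetical) desktop URL
        #   https://v.qq.com/x/cover/abc123xyz.html
        # becomes the mobile player URL
        #   https://m.v.qq.com/play.html?cid=abc123xyz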
        video_update = each.select('span')[0].text  # update status (collected but unused)
        videos_url.append(video_url)
    for each_url in videos_url:
        videos_each = ''
        video_html = open_url(each_url).content
        video_soup = BeautifulSoup(video_html, 'lxml')
        # Show title
        videos_name = video_soup.select('div > div > div.video_tit.U_color_a')
        # Cover image: prefer the poster <img>, fall back to an inline
        # background-image style when the tag is missing.
        videos_img = ''
        try:
            if video_soup.select('div > div > div.player_poster > img') == []:
                videos_img = video_soup.select('div')[0].get('style')
                videos_img = videos_img.replace('background-image:url(', '')
                videos_img = videos_img.replace(')', '')
            else:
                videos_img = video_soup.select('div > div > div.player_poster > img')[0].get('src')
                videos_img = 'https:' + videos_img
        except IndexError:
            pass  # no cover found on this page; leave videos_img empty
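        # The style-attribute fallback above assumes inline CSS of roughly
        # this shape (URL hypothetical):
        #   background-image:url(//cdn.example/cover.jpg)
        # so stripping the wrapper leaves just the bare image URL.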
        # Rating / genre line
        videos_score = video_soup.select('div > div > div.video_types.U_color_b')
        # Cast: the class name differs between page layouts, so try both.
        if video_soup.select('div > div > div.video_starring.U_color_b') == []:
            videos_actor = video_soup.select('div > div > div.video_intro._desc > div:nth-of-type(1)')
        else:
            videos_actor = video_soup.select('div > div > div.video_starring.U_color_b')
        # Director (collected but not written to the output file)
        videos_director = video_soup.select('div > div > div.video_intro._desc > div:nth-of-type(2)')
        # Synopsis: again two possible layouts.
        if video_soup.select('div > div > div.video_detail > div.detail_txt.U_color_b._desc') == []:
            videos_content = video_soup.select('div > div > div.video_intro._desc > div:nth-of-type(3)')
        else:
            videos_content = video_soup.select('div > div > div.video_detail > div.detail_txt.U_color_b._desc')
        # Collect the URL of every episode.
        for each_video in video_soup.select('a'):
            # Skip trailers; their link text contains '预' (from 预告).
            if '预' in each_video.get_text():
                continue
            c = each_video.get_text()
            c = c.replace('会员', '')  # strip the VIP badge
            c = c.replace('新', '')    # strip the "new" badge
            d = each_video.get('href')
            videos_each += 'Episode %s: %s\n' % (c, 'https://m.v.qq.com' + d)
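            # e.g. a (hypothetical) link with text '03会员' and href
            # '/x/cover/abc123xyz.html' is recorded as:
            #   Episode 03: https://m.v.qq.com/x/cover/abc123xyz.html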
        for video_name, video_actor, video_score, video_content in zip(videos_name, videos_actor, videos_score, videos_content):
            # Append one plain-text record per show; UTF-8 keeps Chinese
            # titles from raising UnicodeEncodeError on write.
            with open('腾讯视频', 'a', encoding='utf-8') as f:
                f.write('Title: %s\n' % video_name.get_text())
                f.write('Image: %s\n' % videos_img)
                f.write(video_actor.get_text() + '\n')
                f.write('Rating: %s\n' % video_score.get_text())
                f.write('Synopsis: %s\n' % video_content.get_text())
                f.write('Episodes:\n%s\n\n' % videos_each)
def spider_tx():
    urls = ['http://v.qq.com/x/list/tv?iarea=-1&offset={}'.format(i) for i in range(0, 90, 30)]
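    # range(0, 90, 30) requests offsets 0, 30 and 60, i.e. three listing
    # pages of 30 shows each; to get near the 500 shows in the title,
    # widen it, e.g. range(0, 510, 30).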
    count = 1
    for url in urls:
        print('Crawling listing page %d' % count)
        find_videos(url)
        count += 1

if __name__ == '__main__':
    spider_tx()
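For reference, each record appended to the 腾讯视频 output file comes out roughly like this (all values are placeholders, not real scraped data):

Title: <show title>
Image: <cover image URL>
<cast line>
Rating: <genre / rating text>
Synopsis: <description>
Episodes:
Episode 1: https://m.v.qq.com/<episode page>
Episode 2: https://m.v.qq.com/<episode page>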
aiesky asked (2018-7-9 16:00): Can the scraped addresses be used to download the videos?

Reply: Not directly, no. But if you crawl one level deeper from those pages, it should be doable.
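A minimal sketch of one way to attempt that deeper crawl, reusing open_url from the script above. It fetches an episode page and scans the raw HTML for HLS manifest URLs (.m3u8). Both the regex and the assumption that stream URLs sit in the page source are guesses on my part, not a documented Tencent API; the mobile player most likely resolves streams through JavaScript, so this may well come back empty.

import re

def find_streams(episode_url):
    # Reuses open_url() defined above. The .m3u8 pattern is an assumption
    # about what might be embedded in the page source, not a confirmed API.
    html = open_url(episode_url).text
    # If the player resolves streams via JavaScript instead, this is empty.
    return re.findall(r'https?://[^\s"\']+\.m3u8[^\s"\']*', html)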