|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
跟着B站上的教程写的,请大佬们指点一下,看看有什么不足之处。
from urllib.parse import urljoin

import requests
from lxml import etree
# Scheme and host of the site being crawled.
BASE_DOMAIN = "https://dytt8.net"

# HTTP headers sent with every request; only a desktop-browser User-Agent
# string is set.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36',
}
def get_detail_urls(url):
    """Return the absolute detail-page URLs linked from one list page.

    Args:
        url: Address of the list page to download.

    Returns:
        A list of absolute URLs, one for every ``<a href>`` found inside a
        ``<table class="tbspan">`` element, in document order.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=10)
    # The page is decoded as GBK; undecodable bytes are dropped rather than
    # aborting the whole page.
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    # urljoin handles root-relative, relative and already-absolute hrefs,
    # whereas plain string concatenation only works for root-relative ones.
    return [urljoin(BASE_DOMAIN, href) for href in hrefs]
def parse_detail_page(url):
    """Download one movie detail page and extract its metadata.

    Args:
        url: Absolute address of the detail page.

    Returns:
        A dict with these keys:

        * ``'title'`` - first matching title text, or ``None`` if the page
          has no title element.
        * ``'imgs'`` - list of image URLs inside the ``Zoom`` div (empty when
          the div is missing).
        * ``'year'``, ``'country'``, ``'category'``, ``'douban rating'``,
          ``'duration'``, ``'director'``, ``'label'`` - strings, present only
          when the corresponding "◎" label occurs on the page.
        * ``'actors'`` - list of names, present only when the cast label
          occurs.
        * ``'profile'`` - synopsis text, present only when the synopsis label
          occurs.
        * ``'download_url'`` - list of hrefs from the download table cell.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=10)
    text = response.content.decode('gbk', 'ignore')
    html = etree.HTML(text)

    movie = {}

    # Indexing [0] on an empty result would abort the crawl on a single
    # malformed page, so a missing title is recorded as None instead.
    titles = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
    movie['title'] = titles[0] if titles else None

    zoom_nodes = html.xpath("//div[@id='Zoom']")
    if zoom_nodes:
        zoom = zoom_nodes[0]
        movie['imgs'] = zoom.xpath(".//img/@src")
        movie.update(_parse_zoom_texts(zoom.xpath(".//text()")))
    else:
        movie['imgs'] = []

    movie['download_url'] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    return movie


# Label that opens a single-line field -> key used in the movie dict.
# NOTE(review): the labels are copied verbatim from the original post; the
# live site may pad them with full-width spaces - confirm against a real page.
_SINGLE_LINE_FIELDS = (
    ("◎年 代", 'year'),
    ("◎产 地", 'country'),
    ("◎类 别", 'category'),
    ("◎豆瓣评分", 'douban rating'),
    ("◎片 长", 'duration'),
    ("◎导 演", 'director'),
    ("◎标 签", 'label'),
)
_ACTORS_LABEL = "◎主 演"
_PROFILE_LABEL = "◎简 介"
_FIELD_MARKER = "◎"
_DOWNLOAD_MARKER = "【下载地址】"
# A multi-line section ends at the next field label or at the download header.
_SECTION_END = (_FIELD_MARKER, _DOWNLOAD_MARKER)


def _collect_section(lines, start):
    """Return the stripped, non-blank lines from ``lines[start:]`` up to the next section marker."""
    collected = []
    for raw in lines[start:]:
        line = raw.strip()
        if line.startswith(_SECTION_END):
            break
        if line:
            collected.append(line)
    return collected


def _parse_zoom_texts(infos):
    """Convert the text nodes of the Zoom div into a dict of movie fields."""
    fields = {}
    for index, raw in enumerate(infos):
        info = raw.strip()
        if not info.startswith(_FIELD_MARKER):
            continue

        if info.startswith(_ACTORS_LABEL):
            # The first actor shares the line with the label; the remaining
            # ones follow on separate lines.
            first = info[len(_ACTORS_LABEL):].strip()
            actors = [first] if first else []
            actors.extend(_collect_section(infos, index + 1))
            fields['actors'] = actors
        elif info.startswith(_PROFILE_LABEL):
            # Keep every synopsis line, not just the last one seen before
            # the download header.
            inline = info[len(_PROFILE_LABEL):].strip()
            parts = [inline] if inline else []
            parts.extend(_collect_section(infos, index + 1))
            fields['profile'] = "\n".join(parts)
        else:
            for label, key in _SINGLE_LINE_FIELDS:
                if info.startswith(label):
                    # Strip only the leading label, not later occurrences.
                    fields[key] = info[len(label):].strip()
                    break
    return fields
def spider(start_page=1, end_page=7):
    """Crawl a range of list pages and collect every movie found on them.

    Args:
        start_page: Number of the first list page to fetch (inclusive).
        end_page: Number of the last list page to fetch (inclusive). The
            defaults reproduce the original fixed range of pages 1 to 7.

    Returns:
        The list of movie dicts returned by ``parse_detail_page``, in crawl
        order. The same list is also printed once the crawl has finished.
    """
    base_url = BASE_DOMAIN + '/html/gndy/dyzz/list_23_{}.html'
    movies = []
    # The outer loop walks the list pages.
    for page in range(start_page, end_page + 1):
        list_url = base_url.format(page)
        # The inner loop visits every movie detail URL found on that page.
        for detail_url in get_detail_urls(list_url):
            movies.append(parse_detail_page(detail_url))
    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this print ran once or per movie; printing once is assumed.
    print(movies)
    return movies
if __name__ == '__main__':
    # Start the crawl only when the file is run as a script, not on import.
    spider()
复制代码 |
评分
-
查看全部评分
|