|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
豆瓣爬取电影TOP250遇到匹配的问题,标签下内容数量不等,按照现有代码只可以匹配一个信息,麻烦大神解答下
需要爬取的内容如下:
<br>
<span class="pl">类型:</span>
<span property="v:genre">剧情</span>
/
<span property="v:genre">喜剧</span>
/
<span property="v:genre">爱情</span>
/
<span property="v:genre">战争</span>
<br>
爬取的代码如下:
- import requests
- from lxml import etree
- import time
- import csv
- import re
# Request headers: send a desktop-browser User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

# Create the CSV file and write the header row.
# The path is a raw string: it contains backslashes, and "\d" is not a valid
# escape sequence in an ordinary string literal.
fp = open(r'C:\doupoxiaoshuo\doubanshoudong.csv', 'wt', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('title', 'leixing', 'time_out', 'lanuage'))
def get_movie_url(url):
    """Fetch a listing page and scrape every movie detail page it links to.

    Args:
        url: Address of the listing page to download.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    response = etree.HTML(res.text)
    # Detail-page links are the hrefs of the <a> inside each <div class="hd">.
    movie_hrefs = response.xpath('//div[@class="hd"]/a/@href')
    for movie_href in movie_hrefs:
        get_movie_info(movie_href)
def get_movie_info(url):
    """Download one movie detail page and append its data to the CSV file.

    The row written is (title, genres, runtime, country/region). All genres
    found on the page are kept and joined with '/', e.g. '剧情/喜剧/爱情'.
    If any of the four fields cannot be found, the movie is skipped and a
    message is printed instead of a row being written.

    Args:
        url: Address of the movie detail page.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, headers=headers, timeout=10)
    response = etree.HTML(res.text)

    titles = response.xpath('//*[@id="content"]/h1/span[1]/text()')
    # One <span property="v:genre"> exists per genre, and their number varies
    # from movie to movie, so collect every match rather than only the first.
    genres = re.findall('<span property="v:genre">(.*?)</span>', res.text)
    runtimes = re.findall('<span property="v:runtime" content=".*?">(.*?)分钟</span>', res.text, re.S)
    countries = re.findall('<span class="pl">制片国家/地区:</span>(.*?)<br/>', res.text, re.S)

    if not (titles and genres and runtimes and countries):
        # Best-effort crawl: report the page that could not be parsed and move on.
        print('skipped {}: a required field was not found'.format(url))
        return

    title = titles[0]
    leixing = '/'.join(genres)
    time_out = runtimes[0]
    # The captured text sits between the label and <br/>; drop surrounding whitespace.
    lanuage = countries[0].strip()
    writer.writerow((title, leixing, time_out, lanuage))
if __name__ == '__main__':
    # NOTE(review): range(0, 50, 25) yields start=0 and start=25 only, i.e. two
    # listing pages; widen the range if more pages should be crawled.
    urls = ['https://movie.douban.com/top250?start={}'.format(str(i)) for i in range(0, 50, 25)]
    try:
        # Crawl the detail pages linked from each listing page in turn.
        for url in urls:
            get_movie_url(url)
            time.sleep(2)  # pause between listing pages
    finally:
        # Close the CSV file so buffered rows are flushed to disk even if the crawl fails.
        fp.close()
复制代码
# Select every genre <span> directly under the element with id="info" and
# keep the text of each one.
pl = response.xpath('//*[@id="info"]/span[@property="v:genre"]')
pl_list = [span.text for span in pl]
|
|