Posted on 2019-10-6 15:26:04
A practice exercise: scraping the Douban Top 250.

import os
import csv
import time

import requests
from lxml import etree

# Collect every page URL into the urls list.
urls = []

def get_url():
    # The Top 250 list spans at most 10 pages, 25 movies per page.
    for i in range(10):
        url = 'https://movie.douban.com/top250?start=%d' % (i * 25)
        urls.append(url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'}

# Extracted fields go into datas, downloaded poster images into imgs.
datas = []
imgs = []
i = 1  # Movie serial number; depends on the first page crawled (1 when starting from page 1, 26 from page 2, 51 from page 3, and so on).

def get_data():
    global i
    for url in urls:
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        for index in range(1, 26):
            data = []
            title = html.xpath('//ol[@class="grid_view"]/li[%d]//span[1][@class="title"]/text()' % index)[0]  # movie title
            try:
                introduction = html.xpath('//ol[@class="grid_view"]/li[%d]//span[@class="inq"]/text()' % index)[0]  # one-line quote; fall back to an empty string when missing
            except IndexError:
                introduction = ''
            rating = html.xpath('//ol[@class="grid_view"]/li[%d]//span[@class="rating_num"]/text()' % index)[0]  # rating
            num = html.xpath('//ol[@class="grid_view"]/li[%d]//div[@class="star"]/span[4]/text()' % index)[0]  # number of ratings
            img_link = html.xpath('//ol[@class="grid_view"]/li[%d]//img/@src' % index)[0]  # poster image URL
            img = requests.get(img_link).content
            data.extend(['No.%d' % i, title, introduction, rating, num])
            datas.append(data)
            imgs.append(img)
            print('\rFetched entry %d' % i, end='')
            i += 1
        time.sleep(3)  # pause between pages

# Save the posters into an imgs folder under the current directory.
def save_img():
    os.makedirs('imgs', exist_ok=True)  # make sure the folder exists
    i = 1  # Image serial number; follows the same numbering rule as the movie serial number above.
    for img in imgs:
        with open('imgs/%d.jpg' % i, 'wb') as f:
            f.write(img)
        print('\rSaved image %d' % i, end='')
        i += 1

# Save the extracted data to a CSV file.
def save_data():
    csv_headers = ['Rank', 'Title', 'Quote', 'Rating', 'Rating count']
    with open('doubanmovietop250.csv', 'w', encoding='utf-8', newline='') as f:  # 'w' so a rerun does not append a duplicate header row
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        f_csv.writerows(datas)
    print('\nData saved')

def main():
    get_url()
    get_data()
    save_data()
    save_img()

if __name__ == '__main__':
    main()
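A quick way to spot-check the result after a run (a minimal sketch; it assumes only the doubanmovietop250.csv file that save_data() writes in the current directory):

import csv

# Print the header row plus the first three data rows of the scraped CSV.
with open('doubanmovietop250.csv', encoding='utf-8', newline='') as f:
    for row_number, row in enumerate(csv.reader(f)):
        print(row)
        if row_number >= 3:  # header plus three movies is enough for a quick check
            break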