Practice: crawling the Douban Top 250 movie list

import requests
from lxml import etree
import csv
import os
import time

# Collect the URLs of all listing pages into the urls list
urls = []
def get_url():
    # Crawl at most 10 pages (25 movies per page)
    for i in range(10):
        url = 'https://movie.douban.com/top250?start=%d' % (i * 25)
        urls.append(url)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36'}
# Extract the target fields into the datas list and the poster images into the imgs list
datas = []
imgs = []
i = 1  # movie number; determined by the page you start from (1 if starting from page 1, 26 from page 2, 51 from page 3, and so on)
def get_data():
    global i
    for url in urls:
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        for index in range(1, 26):
            data = []
            title = html.xpath('//ol[@class="grid_view"]/li[%d]//span[1][@class="title"]/text()' % index)[0]  # movie title
            try:
                introduction = html.xpath('//ol[@class="grid_view"]/li[%d]//span[@class="inq"]/text()' % index)[0]  # one-line synopsis
            except IndexError:
                introduction = ''  # some movies have no synopsis; use an empty string instead
            rating = html.xpath('//ol[@class="grid_view"]/li[%d]//span[@class="rating_num"]/text()' % index)[0]  # rating
            num = html.xpath('//ol[@class="grid_view"]/li[%d]//div[@class="star"]/span[4]/text()' % index)[0]  # number of ratings
            img_link = html.xpath('//ol[@class="grid_view"]/li[%d]//img/@src' % index)[0]  # poster image URL
            img = requests.get(img_link).content
            data.extend(['No.%d' % i, title, introduction, rating, num])
            datas.append(data)
            imgs.append(img)
            print('\rFetched record %d' % i, end='')
            i += 1
        time.sleep(3)  # pause between pages to avoid hammering the server
# Save the poster images into the imgs folder under the current directory
def save_img():
    os.makedirs('imgs', exist_ok=True)  # make sure the target folder exists
    i = 1  # image number; determined by the page you start from (1 if starting from page 1, 26 from page 2, 51 from page 3, and so on)
    for img in imgs:
        with open('imgs/%d.jpg' % i, 'wb') as f:
            f.write(img)
        print('\rSaved image %d' % i, end='')
        i += 1
# Save the extracted data to a CSV file
def save_data():
    csv_headers = ['Rank', 'Title', 'Synopsis', 'Rating', 'Number of ratings']  # renamed to avoid shadowing the global request headers
    with open('doubanmovietop250.csv', 'a', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        f_csv.writerows(datas)
    print('\nData saved')
def main():
    get_url()
    get_data()
    save_data()
    save_img()

if __name__ == '__main__':
    main()
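
After a run finishes, a quick way to confirm that the CSV came out as expected is to read it back and print the first few rows. The stand-alone snippet below is only a sanity-check sketch, not part of the script itself; it assumes doubanmovietop250.csv sits in the current directory.

import csv

with open('doubanmovietop250.csv', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    for row_number, row in enumerate(reader):
        print(row)  # header row first, then ['No.1', <title>, <synopsis>, <rating>, <number of ratings>]
        if row_number >= 5:  # stop after the header plus five records
            break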