马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
import lxml.html
import csv
#获取目标网页
doubanUrl = "https://movie.douban.com/top250?start={}&filter="
# 解析网页
# 获取网页数据
def getSource(url):
response = requests.get(url)
response.encoding = "utf-8"
return response.content
# 定义函数获取每一条电影的信息
def getEveryItem(source):
selector = lxml.html.document_fromstring(source)
# 获取每一条电影的信息,并放在一个集合里面
movieItemList = selector.xpath('//div[class="info"]') # 获取所有属性info的div标签的信息
# 定义一个列表 目的:展示信息
movieList = []
# 用for循环把电影信息展开
for eachMovie in movieItemList:
# 保存电影 名字 地址 评分----
# 把字典里的信息用列表展示[{ movieDict1},{movieDict2}]
movieDict = {}
title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()') # 标题
otherTitle = eachMovie.xpath('div[@class="hd"]/a/span[@class="other"]/text()') # 副标题
link = eachMovie.xpath('div[@class="hd"]/a/@href')[0] # url
star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class=""]/text()')[0]
quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')[0]
print(title)
print("信息展开成功")
#保存到字典中
movieDict['title'] = ''.join(title + otherTitle)
movieDict['url'] = link
movieDict['star'] = star
movieDict['quote'] = quote
print(movieDict)
movieList.append(movieDict)
print("保存字典成功")
return movieList
#下载目标网页数据
def writeData(movieList):
with open('./MovieDouban.csv', 'w', encoding = 'utf-8') as f:
writer = csv.DictWriter(f,fieldnames=['title','star','quote','url'])
writer.writeheader() # 写表头
# 传入一个列表,每个元素代表一行
for each in movieList:
writer.writerow(each)
if __name__ == '__main__':
movieList = []
# 分析 250部电影每页25个
for i in range(10):
# 处理定义好的url
pageLink = doubanUrl.format(i*25)
#print(pageLink)
# 调用获取网页信息函数
source = getSource(pageLink)
# 获取网页数据
movieList += getEveryItem(source)
print(movieList[:10])
writeData(movieList)
print('成功')
大体步骤及其相关函数:
1.获取目标网页:doubanUrl = "https://movie.douban.com/top250?start={}&filter="
2.解析网页:getSource(url):
2.1得到网页电影信息
2.2展开网页电影信息
2.3将展开的信息用字典保存
3.下载网页数据 :writeData(movieList)
测试结果为,未爬取到数据 |