|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import requests
- import lxml.html
- import csv
# URL template for the Douban Top 250 listing; "{}" receives the value of the
# "start" query parameter (the offset of the first movie on the page).
doubanUrl = (
    "https://movie.douban.com/top250"
    "?start={}&filter="
)
# Fetch the raw bytes of one page.
def getSource(url):
    """Download *url* and return the response body as bytes.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The undecoded response body (``bytes``).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # NOTE(review): a browser-like User-Agent is sent because Douban is
    # reported to reject the default python-requests one -- confirm.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
    }
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of handing an error page to the parser.
    response.raise_for_status()
    # ``content`` is the raw byte payload, so the original assignment to
    # ``response.encoding`` had no effect on the return value and is dropped.
    return response.content
def _firstOrEmpty(nodes):
    """Return the first XPath result, or '' when the query matched nothing."""
    return nodes[0] if nodes else ''


# Extract the information of every movie on one listing page.
def getEveryItem(source):
    """Parse one listing page and return a list of movie dicts.

    Args:
        source: HTML of the page (bytes or str), as returned by getSource.

    Returns:
        A list with one dict per ``<div class="info">`` element. Each dict
        has the keys 'title', 'url', 'star' and 'quote'; a field whose node
        is absent from the page is stored as an empty string.
    """
    # lxml refuses to parse an empty document, so treat it as "no movies".
    if not source or not source.strip():
        return []

    selector = lxml.html.document_fromstring(source)
    # Attribute tests need the "@" prefix. The original predicate
    # [class="info"] tested a child *element* named "class", matched no div,
    # and therefore produced an empty result for every page.
    movieItemList = selector.xpath('//div[@class="info"]')

    movieList = []
    for eachMovie in movieItemList:
        title = eachMovie.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()')  # main title(s)
        otherTitle = eachMovie.xpath(
            'div[@class="hd"]/a/span[@class="other"]/text()')  # alternate titles
        link = _firstOrEmpty(eachMovie.xpath('div[@class="hd"]/a/@href'))
        # NOTE(review): the rating span presumably carries class "rating_num"
        # on the live page -- confirm. The original empty-class test is kept
        # as an alternative so pages it matched still work.
        star = _firstOrEmpty(eachMovie.xpath(
            'div[@class="bd"]/div[@class="star"]'
            '/span[@class="rating_num" or @class=""]/text()'))
        # An entry without a quote paragraph must not abort the whole page.
        quote = _firstOrEmpty(eachMovie.xpath(
            'div[@class="bd"]/p[@class="quote"]/span/text()'))

        movieList.append({
            'title': ''.join(title + otherTitle),
            'url': link,
            'star': star,
            'quote': quote,
        })
    return movieList
# Save the scraped movies to a CSV file.
def writeData(movieList):
    """Write *movieList* to ``./MovieDouban.csv`` (UTF-8, with a header row).

    Args:
        movieList: Iterable of dicts with the keys 'title', 'star', 'quote'
            and 'url'; each dict becomes one CSV row.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open('./MovieDouban.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()
        writer.writerows(movieList)
if __name__ == '__main__':
    # The list has 250 movies at 25 per page, so the "start" offsets are
    # 0, 25, ..., 225.
    movieList = []
    for startOffset in range(0, 250, 25):
        pageLink = doubanUrl.format(startOffset)
        # Fetch the page, parse it, and collect its movies.
        pageMovies = getEveryItem(getSource(pageLink))
        movieList.extend(pageMovies)
    # Show a sample of the result before saving everything to disk.
    print(movieList[:10])
    writeData(movieList)
    print('成功')
复制代码
大体步骤及其相关函数:
1.获取目标网页:doubanUrl = "https://movie.douban.com/top250?start={}&filter="
2.获取并解析网页:getSource(url) 获取网页内容,getEveryItem(source) 解析电影信息:
2.1得到网页电影信息
2.2展开网页电影信息
2.3将展开的信息用字典保存
3.将数据保存为 CSV 文件:writeData(movieList)
测试结果为,未爬取到数据 |
|