马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
from lxml import html
def open_url(url):
    """GET *url* pretending to be a desktop browser and return the response.

    Douban blocks the default ``python-requests`` User-Agent, so a
    browser-like header is mandatory for the request to succeed.
    """
    # If a proxy is needed, pass proxies={"http": "127.0.0.1:1080",
    # "https": "127.0.0.1:1080"} to requests.get below.
    browser_ua = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'
    )
    return requests.get(url, headers={'user-agent': browser_ua})
# Build the URL of every paginated listing page.
def getPage(pages=10, page_size=25):
    """Return the list of Douban Top250 listing-page URLs.

    Args:
        pages: number of pages to generate (default 10, which together
            with ``page_size=25`` covers all 250 movies).
        page_size: movies per page; Douban's ``start`` offset advances
            by this amount (the site uses 25).

    Returns:
        list[str]: one URL per page, in order.
    """
    # Fix: removed the leftover debug print(urls); generalized the
    # hard-coded 10/25 into defaulted parameters (backward-compatible).
    base = "https://movie.douban.com/top250?start={}&filter="
    return [base.format(i * page_size) for i in range(pages)]
# Scrape one listing page and append its movies to the output file.
def crawl(url):
    """Fetch *url*, extract every movie's title/info/rating/quote, and
    append the results to 豆瓣top250.txt.

    Args:
        url: a single Douban Top250 listing-page URL (from getPage()).
    """
    res = open_url(url)
    selector = html.fromstring(res.content)
    # Each movie entry lives in its own <div class="info"> element.
    movies = selector.xpath('//div[@class="info"]')
    result = []
    for each_movie in movies:
        # BUG FIX: the original used absolute XPath ('//div[...]') inside
        # the loop. An absolute path always matches from the DOCUMENT root,
        # not from each_movie, so every iteration returned the very first
        # movie on the page — this is why "nothing" (nothing distinct) was
        # scraped. './/' restricts the search to the current element.
        title = each_movie.xpath('.//div[@class="hd"]/a/span[@class="title"]/text()')
        movie_info = each_movie.xpath('.//div[@class="bd"]/p/text()')
        star = each_movie.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
        quote = each_movie.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
        # ROBUSTNESS: some entries have no quote; blind [0] indexing would
        # raise IndexError on an empty result list.
        line = (
            (title[0] if title else '')
            + (movie_info[0] if movie_info else '')
            + (star[0] if star else '')
            + (quote[0] if quote else '')
        )
        result.append(line + '\n')
    filename = '豆瓣top250.txt'
    # BUG FIX: mode 'w' truncated the file on every page, so only the last
    # page's 25 movies survived. Append mode accumulates all pages.
    # NOTE(review): re-running the whole script will keep appending;
    # delete the file first if a fresh run is wanted.
    with open(filename, 'a', encoding="utf-8") as f:
        f.writelines(result)
if __name__=="__main__":
for i in getPage():
crawl(i)
用了一点xpath,不知道为什么什么都爬取不到