import requests
from lxml import html

def open_url(url):
    # Use a proxy if needed
    # proxies = {"http": "127.0.0.1:1080", "https": "127.0.0.1:1080"}
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    # res = requests.get(url, headers=headers, proxies=proxies)
    res = requests.get(url, headers=headers)
    return res

# Build the URL of each page
def getPage():
    urls = []
    for i in range(10):
        baseurl = "https://movie.douban.com/top250?start={}&filter=".format(i*25)
        urls.append(baseurl)
    print(urls)
    return urls

# Crawl the relevant content of each page
def crawl(url):
    res = open_url(url)
    selector = html.fromstring(res.content)
    Movies = selector.xpath('//div[@class="info"]')
    result = []
    for eachMovie in Movies:
        # Extract each field with XPath
        title = eachMovie.xpath('//div[@class="hd"]/a/span[@class="title"]/text()')[0]
        movieInfo = eachMovie.xpath('//div[@class="bd"]/p/text()')[0]
        star = eachMovie.xpath('//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
        quote = eachMovie.xpath('//div[@class="bd"]/p[@class="quote"]/span/text()')[0]
        result.append(title + movieInfo + star + quote + '\n')
    filename = '豆瓣top250.txt'
    with open(filename, 'w', encoding="utf-8") as f:
        for each in result:
            f.write(each)

if __name__ == "__main__":
    for i in getPage():
        crawl(i)
I used a bit of XPath, but for some reason it doesn't scrape anything.
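A guess at what may be going wrong (I can't see the actual response, so this is only a sketch): inside the for eachMovie in Movies: loop, every XPath expression starts with //, which searches from the document root rather than relative to eachMovie, so each iteration matches the same nodes; and because crawl() opens the output file with mode 'w', every page overwrites the previous one, leaving at most the last page's data. Below is a minimal rewrite of crawl() using relative .// paths and append mode, assuming open_url() and the imports stay as posted and the Douban page structure matches what the original XPath expects.

# Sketch of a corrected crawl(), reusing open_url() from the post above
def crawl(url):
    res = open_url(url)
    selector = html.fromstring(res.content)
    Movies = selector.xpath('//div[@class="info"]')
    result = []
    for eachMovie in Movies:
        # './/' scopes each query to the current <div class="info"> node
        title = eachMovie.xpath('.//div[@class="hd"]/a/span[@class="title"]/text()')[0]
        movieInfo = eachMovie.xpath('.//div[@class="bd"]/p/text()')[0].strip()
        star = eachMovie.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')[0]
        # Some entries have no quote line, so don't index into the list blindly
        quote = eachMovie.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
        quote = quote[0] if quote else ''
        result.append(' '.join([title, movieInfo, star, quote]) + '\n')
    # 'a' (append) so later pages don't overwrite earlier ones
    with open('豆瓣top250.txt', 'a', encoding='utf-8') as f:
        f.writelines(result)

If even this version returns an empty list, it would be worth printing res.status_code and the first few hundred characters of res.text: Douban sometimes answers automated requests with an anti-crawler page instead of the real list, in which case the div[@class="info"] nodes simply aren't in the HTML at all.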