import requests
from lxml import etree
import openpyxl
# Ranking page to scrape and the browser-like headers sent with the request.
url = 'https://www.shanghairanking.cn/rankings/bcur/202011'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Workbook the ranking table is written to.
OUTPUT_FILE = '大学排名.xlsx'

# Spreadsheet header row: rank, school, total score, type, province, level.
title = ['排名','学校','总分','类型','省','层次']

# One match per table row of the ranking table.
ROW_XPATH = '//*[@id="content-box"]/div[2]/table/tbody/tr'

# XPath of each exported cell, relative to its <tr>, listed in the same order
# as the columns of `title` that follow the rank.
COLUMN_XPATHS = (
    './td[2]/a/text()',  # school
    './td[5]/text()',    # total score
    './td[4]/text()',    # type
    './td[3]/text()',    # province
    './td[6]/text()',    # level
)


def cell_text(text_nodes):
    """Return the first non-blank text node, stripped, or '' if there is none.

    Args:
        text_nodes: iterable of strings, as returned by an XPath ``text()``
            query on a single table cell.

    Returns:
        The stripped text of the first node that is not only whitespace, or
        an empty string when every node is blank or the iterable is empty.
    """
    for node in text_nodes:
        stripped = node.strip()
        if stripped:
            return stripped
    return ''


def extract_records(html):
    """Build one spreadsheet row per table row found in the parsed page.

    Cells are read relative to their own ``<tr>``, so a cell with missing or
    extra text nodes cannot shift the other columns out of alignment (which
    is what happens when each column is fetched as a separate page-wide list).

    Args:
        html: parsed document exposing ``.xpath()`` (the result of
            ``etree.HTML``).

    Returns:
        A list of ``[rank, school, total score, type, province, level]``
        lists. Rank is the 1-based position among rows that have a school
        name; rows without a school name are skipped.
    """
    records = []
    for row in html.xpath(ROW_XPATH):
        cells = [cell_text(row.xpath(path)) for path in COLUMN_XPATHS]
        if not cells[0]:
            # No school link in this row: nothing meaningful to export.
            continue
        records.append([len(records) + 1, *cells])
    return records


def main():
    """Download the ranking page, print each row and save them to OUTPUT_FILE.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page into the sheet.
    r.raise_for_status()
    r.encoding = 'utf-8'
    # etree.HTML parses the response text into an HTML document tree.
    html = etree.HTML(r.text)

    wb = openpyxl.Workbook()
    ws = wb.active
    print(title)
    ws.append(title)
    for lst in extract_records(html):
        print(lst)
        ws.append(lst)
    wb.save(OUTPUT_FILE)


if __name__ == '__main__':
    main()