|

楼主 |
发表于 2021-9-10 14:17:54
|
显示全部楼层
- import re
- import urllib.request,urllib.error
- import bs4
# Prefix of a problem page URL; the numeric problem id is appended to it.
baseUrl = "https://www.luogu.com.cn/problem/P"
# Output directory. File names are appended by plain string concatenation,
# so the value must end with a path separator. The final backslash has to be
# escaped ("\\"): a lone "\" before the closing quote would escape the quote
# and leave the string literal unterminated.
savePath = "C:\\Users\\Sagiri\\Desktop\\p\\"
# Highest problem number to crawl (inclusive).
maxn = 1005
def main():
    """Crawl problems P1000 through P``maxn`` and save each one as a ``.md`` file.

    For every problem number the page is downloaded with ``getHTML``,
    converted with ``getMD`` and written with ``saveData`` under the name
    ``P<number>.md``. The crawl is best-effort: a failure on one problem is
    reported and the loop moves on to the next problem.
    """
    print("计划爬取到P{}".format(maxn))
    for i in range(1000, maxn + 1):
        try:
            print("正在爬取P{}".format(i))
            html = getHTML(baseUrl + str(i))
            problemMD = getMD(html)
            saveData(problemMD, "P" + str(i) + ".md")
        except Exception as exc:
            # Keep going, but say what went wrong instead of silently
            # skipping the problem.
            print("爬取P{}失败: {!r}".format(i, exc))
    print("爬取完毕")
def getHTML(url):
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with a browser-like ``user-agent`` header
    (presumably so the site does not reject urllib's default agent --
    not verifiable from this file).

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed
            (``urllib.error.HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        "user-agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 85.0.4183.121 Safari / 537.36"
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response (and its connection) even when
    # reading or decoding raises.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")
def getMD(html):
    """Turn the first ``<article>`` element of *html* into Markdown-like text.

    Opening heading tags are replaced by ``#`` prefixes, after which every
    remaining HTML tag is stripped from the text.

    Args:
        html: Source of the page as a string.

    Returns:
        The article's serialized markup with headings rewritten and all other
        tags removed.

    Raises:
        IndexError: If *html* contains no ``<article>`` element.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")
    article = soup.select("article")[0]
    text = str(article)

    # Applied in order, before the generic tag-stripping pass below.
    # NOTE(review): <h3> maps to four '#', not three -- confirm this is intended.
    heading_prefixes = (
        ("<h1>", "# "),
        ("<h2>", "## "),
        ("<h3>", "#### "),
    )
    for open_tag, prefix in heading_prefixes:
        text = re.sub(open_tag, prefix, text)

    # Drop every remaining opening/closing tag, attributes included.
    return re.sub("</?[a-zA-Z]+[^<>]*>", "", text)
def saveData(data, filename):
    """Write *data* to the file ``savePath + filename`` as UTF-8 text.

    An existing file with the same name is overwritten.

    Args:
        data: The text to write. A ``str`` is written in one call; any other
            iterable is written item by item, as before.
        filename: Name of the target file; it is appended to ``savePath`` by
            plain string concatenation.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    cfilename = savePath + filename
    # ``with`` guarantees the handle is closed even if a write fails.
    with open(cfilename, "w", encoding="utf-8") as out:
        if isinstance(data, str):
            # One write instead of one call per character.
            out.write(data)
        else:
            for d in data:
                out.writelines(d)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
复制代码
爬取洛谷的题库。因为洛谷的题面是用 Markdown 语法编辑的,变量全是 $...$ 形式,必须在两侧加上反引号写成 `$...$`,才能在编辑器里正常显示 |
|