| 
 | 
 
 
 楼主 |
发表于 2021-9-10 14:17:54
|
显示全部楼层
 
 
 
- import re
 
 - import urllib.request,urllib.error
 
 - import bs4
 
  
# Base URL of a Luogu problem page; the numeric problem id is appended to it.
baseUrl = "https://www.luogu.com.cn/problem/P"

# Directory the Markdown files are written into. It must end with a path
# separator because saveData() concatenates it directly with the file name.
# The final backslash is doubled on purpose: a single backslash would escape
# the closing quote and leave the string literal unterminated.
savePath = "C:\\Users\\Sagiri\\Desktop\\p\\"

# Last problem number to crawl (inclusive).
maxn = 1005
 
  
def main():
    """Crawl problems P1000 through P<maxn> and save each one as Markdown.

    For every problem number the page is downloaded with getHTML(), converted
    with getMD() and written to ``P<number>.md`` with saveData().

    The crawl is best-effort: a failure on one problem is reported on stdout
    and the loop moves on to the next problem instead of aborting.
    """
    print("计划爬取到P{}".format(maxn))
    for i in range(1000, maxn + 1):
        try:
            print("正在爬取P{}".format(i))
            html = getHTML(baseUrl + str(i))
            problemMD = getMD(html)
            saveData(problemMD, "P" + str(i) + ".md")
        except Exception as err:
            # Keep going, but say which problem failed and why rather than
            # silently dropping it.
            print("爬取P{}失败: {!r}".format(i, err))
    print("爬取完毕")
 
  
def getHTML(url, timeout=30):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled connection cannot hang the crawl.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A browser-like user-agent header is sent with every request.
    headers = {
        "user-agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 85.0.4183.121 Safari / 537.36"
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the response even if read/decode raises.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8')
 
  
def getMD(html):
    """Extract the first ``<article>`` of *html* and return it as Markdown text.

    Bare ``<h1>``, ``<h2>`` and ``<h3>`` opening tags (without attributes) are
    replaced by the heading prefixes ``# ``, ``## `` and ``#### ``; every
    remaining tag is then removed, leaving only the text content.

    Indexing ``[0]`` fails when the page contains no ``<article>`` element
    (presumably with ``IndexError`` — confirm against bs4's return type).
    """
    soup = bs4.BeautifulSoup(html, "html.parser")
    article = soup.select("article")[0]
    text = str(article)

    # Heading tags become Markdown heading markers before tags are stripped.
    heading_prefixes = (
        ("<h1>", "# "),
        ("<h2>", "## "),
        ("<h3>", "#### "),
    )
    for tag, prefix in heading_prefixes:
        text = re.sub(tag, prefix, text)

    # Drop every other opening or closing tag.
    return re.sub("</?[a-zA-Z]+[^<>]*>", "", text)
 
  
def saveData(data, filename):
    """Write *data* to the file ``savePath + filename`` as UTF-8 text.

    Args:
        data: The text to write. A plain string is written in one call; any
            other iterable is written item by item via ``writelines``.
        filename: File name appended directly to ``savePath``.

    An existing file with the same name is overwritten.
    """
    cfilename = savePath + filename
    # "with" guarantees the handle is closed even if a write raises.
    with open(cfilename, "w", encoding="utf-8") as file:
        if isinstance(data, str):
            # Avoid iterating a string character by character.
            file.write(data)
        else:
            for d in data:
                file.writelines(d)
 
  
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
 
  
 
  复制代码 
 
这段代码用来爬取洛谷的题库。因为洛谷的题面是用 Markdown 语法编辑的,变量全是 $...$ 形式,必须加上反引号写成 `$...$`,才能在编辑器里正常读取。 |   
 
 
 
 |