| 
 | 
 
 
发表于 2020-12-22 13:19:11
|
显示全部楼层
   本楼为最佳答案    
 
 
 本帖最后由 suchocolate 于 2020-12-22 13:26 编辑  
 
- import requests
 
 - from lxml import etree
 
 - import os
 
  
 
def main():
    """Download free resume templates from sc.chinaz.com into ``docs``.

    Workflow:
      1. Make sure the ``docs`` output folder exists.
      2. Ask on stdin how many listing pages to crawl.
      3. Walk the listing pages (following the "nextpage" link) and collect
         the detail-page links found on each one.
      4. For every detail page, take the first download link, fetch it and
         save it under ``docs`` using the last URL path segment as file name.

    Crawling stops early when a listing page has no "nextpage" link. A detail
    page that cannot be fetched, or that has no download link, is reported
    and skipped instead of aborting the whole run.

    Raises:
        ValueError: if the page count typed by the user is not an integer.
        requests.RequestException: if a listing page cannot be fetched.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urljoin

    folder = 'docs'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
    # Without a timeout a stalled connection would hang the script forever.
    timeout = 30

    num = int(input('请输入查询的页数:'))

    # One session reuses the underlying connection for all requests.
    with requests.Session() as session:
        session.headers.update(headers)

        # ---- Phase 1: collect detail-page URLs from the listing pages ----
        detail_urls = []
        url = "https://sc.chinaz.com/jianli/free.html"
        for _ in range(num):
            r = session.get(url, timeout=timeout)
            r.raise_for_status()
            html = etree.HTML(r.text)
            # urljoin resolves protocol-relative ("//host/..."), relative and
            # absolute hrefs alike against the page they were found on.
            detail_urls.extend(
                urljoin(url, href)
                for href in html.xpath('//div[@id="main"]/div/div/a/@href')
            )
            next_links = html.xpath('//a[@class="nextpage"]/@href')
            if not next_links:
                # Last listing page reached: nothing more to crawl.
                break
            url = urljoin(url, next_links[0])

        # ---- Phase 2: download the document behind each detail page ----
        file_counter = 0
        for detail_url in detail_urls:
            try:
                r = session.get(detail_url, timeout=timeout)
                r.raise_for_status()
                links = etree.HTML(r.text).xpath(
                    '//ul[@class="clearfix"]/li[1]/a/@href'
                )
                if not links:
                    print(f'未找到下载链接, 已跳过: {detail_url}')
                    continue
                doc = urljoin(detail_url, links[0])
                doc_name = doc.split('/')[-1]
                r = session.get(doc, timeout=timeout)
                r.raise_for_status()
            except requests.RequestException as exc:
                # One bad page must not abort the remaining downloads.
                print(f'下载失败, 已跳过: {detail_url} ({exc})')
                continue

            # Write into the folder by path instead of chdir-ing into it, so
            # the process working directory is left untouched.
            with open(os.path.join(folder, doc_name), 'wb') as f:
                f.write(r.content)
            file_counter += 1
            print(f'已下载{doc_name}, 共下载{file_counter}个简历。')
 
  
 
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
 
  复制代码 |   
 
 
 
 |