|
发表于 2020-12-22 13:19:11
|
显示全部楼层
本楼为最佳答案
本帖最后由 suchocolate 于 2020-12-22 13:26 编辑
import os
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
def main():
    """Download free resume templates from sc.chinaz.com into ``docs/``.

    Prompts on stdin for the number of listing pages to crawl, starting at
    ``https://sc.chinaz.com/jianli/free.html`` and following each page's
    ``nextpage`` link.  Every detail-page link collected from those listings
    is then visited, the first download link on it is fetched, and the file
    is written under the last path segment of its URL.

    Side effects:
        Creates the ``docs`` directory if it is missing, changes the process
        working directory to it, and prints one progress line per saved file
        (plus one line for every detail page that had to be skipped).

    Raises:
        ValueError: the page count typed by the user is not an integer.
        requests.RequestException: a listing page could not be fetched.
    """
    folder = 'docs'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)
    # Files below are opened by bare name, so they land in this directory.
    os.chdir(folder)

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
    # requests never times out by default; bound every call so a stalled
    # connection cannot hang the whole run.
    timeout = 30

    detail_pages = []
    listing_url = "https://sc.chinaz.com/jianli/free.html"
    num = int(input('请输入查询的页数:'))
    for _ in range(num):
        resp = requests.get(listing_url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        tree = etree.HTML(resp.text)
        # urljoin resolves protocol-relative ("//host/..."), relative and
        # absolute hrefs alike against the page they were found on.
        detail_pages.extend(
            urljoin(listing_url, href)
            for href in tree.xpath('//div[@id="main"]/div/div/a/@href')
        )
        next_href = tree.xpath('//a[@class="nextpage"]/@href')
        if not next_href:
            # Last listing page: there is nothing further to follow.
            break
        listing_url = urljoin(listing_url, next_href[0])

    file_counter = 0
    for page_url in detail_pages:
        try:
            resp = requests.get(page_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            links = etree.HTML(resp.text).xpath(
                '//ul[@class="clearfix"]/li[1]/a/@href'
            )
            if not links:
                print(f'跳过 {page_url}: 未找到下载链接')
                continue
            doc_url = urljoin(page_url, links[0])
            # Take the name from the URL path only, so a query string or
            # fragment never ends up in the file name.
            doc_name = urlsplit(doc_url).path.rsplit('/', 1)[-1]
            if not doc_name:
                print(f'跳过 {page_url}: 无法确定文件名')
                continue
            resp = requests.get(doc_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException as exc:
            # One bad template should not abort the remaining downloads.
            print(f'跳过 {page_url}: {exc}')
            continue

        with open(doc_name, 'wb') as f:
            f.write(resp.content)
        file_counter += 1
        print(f'已下载{doc_name}, 共下载{file_counter}个简历。')
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
复制代码 |
|