# Forum post header: last edited by suchocolate on 2020-12-22 13:26.
import requests
from lxml import etree
import os
# Seconds to wait for any single HTTP request; without a timeout a stalled
# server would block the script forever.
_REQUEST_TIMEOUT = 15


def _fetch_tree(session, url):
    """Download *url* with *session* and return it parsed as an lxml HTML tree.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so an error page is never
            scraped as if it were real content.
        requests.RequestException: on connection problems or timeout.
    """
    response = session.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return etree.HTML(response.text)


def main():
    """Download resume templates from sc.chinaz.com into the ``docs`` folder.

    Prompts on stdin for the number of listing pages to scan, collects the
    detail-page links found on each listing page, then visits every detail
    page and saves the file behind its first download link.

    Files are written to ``docs/<name>`` relative to the current working
    directory; the working directory itself is left unchanged, so the
    function can be called repeatedly without nesting ``docs/docs``.

    Raises:
        ValueError: if the page count typed by the user is not an integer.
        requests.RequestException: if any HTTP request fails or times out.
    """
    # Local import keeps this block self-contained (urljoin is stdlib).
    from urllib.parse import urljoin

    folder = 'docs'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(folder, exist_ok=True)

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
    url = "https://sc.chinaz.com/jianli/free.html"
    num = int(input('请输入查询的页数:'))

    # One session reuses the underlying connection across all requests.
    with requests.Session() as session:
        session.headers.update(headers)

        # Phase 1: walk the listing pages and gather detail-page URLs.
        result = []
        for page_index in range(num):
            html = _fetch_tree(session, url)
            hrefs = html.xpath('//div[@id="main"]/div/div/a/@href')
            # urljoin resolves protocol-relative ("//host/..."), relative and
            # absolute hrefs alike against the page they were found on.
            result.extend(urljoin(url, href) for href in hrefs)

            if page_index == num - 1:
                # Enough pages scanned; no need to look for a next link.
                break
            next_links = html.xpath('//a[@class="nextpage"]/@href')
            if not next_links:
                # Last listing page reached before the requested count.
                break
            url = urljoin(url, next_links[0])

        # Phase 2: open each detail page and download its first file link.
        file_counter = 0
        for detail_url in result:
            html = _fetch_tree(session, detail_url)
            links = html.xpath('//ul[@class="clearfix"]/li[1]/a/@href')
            if not links:
                # Detail page without a download link: skip instead of
                # aborting the whole run with an IndexError.
                continue

            doc = urljoin(detail_url, links[0])
            doc_name = doc.split('/')[-1]
            if not doc_name:
                # URL ends with "/", so there is no usable file name.
                continue

            response = session.get(doc, timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
            with open(os.path.join(folder, doc_name), 'wb') as f:
                f.write(response.content)

            file_counter += 1
            print(f'已下载{doc_name}, 共下载{file_counter}个简历。')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()