| 
 | 
 
 
发表于 2020-7-15 09:13:33
|
显示全部楼层
 
 
 
 
文件太多，我没有全部测试，只测试了几条：
D:\wp>py app4.py 
c1(AR)-18163914863.pdf 下载成功 
c00814-18023226274.pdf 下载成功 
c00814AR-19195125508.pdf 下载成功 
c00814-16464376689.pdf 下载成功 
import os
import re
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
 
 - url = 'http://www.jkl.com.cn/cn/invest.aspx'
 
 - UA = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
 
 - r1 = requests.get(url=url, headers=UA).text
 
 - x1 = etree.HTML(r1)
 
 - n1 = x1.xpath('//div[@class="infoLis"]//a/text()')
 
 - l1 = x1.xpath('//div[@class="infoLis"]//@href')
 
 - keys1 = [] ; values1 = []
 
 - keys2 = [] ; values2 = []
 
 - for n2 in n1:
 
 -     n2 = n2.strip()
 
 -     keys1.append(n2)
 
 - for l2 in l1:
 
 -     l2 = 'http://www.jkl.com.cn/cn/' + l2
 
 -     values1.append(l2)
 
 - d1 = dict(zip(keys1,values1))
 
 - for n3,l3 in d1.items( ):
 
 -     n3 = n3.replace('/','.')
 
 -     path = 'c:/' + n3
 
 -     if not os.path.exists(path):
 
 -         os.mkdir(path)
 
 -     r2 = requests.get(url=l3, headers=UA).text
 
 -     x2 = etree.HTML(r2)
 
 -     weiye = x2.xpath('//a[text()="尾页"]/@href')
 
 -     if weiye != []:
 
 -         re1 = re.search("(\d+)'\)", weiye[0])
 
 -         yeshu = re1.group(1)
 
 -     else:
 
 -         yeshu = 1
 
 -     for yema in range(1, int(yeshu)+1):
 
 -         data = {'__EVENTARGUMENT': yema,
 
 -                 '__EVENTTARGET': 'AspNetPager1'
 
 -                 }
 
 -         r3 = requests.get(url=l3, params=data, headers=UA).text
 
 -         x3 = etree.HTML(r3)
 
 -         l4 = x3.xpath('//div[@class="newsLis"]//li//@href')
 
 -         n4 = x3.xpath('//div[@class="newsLis"]//li/a/text()')
 
 -         for n5 in n4:
 
 -             n5 = n5.strip()
 
 -             keys2.append(n5)
 
 -         for l5 in l4:
 
 -             l5 = 'http://www.jkl.com.cn' + l5
 
 -             values2.append(l5)
 
 - d2 = dict(zip(keys2,values2))
 
 - #print(d1)
 
 - for x in d2:
 
 -    if not os.path.exists(x):
 
 -         os.makedirs(x)
 
 - for x in d2:  
 
 -     pdfurl=d2[x]
 
 -     pdfdata=requests.get(pdfurl).content      
 
 -     with open(d2[x].split("/")[-1],"wb") as f:
 
 -          f.write(pdfdata)
 
 -     print(d2[x].split("/")[-1],"下载成功")     
 
  复制代码 |   
 
 
 
 |