Thanks a lot! I'm posting mine as well; hoping for plenty of pointers.
 
# coding: utf-8
import re, os, time
from lxml import etree
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

os.system('title 优美图吧整站爬虫程序 @小伍的游乐场-5yang.cc')  # set the console window title (Windows only)

result = []
urls = []
down_path = '优美图吧'
if not os.path.exists(down_path):
    os.makedirs(down_path)
 
def getpage(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    s = requests.session()  # keep a session so connections are reused
    s.mount('http://', HTTPAdapter(max_retries=10))  # up to 10 retries for both http and https
    s.mount('https://', HTTPAdapter(max_retries=10))
    x = 0
    while x < 10:
        try:
            r = s.get(url, headers=headers, timeout=(20, 20))
            if r.status_code == 200:
                return r.text
        except requests.exceptions.RequestException as e:
            print(e)
        x += 1  # count every failed attempt, not just exceptions, to avoid looping forever
        print('Request failed, retrying!')
    return None  # all retries exhausted
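
# A sketch of an alternative (my addition, not from the original post): urllib3's
# Retry class can handle retries with exponential backoff instead of the manual
# while-loop above. The helper name make_retry_session and the backoff/status
# values are assumptions; HTTPAdapter and Retry are documented APIs.
def make_retry_session(retries=10, backoff=0.5):
    from urllib3.util.retry import Retry
    retry = Retry(total=retries, backoff_factor=backoff,
                  status_forcelist=[500, 502, 503, 504])
    s = requests.session()
    s.mount('http://', HTTPAdapter(max_retries=retry))
    s.mount('https://', HTTPAdapter(max_retries=retry))
    return s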
 
def lastpage(content, url):  # find the last page number of a gallery
    if content is None:  # getpage() may have given up after all retries
        return 0
    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>', re.S)
    picurl = re.findall(zhengze, content)  # the gallery's highest page number, e.g. 42
    if len(picurl) == 0:  # no pagination links on this page
        # print('empty page, no last-page link: ', url)
        return 0
    else:
        return picurl[0]  # findall returns a list; take the first match
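
# Sketch of the same lookup with BeautifulSoup, which is imported above but unused
# in the original. This assumes the last-page link really carries title="最后页";
# find_last_page is a hypothetical helper name.
def find_last_page(content):
    soup = BeautifulSoup(content, 'lxml')
    a = soup.find('a', attrs={'title': '最后页'})
    return a.get_text() if a else 0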
 
def pingjie(url, x):  # build the URL of page x: foo.html -> foo_x.html
    # takes url and x explicitly instead of relying on globals leaking from __main__
    pingjie1 = re.sub(r'\.html$', '', url)  # strip the .html suffix (dot escaped, anchored)
    pinjie2 = pingjie1 + '_' + str(x) + '.html'
    urls.append(pinjie2)
 
 
def down_pic(content):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    if content is None:  # getpage() may have failed
        return
    picnamezz = re.compile('点击图片.*?alt="(.*?)" width', re.S)
    picname = re.findall(picnamezz, content)  # picture title

    jpgnow = re.compile('点击图片查看.*?<img src="(.*?)" alt=".*? width', re.S)
    jpgdown = re.findall(jpgnow, content)  # absolute URL of the image

    html = etree.HTML(content)
    filename = html.xpath('//*[@id="body-header-top"]/div[2]/div/div/div[2]/h1/text()')  # gallery title
    if not picname or not jpgdown or not filename:
        return  # page did not match the expected markup

    s = requests.session()  # session with up to 3 retries for http and https
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        # timeout=(connect, read): give up automatically once either limit is hit
        jpgget = s.get(jpgdown[0], headers=headers, timeout=(30, 30))
    except requests.exceptions.RequestException as e:
        # without this except, the error surfaces somewhere unrelated and is hard to locate
        print(e)
        print(time.strftime('%Y-%m-%d %H:%M:%S'))
        return  # jpgget is undefined here, so bail out instead of falling through

    down_path1 = f'优美图吧/{filename[0]}'
    if not os.path.exists(down_path1):
        os.makedirs(down_path1)
    try:
        with open(f'{down_path1}/{picname[0]}.jpg', 'wb') as f:
            f.write(jpgget.content)
        print(f'Finished downloading {picname[0]}')
    except OSError:
        print('Saving this picture failed, skipping...')
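
# Sketch (my addition, not in the original): for large images, requests can stream
# the body in chunks instead of holding it all in memory. save_streamed is a
# hypothetical helper name; stream=True and iter_content are documented requests APIs.
def save_streamed(session, url, path, headers=None):
    with session.get(url, headers=headers, stream=True, timeout=(30, 30)) as r:
        r.raise_for_status()  # treat HTTP errors as exceptions
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)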
 
 
 
 
if __name__ == "__main__":

    url = 'https://www.umtuba.com/wp-admin/admin-ajax.php'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest'}
    data = {'action': 'postlist_newajax', 'id': '0', 'type': 'index'}

    num = int(input('How many index pages do you want to download? '))
    print('This may take a while, please be patient...')
    for item in range(num):
        data['paged'] = item
        r = requests.post(url, headers=headers, data=data)
        j_data = r.json()
        # pull every gallery link out of the JSON response
        result.extend(re.findall(r'<a href="(.*?)"', str(j_data)))
    # print(result)
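
    # Sketch (my addition): the same ajax request wrapped in a reusable function.
    # The payload keys come from the script above; whether 'paged' starts at 0 or 1
    # is the site's convention and is not verified here.
    def fetch_index_page(paged):
        payload = {'action': 'postlist_newajax', 'id': '0', 'type': 'index', 'paged': paged}
        resp = requests.post(url, headers=headers, data=payload)
        return re.findall(r'<a href="(.*?)"', str(resp.json()))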
 
    for url in result:
        content = getpage(url)
        cc = lastpage(content, url)
        for x in range(1, int(cc) + 1):
            pingjie(url, x)  # queue the URL of every page in this gallery
    print(urls)
    print('This may take a while, please be patient...')
 
 
 
    # serial version, kept for reference:
    # for url in urls:
    #     down_pic(getpage(url))

    def fetch_and_save(u):  # fetch the page and download its picture inside the worker
        down_pic(getpage(u))

    # download every page with a 20-worker thread pool; submitting fetch_and_save
    # keeps the getpage() call inside the worker threads instead of the main thread
    ex = ThreadPoolExecutor(max_workers=20)
    futures = [ex.submit(fetch_and_save, u) for u in urls]
    wait(futures, return_when=ALL_COMPLETED)
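
A variant worth considering (my sketch, not part of the original script): as_completed from concurrent.futures yields each future as soon as it finishes, which makes per-task progress and error reporting straightforward, and using the executor as a context manager shuts the pool down cleanly.

    from concurrent.futures import ThreadPoolExecutor, as_completed

    with ThreadPoolExecutor(max_workers=20) as ex:
        futures = {ex.submit(fetch_and_save, u): u for u in urls}
        for fut in as_completed(futures):
            if fut.exception() is not None:
                print(f'{futures[fut]} failed: {fut.exception()}')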
 
 