Thanks a lot! Here is mine as well, hoping for some feedback.

# coding: utf-8
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

import requests
from lxml import etree
from requests.adapters import HTTPAdapter

os.system('title 优美图吧整站爬虫程序 @小伍的游乐场-5yang.cc')  # set the console window title (Windows cmd only)

result = []  # gallery URLs collected from the ajax index
urls = []    # per-page URLs to be downloaded
down_path = '优美图吧'
if not os.path.exists(down_path):
    os.makedirs(down_path)

def getpage(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    s = requests.session()  # reuse one session so connections are kept alive
    s.mount('http://', HTTPAdapter(max_retries=10))  # up to 10 transport-level retries for both http and https
    s.mount('https://', HTTPAdapter(max_retries=10))
    x = 0
    while x < 10:
        try:
            r = s.get(url, headers=headers, timeout=(20, 20))
            if r.status_code == 200:
                return r.text
        except requests.exceptions.RequestException as e:
            print(e)
        x += 1
        print('Request failed, retrying!')
    return None  # all 10 attempts failed; callers must handle None

def lastpage(content, url):  # extract the gallery's last page number
    # "最后页" is the site's own "last page" link text in the pagination markup
    zhengze = re.compile('page-numbers".*?"最后页">(.*?)</a>', re.S)
    picurl = re.findall(zhengze, content)  # max page number of the gallery, e.g. 42
    if len(picurl) == 0:  # no pagination found on this page
        # print('No pagination on this page: ', url)
        return 0
    else:
        return picurl[0]  # findall returns a list; take the first match

def pingjie(url, x):  # build the URL of page x: .../123.html -> .../123_2.html
    base = re.sub(r'\.html$', '', url)  # strip the .html suffix
    urls.append(base + '_' + str(x) + '.html')

def down_pic(content):
    if content is None:  # getpage() gave up on this page
        return
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
    # these patterns match the site's own Chinese markup ("点击图片" = "click the image")
    picnamezz = re.compile('点击图片.*?alt="(.*?)" width', re.S)
    picname = re.findall(picnamezz, content)
    # print('Image name:', picname[0])
    jpgnow = re.compile('点击图片查看.*?<img src="(.*?)" alt=".*? width', re.S)
    jpgdown = re.findall(jpgnow, content)  # absolute URL of the image
    # print('Image URL:', jpgdown[0])
    html = etree.HTML(content)
    filename = html.xpath('//*[@id="body-header-top"]/div[2]/div/div/div[2]/h1/text()')  # gallery title
    s = requests.session()  # session with up to 3 transport-level retries
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    try:
        # timeout=(30, 30) is (connect, read); the request is abandoned past either limit
        jpgget = s.get(jpgdown[0], headers=headers, timeout=(30, 30))
    except requests.exceptions.RequestException as e:
        # without this except, errors raised here are very hard to trace back
        print(e)
        print(time.strftime('%Y-%m-%d %H:%M:%S'))
        return  # was `pass`, which left jpgget undefined below
    down_path1 = f'优美图吧/{filename[0]}'
    os.makedirs(down_path1, exist_ok=True)  # exist_ok avoids a race between worker threads
    try:
        with open(f'{down_path1}/{picname[0]}.jpg', 'wb') as f:
            f.write(jpgget.content)
        print(f'Finished downloading {picname[0]}')
    except (IndexError, OSError):
        print('Could not save this image, skipping...')

if __name__ == '__main__':
    url = 'https://www.umtuba.com/wp-admin/admin-ajax.php'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36', 'X-Requested-With': 'XMLHttpRequest'}
    data = {'action': 'postlist_newajax', 'id': '0', 'type': 'index'}
    num = int(input('How many index pages do you want to download: '))
    for dots in range(1, 7):
        print('This may take a while, please be patient' + '.' * dots)
    # step 1: collect gallery URLs from the ajax post list
    for item in range(num):
        data['paged'] = item
        r = requests.post(url, headers=headers, data=data)
        j_data = r.json()
        # print(j_data)
        result.extend(re.findall(r'<a href="(.*?)"', str(j_data)))
    # print(result)
    # step 2: expand every gallery into one URL per page
    for url in result:
        content = getpage(url)
        if content is None:
            continue  # skip galleries that could not be fetched
        cc = lastpage(content, url)
        for x in range(1, int(cc) + 1):
            pingjie(url, x)
    print(urls)
    for dots in range(1, 7):
        print('This may take a while, please be patient' + '.' * dots)
    # serial version, kept for reference:
    # for url in urls:
    #     down_pic(getpage(url))
    # step 3: download with 20 worker threads; note that getpage() still runs
    # serially in the main thread here, only down_pic() is parallel
    ex = ThreadPoolExecutor(max_workers=20)
    future = [ex.submit(down_pic, getpage(url)) for url in urls]
    wait(future, return_when=ALL_COMPLETED)
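
One thing I noticed while cleaning this up: getpage() retries on two levels, HTTPAdapter(max_retries=10) at the transport layer plus the while loop on top, so a dead URL can be attempted up to 100 times. A minimal sketch of handing all of it to urllib3's Retry instead, with backoff between attempts (make_session and the status_forcelist values are my own choices, not from the code above):

    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    import requests

    def make_session():
        # a single Retry object covers connect/read errors and retryable
        # status codes, sleeping progressively longer between attempts
        retry = Retry(total=5, backoff_factor=1,
                      status_forcelist=[500, 502, 503, 504])
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=retry))
        s.mount('https://', HTTPAdapter(max_retries=retry))
        return s

With that, getpage() could shrink to a single s.get() call guarded by one try/except.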
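
Another thing I am not fully sure about: filename[0] and picname[0] come straight from the page HTML, and on Windows characters like ? or : in a title will make open() fail. A small helper I am considering adding (safe_name is hypothetical; the character class is just Windows' reserved set):

    import re

    def safe_name(name):
        # replace characters Windows forbids in file names with underscores
        return re.sub(r'[\\/:*?"<>|]', '_', name).strip()

The save line would then become open(f'{down_path1}/{safe_name(picname[0])}.jpg', 'wb').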
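
Finally, wait(future, return_when=ALL_COMPLETED) keeps any exception locked inside its Future, so a crashing worker fails silently. A sketch of the same pool driven by as_completed() so worker errors actually get printed (same down_pic/getpage/urls as above):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    with ThreadPoolExecutor(max_workers=20) as ex:
        futures = {ex.submit(down_pic, getpage(u)): u for u in urls}
        for fut in as_completed(futures):
            try:
                fut.result()  # re-raises whatever happened in the worker
            except Exception as e:
                print(futures[fut], 'failed:', e)

Any advice on these points is welcome.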