|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
#coding=gbk
import urllib.request
import os
import time
def url_open(url, timeout=30):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a Referer and a browser-like User-Agent header.
    This is a best-effort fetch: any failure (malformed URL, network error,
    HTTP error status, timeout) yields ``b''`` instead of raising, so callers
    only need to test the result for emptiness.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations, passed
            through to ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``; ``b''`` when the fetch failed.
    """
    headers = {
        'Referer': 'https://www.mzitu.com/',
        'User-Agent': 'Mozilla /5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read()
    except Exception:
        # Deliberately broad: callers treat "empty" as "not available".
        # Return bytes (not str) so the result type is the same on success
        # and failure; callers decode it or write it to a binary file.
        return b''
def get_page(url):
    """Return the text of the last page-number element found at *url*.

    The page is scanned for ``class='page-numbers`` markers. Starting at the
    first marker, each following marker is searched for within 255 characters
    of the previous one; for every marker the text between the next ``>`` and
    the following ``<`` is remembered, and the last remembered text is
    returned.

    Args:
        url: Address of the page to inspect.

    Returns:
        The text of the last matched element as a string, or ``''`` when the
        page could not be fetched or no marker yielded any text.
    """
    raw = url_open(url)
    # url_open signals failure with an empty result; decoding it blindly would
    # raise instead of reporting "no page count".
    if not raw:
        return ''
    html = raw.decode('utf-8')

    marker = "class='page-numbers"
    last_label = ''
    pos = html.find(marker)
    while pos != -1:
        tag_end = html.find('>', pos, pos + 64)
        if tag_end != -1:
            text_end = html.find('<', tag_end, tag_end + 16)
            if text_end != -1:
                last_label = html[tag_end + 1:text_end]
        # Only follow markers close to the current one, so the scan stays
        # within one run of adjacent page-number elements.
        pos = html.find(marker, pos + 1, pos + 255)
    return last_label
def find_mz(url):
    """Extract album links and their titles from the listing page at *url*.

    Scanning starts at the first ``data-original=`` attribute and then walks
    every ``<span><a href=`` link. For each link the href value (up to
    `` target=``) is collected as the address and the link text as the title.

    Args:
        url: Address of a listing page.

    Returns:
        A tuple ``(mz_addrs, mz_brief)`` of two equally long lists: the album
        addresses and the matching titles. Both lists are empty when the page
        could not be fetched.
    """
    mz_addrs = []
    mz_brief = []
    raw = url_open(url)
    if not raw:
        # Always return the same 2-tuple shape: the caller unpacks two values.
        return mz_addrs, mz_brief
    html = raw.decode('utf-8')

    link_marker = '<span><a href='
    pos = html.find('data-original=')
    while pos != -1:
        pos = html.find(link_marker, pos, pos + 255)
        if pos == -1:
            break
        href_end = html.find(' target=', pos, pos + 64)
        if href_end != -1:
            href = html[pos + len(link_marker):href_end]
            mz_addrs.append(href.replace('"', ''))
            # Fall back to the last URL segment when no link text is found.
            brief = url.split('/')[-1]
            tag_end = html.find('>', href_end, href_end + 32)
            if tag_end != -1:
                text_end = html.find('<', tag_end, tag_end + 255)
                if text_end != -1:
                    # NOTE(review): the slice stops one character before '<',
                    # dropping the last character of the link text. Kept as-is
                    # because titles are used as folder names and resume
                    # records; confirm whether this is intentional.
                    brief = html[tag_end + 1:text_end - 1]
            mz_brief.append(brief)
            resume_at = href_end
        else:
            # Malformed link: step past the marker so the scan still advances.
            resume_at = pos + 15
        pos = html.find(link_marker, resume_at)
    return mz_addrs, mz_brief
def undress_mz(url):
    """Collect the image addresses of the album at *url*.

    Pages ``url/1``, ``url/2``, ... are fetched in turn until a fetch returns
    nothing. On each page the ``main-image`` marker is located, then the
    ``<img src=`` that follows within 128 characters, and the text up to and
    including ``.jpg`` is taken as the image address. The function sleeps for
    ``delay`` seconds after every page that was fetched.

    Args:
        url: Base address of the album.

    Returns:
        List of image addresses in page order.
    """
    image_urls = []
    page_no = 0
    while True:
        page_no += 1
        page = url_open(url + '/' + str(page_no))
        if not len(page) > 0:
            # An empty fetch is treated as the end of the album.
            print(' -- '+url+'此MZ共 '+str(page_no - 1)+' 个精彩瞬间/DRESS.')
            break
        page = page.decode('utf-8')

        anchor = page.find('main-image')
        img_at = page.find('<img src=', anchor, anchor + 128) if anchor != -1 else -1
        ext_at = page.find('.jpg', img_at, img_at + 64) if img_at != -1 else -1
        if ext_at != -1:
            # Skip the 9 characters of '<img src=' and keep the 4 of '.jpg'.
            image_urls.append(page[img_at + 9:ext_at + 4].replace('"', ''))
            print(' ** 发现MZ第'+str(page_no)+'个精彩瞬间 ... ')
        time.sleep(delay)
    return image_urls
def save_mzdress(imgurl):
    """Download *imgurl* into the current directory.

    The file is named after the last path segment of the address.

    Args:
        imgurl: Address of the image to download.

    Returns:
        ``True`` when the image was written, ``False`` when the download
        produced no data and nothing was written.
    """
    # Download before opening the target: a failed fetch must neither leave an
    # empty file behind nor crash while writing a non-bytes value.
    img = url_open(imgurl)
    if not img:
        return False
    filename = imgurl.split('/')[-1]
    with open(filename, 'wb') as f:
        f.write(img)
    return True
def read_step():
    """Load the recorded steps from ``step_file`` into ``step_list``.

    Does nothing when the file does not exist. Every line of the file is
    appended unchanged, including its trailing newline.
    """
    if not os.path.isfile(step_file):
        return
    with open(step_file, 'r', encoding='UTF-8') as fh:
        for record in fh:
            step_list.append(record)
def step_complete(content):
    """Tell whether *content* counts as an already finished step.

    A step is finished when it appears in ``step_list`` and is not the final
    entry of that list. Steps are recorded before their download starts, so
    the final entry may belong to an interrupted run.

    Args:
        content: The step record to look up, exactly as it was written.

    Returns:
        ``True`` if the step is recorded and is not the last entry, else
        ``False``.
    """
    if content not in step_list:
        return False
    return step_list[-1] != content
def mark_step(content):
    """Append *content* to ``step_file`` and flush it.

    Args:
        content: Text to append, written as-is with UTF-8 encoding.
    """
    with open(step_file, 'a', encoding='utf-8') as log:
        log.write(content)
        log.flush()
def down_mz():
    """Walk every listing page and download each album into its own folder.

    The number of listing pages is read from the home page ``url``. For each
    page the albums are listed with ``find_mz``; every album not already
    finished (see ``step_complete``) is recorded with ``mark_step``, gets a
    folder named after its title in the current directory, and has its images
    saved there. Pauses of ``delay`` seconds follow each image and
    ``delay * 8`` seconds follow each album.

    Failures inside one album are reported and the run continues with the
    next album.
    """
    try:
        page_num = int(get_page(url))
    except ValueError:
        # get_page returns '' (or other non-numeric text) when the page
        # count cannot be determined; there is nothing to iterate over.
        print('!!! 未能获取总页数, 退出 !!!')
        return
    print('!!! 共发现'+str(page_num)+'波MZ !!!')

    # Remember where album folders live so each album returns to exactly this
    # directory; the step file is also opened relative to it.
    base_dir = os.getcwd()
    for pIdx in range(1, page_num + 1):
        print('-- 第'+str(pIdx)+'波MZ准备入场')
        page_url = url + 'page/' + str(pIdx) + '/'
        mz_addrs, mz_brief = find_mz(page_url)
        for i, (addr, brief) in enumerate(zip(mz_addrs, mz_brief)):
            record = addr + ',' + brief + '\n'
            if step_complete(record):
                print(' -- '+addr+','+brief+' 第'+str(pIdx)+'波第'+str(i+1)+'个MZ已OK, 跳过... ...')
                continue
            mark_step(record)
            try:
                os.mkdir(brief)
            except OSError:
                # Usually the folder is left over from an earlier run; any
                # real failure shows up at the chdir below.
                pass
            try:
                os.chdir(brief)
            except OSError as e:
                # A title that is not a usable folder name must not abort the
                # whole run.
                print(' !! 无法进入目录, 跳过: '+brief+' ('+str(e)+')')
                continue
            try:
                print(' -- '+addr+' 第'+str(pIdx)+'波第'+str(i+1)+'个MZ特点: '+brief)
                mz_dresses = undress_mz(addr)
                for j, dress in enumerate(mz_dresses):
                    save_mzdress(dress)
                    print(' -- 保存MZ第'+str(j+1)+'个瞬间 ... '+dress)
                    time.sleep(delay)
            except Exception as e:
                # Best effort per album, but say what went wrong instead of
                # hiding it.
                print(' !! 下载出错, 跳过剩余部分: '+str(e))
            finally:
                os.chdir(base_dir)
            # NOTE: the message says 6s, but the actual pause is delay * 8.
            print(' -- 中场休息(6s) ... ...')
            time.sleep(delay*8)
            print(' ')
# Home page of the site.
url = "https://www.mzitu.com/"

# Directory the downloads are saved in.
folder = "MZT"

# Delay in seconds after each fetched address, to avoid an IP ban; adjust as
# needed.
delay = 0.4

# In-memory working list used for resuming an interrupted run.
step_list = list()

# File in which resume progress is saved.
step_file = "mzstep.txt"
if __name__ == '__main__':
    # exist_ok tolerates the folder left by an earlier run, while a real
    # failure (e.g. no permission, or a file with that name) is raised here
    # with its actual cause instead of being swallowed.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    # Load the progress of a previous run before starting, so finished albums
    # are skipped.
    read_step()
    down_mz()
    print(' --- 已成功完成 --- ')
|
|