# --- forum paste artifacts (post metadata/UI chrome), commented out so the file parses ---
# 发表于 2020-2-29 15:26:19 (posted 2020-02-29 15:26:19)
# 显示全部楼层 (show all posts)
#creator:qs
#description:download picture from https://www.mzitu.com/
#structure:download---year---month---file---picture
#date:2020.2.28
#Need:延时自动切换
#借鉴了楼主!没写注释!有点bug!但能用!
import requests as r
from bs4 import BeautifulSoup as bs
import easygui as g
import lxml
import os
import time as t
def get_soup(url):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    A Referer header is sent because the site rejects requests without one.
    """
    request_headers = {
        'Referer': 'https://www.mzitu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    }
    page = r.get(url, headers=request_headers)
    return bs(page.text, "lxml")
def get_pic(url):
    """Download *url* and return the raw response bytes (the image data).

    Same anti-hotlinking Referer header as get_soup.
    """
    request_headers = {
        'Referer': 'https://www.mzitu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36',
    }
    return r.get(url, headers=request_headers).content
def init_file(file, soup):
    """Walk the archive page *soup* and mirror it as year/month/album folders
    under *file*, downloading every album and writing a per-month index file.

    Directory layout produced: file\year\month\album\*.jpg plus
    file\year\month\month.txt listing each album's href.
    """
    year_labels = soup.find_all(class_="year")
    year_blocks = soup.find_all(class_="archives")
    # year_labels[k] is the heading for year_blocks[k] — the page lists them in lockstep
    for label, block in zip(year_labels, year_blocks):
        year_name = label.text
        os.mkdir(file + "\\" + year_name)
        for item in block.find_all("li"):
            month_name = item.p.em.text
            # slice strips the fixed prefix/suffix around the month summary text
            month_text = item.p.text[6:-2]
            os.mkdir(file + "\\" + year_name + "\\" + month_name)
            month_node = item.contents[1]
            index_lines = year_name + "\t" + month_name + "\t" + month_text
            for number, anchor in enumerate(month_node.find_all("a"), start=1):
                album_name = anchor.text
                album_dir = file + "\\" + year_name + "\\" + month_name + "\\" + album_name
                os.mkdir(album_dir)
                album_href = anchor["href"]
                index_lines += "\n" + str(number) + "\t" + album_href
                download_pics(album_dir, album_href, album_name)
                t.sleep(3)  # throttle between albums to avoid being blocked
            index_file = open(file + "\\" + year_name + "\\" + month_name + "\\" + month_name + ".txt", "w")
            index_file.write(index_lines)
            index_file.close()
def download_pics(file_address, file_href, file_name):
    """Download every page image of the album at *file_href* into directory
    *file_address*, then write *file_name*.txt listing the image URLs.

    The album's first image URL and its page count are scraped from the album
    page; per-page URLs are derived by get_pic_url.
    """
    soup = get_soup(file_href)
    pic_first_url = soup.find(class_="main-image").p.a.img["src"]
    # the second-to-last pagenavi link holds the total page count
    page_num = int(soup.find(class_="pagenavi").find_all("a")[-2].text)
    addresses = file_name + "\t" + str(page_num) + "张"
    print(addresses)  # progress output (was marked "testing" by the author)
    for i in range(page_num):
        pic_url = get_pic_url(pic_first_url, i + 1)
        addresses += "\n" + str(i + 1) + "\t" + pic_url
        pic = get_pic(pic_url)
        # keep the original extension (last 4 chars, e.g. ".jpg")
        pic_address = file_address + "\\" + str(i + 1) + pic_first_url[-4:]
        # BUGFIX: files are now closed via context managers; the original also
        # ended with a `del` statement broken across two lines (a SyntaxError),
        # which is unnecessary anyway — locals are freed on function exit.
        with open(pic_address, "wb") as pic_file:
            pic_file.write(pic)
    file_save_address = file_address + "\\" + file_name + ".txt"
    with open(file_save_address, "w") as addresses_txt:
        addresses_txt.write(addresses)
def get_pic_url(pic_first_url, i):
    """Return the URL of page *i*'s image, derived from the album's first
    image URL.

    First-image URLs end in a zero-padded two-digit name plus a 4-char
    extension, e.g. ".../01.jpg". For a single-digit *i* only the final digit
    is replaced (keeping the leading zero); for two or more digits the whole
    two-character name is replaced.

    BUGFIX/cleanup: the original had an `elif 10 <= i <= 99` branch and an
    `else` branch with byte-identical bodies — collapsed into one.
    """
    suffix = pic_first_url[-4:]  # e.g. ".jpg"
    if i <= 9:
        return pic_first_url[:-5] + str(i) + suffix
    return pic_first_url[:-6] + str(i) + suffix
# Script entry point: ask the user for a target directory, then mirror both
# the "all" (new) and "old" archive pages into sibling download folders.
# BUGFIX: the second URL was garbled ("https://`1`ghjl:www.mzitu.com/old/");
# corrected to the real archive address.
urls = ["https://www.mzitu.com/all/", "https://www.mzitu.com/old/"]
file = g.diropenbox("选择储藏物质")  # GUI folder picker; returns the chosen path
files = [file + '\\download_new', file + '\\download_old']
for i in range(len(files)):
    soup = get_soup(urls[i])
    os.mkdir(files[i])
    init_file(files[i], soup)
# --- end of forum paste (trailing UI chrome removed) ---