|  | 
 
 发表于 2020-6-14 20:34:25
|
显示全部楼层 
| 2020.6.14 随笔：做了一个下载煎蛋网（jandan.net/ooxx）图片的小爬虫 
 import os
 import urllib.request
 import urllib.parse
 import bs4
 def open_url(url, timeout=30):
     """Fetch *url* and return the raw response body as bytes.

     The request carries a desktop-browser ``User-Agent`` header instead of
     urllib's default one (presumably because the target site rejects the
     default agent -- confirm against the site).

     Args:
         url: Address to fetch.
         timeout: Seconds to wait on blocking network operations before
             giving up. ``None`` waits indefinitely.

     Returns:
         The undecoded response body.

     Raises:
         urllib.error.URLError: If the request cannot be completed.
     """
     req = urllib.request.Request(url)
     req.add_header(
         "User-Agent",
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
         "AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/83.0.4103.61 Safari/537.36",
     )
     # The context manager closes the response (and its socket) even when
     # read() fails; this function is called once per page and per image.
     with urllib.request.urlopen(req, timeout=timeout) as response:
         return response.read()
 # Find the page that follows the current one.
 def begin(url):
     """Return the href of the "previous-comment-page" link found on *url*.

     The page is fetched with ``open_url``, decoded as UTF-8 and parsed with
     BeautifulSoup's ``html.parser``.

     Args:
         url: Address of the page currently being processed.

     Returns:
         The ``href`` attribute of the first element whose class is
         ``previous-comment-page``, exactly as it appears in the markup.
         (The caller in this file prepends ``"http:"``, so the value is
         presumably protocol-relative -- confirm against the live site.)

     Raises:
         IndexError: If the page contains no such link.
         KeyError: If the link element has no ``href`` attribute.
     """
     html = open_url(url).decode("utf-8")
     soup = bs4.BeautifulSoup(html, "html.parser")
     link = soup.find(class_="previous-comment-page")
     print("begin")
     if link is None:
         # Same exception type an empty find_all()[0] would raise, but with
         # a message that says which page had no further link.
         raise IndexError(
             "no 'previous-comment-page' link found on {}".format(url)
         )
     return link.attrs["href"]
 
 # Collect the download addresses of every image on the current page.
 def find_imgs(url_page):
     """Return the ``src`` of each matching ``<img>`` tag on *url_page*.

     The page is fetched with ``open_url``, decoded as UTF-8 and parsed with
     BeautifulSoup's ``html.parser``. Only ``<img>`` tags whose
     ``referrerpolicy`` attribute equals ``"no-referrer"`` are considered.
     Each address is printed as it is found, followed by ``find`` once the
     page has been scanned.

     Args:
         url_page: Address of the page to scan.

     Returns:
         A list of ``src`` attribute values in document order; empty when
         the page has no matching images.
     """
     html = open_url(url_page).decode("utf-8")
     soup = bs4.BeautifulSoup(html, "html.parser")
     addrs = []
     # src=True limits the match to tags that actually have a src attribute,
     # so the lookup below cannot raise KeyError on an <img> without one.
     for tag in soup.find_all("img", referrerpolicy="no-referrer", src=True):
         src = tag.attrs["src"]
         print(src)
         addrs.append(src)
     print("find")
     return addrs
 # Save the images.
 def save_imgs(img_addrs):
     """Download every address in *img_addrs* into the current directory.

     Each image is fetched with ``open_url`` and written in binary mode to
     a file named after the last segment of the URL path. ``save`` is
     printed once after all addresses have been processed.

     Args:
         img_addrs: Iterable of image addresses. Addresses without a scheme
             get ``"http:"`` prepended; addresses that already have one are
             used unchanged.

     Returns:
         None.
     """
     for addr in img_addrs:
         # Prefix only scheme-less addresses: blindly prepending "http:" to
         # an absolute URL would produce "http:http://...".
         if urllib.parse.urlsplit(addr).scheme:
             url = addr
         else:
             url = "http:" + addr
         data = open_url(url)
         # Take the name from the path component so a query string or
         # fragment never ends up in the file name.
         name = urllib.parse.urlsplit(url).path.split("/")[-1]
         with open(name, "wb") as f:
             f.write(data)
     print("save")
 
 # Starting page: the first page the crawl visits; passed to download()
 # by the script entry point.
 url = "http://jandan.net/ooxx"
 def download(url, page=30):
     """Download the images of *url* and of the pages that follow it.

     An ``aaa`` folder is created under the current working directory (if
     it does not exist yet) and becomes the working directory, so the
     process stays inside that folder after this function returns. The
     images of *url* are saved first; the next page is then located with
     ``begin`` and processed the same way, ``page - 1`` more times.

     Args:
         url: Address of the first page to download from.
         page: Total number of pages to process, the first one included.
             Values of 1 or less process only the first page.

     Returns:
         None.
     """
     path = os.path.join(os.getcwd(), "aaa")
     # exist_ok=True: a second run must not fail just because the folder
     # was already created by an earlier run.
     os.makedirs(path, exist_ok=True)
     os.chdir(path)

     # Images of the first page.
     save_imgs(find_imgs(url))

     # Walk the following pages, each located from the one before it.
     for _ in range(page - 1):
         next_href = begin(url)
         # Prefix only scheme-less links so an absolute href stays valid.
         if urllib.parse.urlsplit(next_href).scheme:
             url = next_href
         else:
             url = "http:" + next_href
         save_imgs(find_imgs(url))
 
 
 # Script entry point: start the crawl from the module-level start page
 # only when the file is executed directly, not when it is imported.
 if __name__ == "__main__":
     download(url)
 
 
 
 
 | 
 |