|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import urllib.request
- #引入解析器包
- from bs4 import BeautifulSoup as bs
- import os
# Open a URL and return the raw response body.
def open_url(url):
    """Fetch *url* with a mobile-browser User-Agent and return the body as bytes."""
    req = urllib.request.Request(url)
    # Spoof a mobile Chrome UA so the site serves the page instead of blocking the bot.
    req.add_header('user-agent', 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36')
    # Context manager guarantees the connection is closed even on error
    # (the original never closed the response object).
    with urllib.request.urlopen(req) as response:
        return response.read()
# Wrap BeautifulSoup construction so the parser choice lives in one place.
def url_bs(html):
    """Parse an HTML document (str or bytes) and return a BeautifulSoup tree."""
    return bs(html, 'html.parser')
# Follow "next-comment-page" links recursively, collecting each URL into *ls*,
# and return the last URL reached, i.e. the first (oldest) comment page.
def a(x, ls):
    """Walk `next-comment-page` anchors starting from soup *x*.

    Each visited URL is appended to *ls*; the last element of *ls* (the
    first page's URL) is returned.  Returns None when *x* has no next-page
    link and *ls* is empty.
    """
    d = x.find('a', 'next-comment-page')
    # Guard first: the original subscripted d['href'] before checking d,
    # so the final page raised TypeError and relied on the caller's
    # try/except to stop the recursion — and a first call with no link crashed.
    if d is None:
        return ls[-1] if ls else None
    url = 'http:' + d['href']
    soup = url_bs(open_url(url).decode())
    ls.append(url)
    # Recurse; the terminal call above ends the walk cleanly.
    a(soup, ls)
    return ls[-1]
-
# Decide whether *html* is already the first comment page; if not, walk back
# to the first page and return its parsed soup.
def judge(html):
    """Return the soup of the first (oldest) comment page reachable from *html*.

    If *html* has no `next-comment-page` link it is already the first page
    and is returned unchanged.
    """
    p_page = html.find('a', 'next-comment-page')
    if not p_page:
        # No "next page" link: already on the first page.
        return html
    new_page = 'http:' + p_page['href']
    # Parse the next page and let a() walk the remaining links; it returns
    # the first page's URL.
    soup = url_bs(open_url(new_page).decode())
    first_url = a(soup, [])
    # BUG FIX: the original returned raw bytes from open_url() here, while
    # the other branch returned a parsed soup; downstream find_img() needs
    # a soup with find_all(), so parse before returning.
    return url_bs(open_url(first_url).decode())
# Advance one page by following the "previous-comment-page" link of *html*.
def next_page(html):
    """Return the parsed soup of the page linked by `previous-comment-page`."""
    link = html.find('a', 'previous-comment-page')
    target = 'http:' + link['href']
    return url_bs(open_url(target))
-
# Extract every <img> URL from the page, fix up the scheme, append to *ls*.
def find_img(html, ls):
    """Append the absolute URL of every <img> src in *html* to *ls*; return *ls*.

    The original located the src by slicing a fixed 60 characters out of
    str(tag), which truncated long URLs and broke when attributes were
    reordered; reading the attribute directly is robust.
    """
    for tag in html.find_all('img'):
        src = tag.get('src')
        if not src:
            # <img> without a src attribute: nothing to download.
            continue
        if src.startswith('//'):
            # Protocol-relative URL: give it an explicit scheme.
            src = 'http:' + src
        ls.append(src)
    return ls
# Download each URL in *img* into the current directory, naming the file
# after the last path segment of the URL.
def save_img(img):
    """Fetch every image URL in *img* and write it to disk, printing each name."""
    for url in img:
        name = url.split('/')[-1]
        data = open_url(url)
        with open(name, 'wb') as fh:
            fh.write(data)
        print(name)
-
# Main driver: create the target folder and download *pages* pages of images.
def download(folder='pic', pages=10):
    """Download *pages* pages of images from jandan's ooxx board into *folder*.

    folder: directory (created if missing) that receives the image files.
    pages:  number of comment pages to walk, starting from the first page.
    """
    # BUG FIX: os.mkdir() raised FileExistsError on a second run;
    # makedirs with exist_ok tolerates an existing folder.
    os.makedirs(folder, exist_ok=True)
    # Work inside the folder so save_img() writes files there.
    os.chdir(folder)

    url = 'http://i.jandan.net/ooxx'
    html = open_url(url).decode()
    soup = url_bs(html)
    # Rewind to the first (oldest) comment page before walking forward.
    page = judge(soup)

    for _ in range(pages):
        # BUG FIX: the original reused one list across pages, so every
        # earlier image was re-downloaded on each iteration; use a fresh
        # list per page instead.
        img_urls = find_img(page, [])
        save_img(img_urls)
        # Move to the next page via its "previous-comment-page" link.
        page = next_page(page)
-
# Script entry point: run the downloader with default settings when
# executed directly (not on import).
if __name__ == '__main__':
    download()
复制代码 |
|