|

楼主 |
发表于 2020-6-13 16:37:02
|
显示全部楼层
最近试了一下，原来版本里的正则表达式已经匹配不到图片链接了，所以重新改了一下。
- import requests
- import re
- import os
- import easygui as g
def get_url(url, timeout=10):
    """Request ``url`` with a browser-like User-Agent and return the response.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole script.

    Returns:
        The response object produced by ``requests.get``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
    }
    return requests.get(url, headers=headers, timeout=timeout)
def get_img_url(url1, url2, timeout=10):
    """Request the image at ``url1`` while sending ``url2`` as the referer.

    Args:
        url1: Address of the image to fetch.
        url2: Value sent in the ``referer`` request header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response object produced by ``requests.get``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0',
        'referer': url2,
    }
    return requests.get(url1, headers=headers, timeout=timeout)
def get_page(url):
    """Return the page count advertised by the listing page at ``url``.

    The page is downloaded with ``get_url`` and searched for ``page-numbers``
    links; every non-digit character is stripped from the last match.

    Args:
        url: Address of the listing page.

    Returns:
        The digits of the last pagination match, as a string.

    Raises:
        ValueError: If the page contains no pagination link.
    """
    html = get_url(url).text
    page_match = re.findall(r"page-numbers.+[\d]+/'", html)
    if not page_match:
        # Previously this fell through to an unbound local variable.
        raise ValueError("no pagination link found on {}".format(url))
    # Only the last match matters; presumably it is the highest page number.
    return re.sub(r'\D', '', page_match[-1])
def find_picurl(url):
    """Return the project links found on the listing page at ``url``.

    Args:
        url: Address of the listing page.

    Returns:
        A list with one captured project URL per matching ``<li><a ...>``
        entry, in document order.
    """
    link_pattern = re.compile(r'<li><a href="(https://www.mzitu.com/[\d]+)" target')
    page_source = get_url(url).text
    return link_pattern.findall(page_source)
def pro_name(url):
    """Return the project names found on the listing page at ``url``.

    Names are read from ``alt='...'`` attributes that are immediately
    followed by ``width='236'``.

    Args:
        url: Address of the listing page.

    Returns:
        A list of the captured names, in document order.
    """
    html = get_url(url).text
    # [^']+ keeps the capture inside one attribute; the former greedy ``.+``
    # merged several names into one match when they shared a line.
    return re.findall(r"alt='([^']+)' width='236'", html)
def get_p_page(url):
    """Return the number of pages (one image each) in the project at ``url``.

    The page is downloaded with ``get_url`` and every ``<span>N</span>``
    holding only digits is collected; the last one is used.

    Args:
        url: Address of the project page.

    Returns:
        The last numeric span on the page, as an int.

    Raises:
        ValueError: If the page contains no numeric ``<span>``.
    """
    html = get_url(url).text
    p_page = re.findall(r'<span>([\d]+)</span>', html)
    if not p_page:
        # Previously this fell through to an unbound local variable.
        raise ValueError("no page counter found on {}".format(url))
    return int(p_page[-1])
def find_imgs(url):
    """Return the address of the ``.jpg`` image shown on the page at ``url``.

    Args:
        url: Address of a single image page of a project.

    Returns:
        The ``https...jpg`` URL captured from the ``src`` attribute of the
        ``img`` tag with class ``blur``.

    Raises:
        ValueError: If no such image tag is found on the page.
    """
    html = get_url(url).text
    # [^"]+ stops at the closing quote of src, so a later ".jpg" on the same
    # line can no longer be swallowed into the captured address.
    match = re.search(r'img class="blur" src="(https[^"]+\.jpg)', html)
    if match is None:
        raise ValueError("no image link found on {}".format(url))
    return match.group(1)
def save_img(url, ref_url):
    """Download the image at ``url`` into the current working directory.

    The file is named after the last path segment of ``url``.

    Args:
        url: Address of the image.
        ref_url: Page address passed to ``get_img_url`` as the referer.
    """
    filename = url.split('/')[-1]
    # Download before opening the file: if the request fails, no empty file
    # is created and an existing file of the same name is not truncated.
    img = get_img_url(url, ref_url).content
    with open(filename, 'wb') as f:
        f.write(img)
def main():
    """Run the interactive downloader.

    Asks for a target folder and changes into it, prints the page count of
    the listing, then repeatedly: asks for a listing page, lists its projects,
    asks which project to download, saves every image of that project, and
    asks whether to continue. Answering "N" (any case) ends the loop.
    """
    folder = g.diropenbox("请选择存放照片的文件夹")
    if folder is None:
        # The dialog yields None when cancelled; os.chdir(None) would raise.
        print("未选择文件夹,已退出")
        return
    os.chdir(folder)

    url = "https://www.mzitu.com/xinggan/"
    page = get_page(url)  # total number of listing pages
    print("总共有{}页".format(page))
    while True:
        Page = input("请输入下载第几页:")
        url1 = 'https://www.mzitu.com/xinggan/page/{}/'.format(Page)

        pic_url = find_picurl(url1)
        pic_pro = pro_name(url1)
        num = len(pic_url)  # number of projects on this page
        print("第%s页共有%d个项目" % (Page, num))
        # zip stops at the shorter list, so a name/link count mismatch no
        # longer raises IndexError while listing.
        for index, name in zip(range(1, num + 1), pic_pro):
            print("第%d个项目是:" % index)
            print(name)

        if pic_url:
            # Re-prompt until the answer is a valid 1-based project number.
            while True:
                try:
                    Num = int(input("请输入需要下载第几个项目:"))
                except ValueError:
                    Num = 0
                if 1 <= Num <= num:
                    break
                print("请输入1到%d之间的数字" % num)

            project_url = pic_url[Num - 1]
            p_page = get_p_page(project_url)
            print("共有%d页:" % p_page)
            for i in range(1, p_page + 1):
                url2 = project_url + '/' + str(i)
                img_addr = find_imgs(url2)
                print(img_addr)
                save_img(img_addr, url2)
            print("下载完毕")
        else:
            print("该页没有找到任何项目")

        ans = input("是否继续下载:输入Y/N")
        if ans.strip().upper() == 'N':
            break
# Start the interactive downloader only when this file is run as a script.
if __name__ == "__main__":
    main()
复制代码 |
|