import os
import re

import requests
from bs4 import BeautifulSoup


def get_url_list(url):
    """Return the list of gallery URLs found on one listing page."""
    response = requests.get(url).text
    soup = BeautifulSoup(response, 'html.parser')
    href1 = re.compile('https://www.meitulu.com/item/')
    url_list = []
    # Collect every gallery link on the page, skipping duplicates
    for class_name in soup.find_all('a', href=href1):
        ele = class_name.get('href')
        if ele not in url_list:
            url_list.append(ele)
    # url_list now holds the URL of every gallery on this page
    for item_url in url_list:
        print(item_url)
    return url_list


def img_num_list(url_list):
    """Return the number of images in each gallery of url_list."""
    num_list = []
    for name in url_list:
        response = requests.get(name)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # The gallery page shows the image count next to the text "图片数量"
        str1 = soup.find_all(string=re.compile('图片数量'))
        num = int(re.findall('[0-9]+', str1[0])[0])
        num_list.append(num)
    return num_list


def img_list(url, num):
    """Return the list of image URLs for one gallery."""
    # Each gallery page shows 4 images, so work out how many pages there are
    if num % 4 == 0:
        last_page = num // 4
    else:
        last_page = num // 4 + 1
    imglist = []
    # The first page is special: its URL has no page suffix
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser').center
    for img in soup.find_all('img'):
        imglist.append(img.get('src'))
    # Pages 2 and later follow the pattern <item>_<page>.html
    for i in range(2, last_page + 1):
        page_url = url[:-len('.html')] + f'_{i}.html'
        html = requests.get(page_url).text
        soup = BeautifulSoup(html, 'html.parser').center
        for img in soup.find_all('img'):
            imglist.append(img.get('src'))
    return imglist


def save_image_list(imglist, url):
    """Save one gallery's images to disk."""
    for i in imglist:
        img_name = i.split('/')[-2] + '-' + i.split('/')[-1]
        r = requests.get(i)
        with open(r"E://meitulu//%s//%s" % (url.split('/')[-2], img_name), 'wb') as f:
            f.write(r.content)


def other_pages(url):
    """Return True while the listing page exists, False once the site reports it as missing."""
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    # The site returns this fixed title for deleted or non-existent pages
    return soup.title.string != "美图录-您访问的信息已删除或不存在"


def papapa(url):
    """Download every gallery of a model, walking through all listing pages."""
    file_name = url.split('/')[-2]
    os.mkdir(f"E://meitulu//{file_name}")
    count2 = 2
    while other_pages(url):
        url_list = get_url_list(url)
        num_list = img_num_list(url_list)
        count = 0
        for count3 in url_list:
            try:
                presave_img_list = img_list(count3, num_list[count])
                save_image_list(presave_img_list, url)
            except Exception:
                print('爬虫失败 ' + url + ' ' + count3)
            finally:
                count += 1
        # Move on to the next listing page: .../2.html, .../3.html, ...
        if url.split('/')[-1] == f"{count2}.html":
            url = url.replace(f"{count2}.html", f"{count2 + 1}.html")
            count2 += 1
        else:
            url = url + f'{count2}.html'


papapa("https://www.meitulu.com/t/xiameijiang/")
I'm a newbie and wrote this small crawler; pointers from the experienced folks here are welcome.
How to use: open the site from the example, go to a model's profile page, and pass the URL of the first page of that profile to papapa(); the script will then download all of her photos.
Note: the script creates a folder on drive E, so make sure no folder with the same name already exists there.
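That note is the part most likely to trip people up: papapa() calls os.mkdir(), which raises an error if E:/meitulu is missing or if the model's folder is left over from an earlier run. Below is a minimal pre-flight sketch; the check_target helper and its base parameter are names I made up for illustration, not part of the original script. It creates the base folder if needed and stops early when the target folder already exists:

import os

def check_target(model_url, base=r"E://meitulu"):
    # Hypothetical helper: make sure the base folder exists and the
    # model's folder does not, so papapa() can call os.mkdir() safely.
    os.makedirs(base, exist_ok=True)
    target = os.path.join(base, model_url.split('/')[-2])
    if os.path.exists(target):
        raise FileExistsError('Remove or rename the existing folder first: ' + target)

check_target("https://www.meitulu.com/t/xiameijiang/")
papapa("https://www.meitulu.com/t/xiameijiang/")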
Result screenshots (images omitted here).