import requests
from bs4 import BeautifulSoup
import re
import os
def get_url_list(url):
    """Return the list of album URLs found on one listing page."""
    response = requests.get(url).text
    soup = BeautifulSoup(response, 'html.parser')
    href1 = re.compile('https://www.meitulu.com/item/')
    url_list = []
    for class_name in soup.find_all('a', href=href1):
        ele = class_name.get('href')
        if ele not in url_list:  # deduplicate: each album is linked more than once
            url_list.append(ele)
    # url_list holds one URL per album on this page
    for item_url in url_list:
        print(item_url)
    return url_list
def img_num_list(url_list):
    """Return the photo count of each album in url_list."""
    num_list = []
    for name in url_list:
        response = requests.get(name)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # '图片数量' is the "number of pictures" label on the album page
        str1 = soup.find_all(string=re.compile('图片数量'))
        # '[0-9]+' instead of '[0-9][0-9]': counts can have one or three-plus digits
        num = int(re.findall('[0-9]+', str1[0])[0])
        num_list.append(num)
    return num_list
def img_list(url, num):
    """Return the image URLs of one album (the site shows 4 images per page)."""
    if num % 4 == 0:
        last_page = num // 4
    else:
        last_page = num // 4 + 1
    imglist = []
    # use 'html' as the variable name so the re module is not shadowed
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser').center
    # page 1 has a special URL with no page suffix
    for kk in soup.find_all('img'):
        imglist.append(kk.get('src'))
    # pages 2 and later: .../item/NNNN_i.html
    for i in range(2, last_page + 1):
        # str.rstrip('.html') strips a character set, not the suffix; slice instead
        page_url = url[:-len('.html')] + f'_{i}.html'
        html = requests.get(page_url).text
        soup = BeautifulSoup(html, 'html.parser').center
        for k in soup.find_all('img'):
            imglist.append(k.get('src'))
    return imglist
def save_image_list(imglist, url):
    """Download and save one album's images."""
    for i in imglist:
        img_name = i.split('/')[-2] + '-' + i.split('/')[-1]
        path = r"E:/meitulu/%s/%s" % (url.split('/')[-2], img_name)
        r = requests.get(i)
        with open(path, 'wb') as f:  # 'with' closes the file even if a write fails
            f.write(r.content)
def other_pages(url):
    """Return True if the listing page still exists."""
    response = requests.get(url)  # 'response', not 're': don't shadow the re module
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    # the site uses this title ("the page you requested has been deleted
    # or does not exist") for missing pages
    return soup.title.string != "美图录-您访问的信息已删除或不存在"
def papapa(url):
    file_name = url.split('/')[-2]
    os.mkdir(f"E:/meitulu/{file_name}")
    count2 = 2
    while other_pages(url):
        url_list = get_url_list(url)
        num_list = img_num_list(url_list)
        count = 0
        for count3 in url_list:
            try:
                presave_img_list = img_list(count3, num_list[count])
                save_image_list(presave_img_list, url)
            except Exception:  # a bare 'except' would also swallow KeyboardInterrupt
                print('scrape failed ' + url + ' ' + count3)
            finally:
                count += 1
        # advance to the next listing page: .../t/name/ -> .../t/name/2.html -> 3.html ...
        if url.split('/')[-1] == f"{count2}.html":
            url = url.replace(f"{count2}.html", f"{count2 + 1}.html")
            count2 += 1
        else:
            url = url + f'{count2}.html'
papapa("https://www.meitulu.com/t/xiameijiang/")
A newbie here: I wrote a small crawler, and feedback from the experts is welcome.
How to use: open the site from the example, go to a model's homepage, and pass the URL of the first page of her homepage to papapa() to download all of her photos.
Note: the script creates a folder on the E: drive, so a folder with the same name must not already exist.
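Two fragile spots worth hardening: os.mkdir() raises FileExistsError when the folder already exists, and some image hosts reject requests that lack browser-like headers. A sketch of both fixes; whether this particular site checks User-Agent or Referer is an assumption, not something verified here:

import os
import requests

# Tolerate a pre-existing folder instead of crashing with FileExistsError.
os.makedirs("E:/meitulu/xiameijiang", exist_ok=True)

# Shared session with browser-like headers (assumed, not verified, to help here).
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.meitulu.com/',
})

img_url = "https://example.com/1.jpg"  # placeholder; use a URL from img_list()
resp = session.get(img_url)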