python爬取某网站图片

chen971130 · 发表于 2020-4-10 22:03:53

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

import requests
from bs4 import BeautifulSoup
import re
import os
def get_url_list(url):
    """Return the gallery links found on one listing page.

    Downloads ``url``, collects the ``href`` of every ``<a>`` tag whose
    href matches the gallery-item pattern, drops repeated links while
    keeping the order of first appearance, prints each link and returns
    the resulting list.
    """
    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    item_pattern = re.compile('https://www.meitulu.com/item/')
    hrefs = (anchor.get('href')
             for anchor in page.find_all('a', href=item_pattern))
    # dict.fromkeys keeps only the first occurrence of each link, in order.
    galleries = list(dict.fromkeys(hrefs))
    for link in galleries:
        print(link)
    return galleries
def img_num_list(url_list):
    """Return the picture count of every gallery in ``url_list``.

    Each gallery page is downloaded, decoded with the encoding detected
    from its content, and the first text node containing '图片数量' is
    searched for its first run of digits.

    Args:
        url_list: iterable of gallery page URLs.

    Returns:
        A list of ints, one per URL, in the same order as ``url_list``.

    Raises:
        ValueError: if a page has no '图片数量' text or that text holds
            no digits.
    """
    count_label = re.compile('图片数量')
    # One or more digits: the previous two-digit-only pattern failed on
    # counts below 10 and truncated counts of 100 or more.
    digits = re.compile(r'\d+')
    num_list = []
    for name in url_list:
        response = requests.get(name)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        labels = soup.find_all(string=count_label)
        found = digits.search(labels[0]) if labels else None
        if found is None:
            raise ValueError('no picture count found on ' + name)
        num_list.append(int(found.group()))
    return num_list
def img_list(url, num):
    """Return the image URLs of one gallery.

    The first page is ``url`` itself; page ``i`` (i >= 2) is the same
    address with ``_i`` inserted before the ``.html`` suffix. The number
    of pages is ``num`` divided by four, rounded up.

    Args:
        url: address of the gallery's first page.
        num: number of pictures in the gallery.

    Returns:
        A list with the ``src`` of every ``<img>`` inside the first
        ``<center>`` element of each page, in page order.

    Raises:
        AttributeError: if a page has no ``<center>`` element.
    """
    # Ceiling division without importing math.
    last_page = -(-num // 4)

    # Remove the literal '.html' suffix. str.rstrip('.html') strips a
    # *set of characters* and could eat the tail of the gallery id.
    suffix = '.html'
    base = url[:-len(suffix)] if url.endswith(suffix) else url

    page_urls = [url]
    page_urls.extend(f'{base}_{i}{suffix}' for i in range(2, last_page + 1))

    imglist = []
    for page_url in page_urls:
        html = requests.get(page_url).text
        center = BeautifulSoup(html, 'html.parser').center
        for img in center.find_all('img'):
            imglist.append(img.get('src'))
    return imglist
def save_image_list(imglist, url):
    """Download every image in ``imglist`` and write it to disk.

    Files go to ``E://meitulu//<folder>//<name>``, where ``<folder>`` is
    the second-to-last '/'-separated segment of ``url`` and ``<name>`` is
    the last two segments of the image URL joined with '-'.

    Args:
        imglist: iterable of image URLs.
        url: listing-page URL that determines the target folder.

    Returns:
        None.
    """
    for i in imglist:
        segments = i.split('/')
        img_name = segments[-2] + '-' + segments[-1]
        # Download before opening the file so that a failed request does
        # not leave an empty file (or an unclosed handle) behind.
        r = requests.get(i)
        target = r"E://meitulu//%s//%s" % (url.split('/')[-2], img_name)
        with open(target, 'wb') as f:
            f.write(r.content)
def other_pages(url):
    """Tell whether ``url`` points at an existing page.

    Fetches the page, decodes it with the encoding detected from its
    content and compares the ``<title>`` text with the site's
    "deleted or does not exist" title.

    Returns:
        True when the title differs from that error title, else False.
    """
    page = requests.get(url)
    page.encoding = page.apparent_encoding
    title = BeautifulSoup(page.text, 'html.parser').title.string
    return title != "美图录-您访问的信息已删除或不存在"
def papapa(url):
    """Crawl every listing page starting at ``url`` and save all galleries.

    A folder ``E://meitulu//<name>`` is created first, where ``<name>`` is
    the second-to-last '/'-separated segment of ``url``. ``os.mkdir``
    raises if that folder already exists or its parent is missing.

    Pages are visited as ``url``, then ``url`` + '2.html', '3.html', ...
    until ``other_pages`` reports that the page does not exist. A gallery
    that fails is reported on stdout and skipped.

    Args:
        url: address of the first listing page (expected to end with '/').
    """
    file_name = url.split('/')[-2]
    os.mkdir(f"E://meitulu//{file_name}")
    count2 = 2
    while other_pages(url):
        url_list = get_url_list(url)
        num_list = img_num_list(url_list)
        for count3, num in zip(url_list, num_list):
            try:
                presave_img_list = img_list(count3, num)
                save_image_list(presave_img_list, url)
            except Exception:
                # Not a bare ``except``: Ctrl-C (KeyboardInterrupt) and
                # SystemExit must still be able to stop the crawl.
                print('爬虫失败 ' + url + ' ' + count3)
        last_segment = url.split('/')[-1]
        if last_segment == f"{count2}.html":
            # Rewrite only the trailing page segment; str.replace would
            # also touch an identical substring earlier in the URL.
            url = url[:-len(last_segment)] + f"{count2 + 1}.html"
            count2 += 1
        else:
            url = url + f'{count2}.html'
# Script entry point: start crawling from this listing page.
papapa("https://www.meitulu.com/t/xiameijiang/")

复制代码

菜鸟写了一个小爬虫,请各位大佬指教
使用方法：打开示例中的网站，进入某一位模特的主页，把该主页第一页的 url 以字符串形式传给函数 papapa()，就可以获得她的所有图片。
注意：程序会在 E 盘的 meitulu 目录下新建一个以模特名命名的文件夹；E:/meitulu 目录必须事先存在，且其中不能有同名文件夹，否则 os.mkdir 会报错。

chen971130 · 发表于 2020-4-12 08:54:16

本帖最后由 chen971130 于 2020-4-12 09:11 编辑

更新了一下代码，这次不会每次都判断该页url是否存在了，效率应该会高一些。
使用注意事项:
（1）要安装所有使用的库
（2）要先手动在E盘下新建名为 meitulu 的文件夹。每次执行程序会在该文件夹下自动创建以模特名命名的文件夹，所以每次运行前要检查是否已有同名文件夹，有的话要先删除。如例中为“E://meitulu//nixiaoyao”：如果中途失败，再次爬取同一模特的图片时，要保证‘E://meitulu//’下没有名为‘nixiaoyao’的文件夹。
（3）使用方法：进入https://www.meitulu.com/，选择一位喜欢的模特，点击她的名字进入她的主页，将该页的网址填入主函数（注意是字符串）。

import requests
from bs4 import BeautifulSoup
import re
import os
def get_url_page_list(url):
    """Return the listing-page URLs linked from ``url``.

    Collects the ``href`` of every ``<a>`` inside the first ``<center>``
    element of the page, skipping anchors without an href and repeated
    links. When no link is found the result is ``[url]``.
    """
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    pages = []
    for anchor in soup.center.select('a'):
        href = anchor.get('href')
        if href is None or href in pages:
            continue
        pages.append(href)
    # No pagination links: the given page is the only one.
    return pages if pages else [url]
def get_url_list(url):
    """Return the gallery URLs linked from one listing page.

    Every ``<a>`` whose href matches the gallery-item pattern contributes
    its href once, in order of first appearance. Each collected link is
    printed before the list is returned.
    """
    html = requests.get(url).text
    item_link = re.compile('https://www.meitulu.com/item/')
    anchors = BeautifulSoup(html, 'html.parser').find_all('a', href=item_link)

    url_list = []
    for anchor in anchors:
        href = anchor.get('href')
        if href in url_list:
            continue
        url_list.append(href)

    # Echo every gallery found on this page.
    for gallery in url_list:
        print(gallery)
    return url_list
def get_img_list(url):
    """Return the URLs of all images in one gallery.

    The first page is ``url``; page ``i`` (i >= 2) is the same address
    with ``_i`` inserted before the ``.html`` suffix. The page count comes
    from ``get_pages(url)``.

    Args:
        url: address of the gallery's first page.

    Returns:
        A list with the ``src`` of every ``<img>`` inside the first
        ``<center>`` element of each page, in page order.

    Raises:
        AttributeError: if a page has no ``<center>`` element.
    """
    def page_images(page_url):
        # Image sources inside the first <center> element of one page.
        center = BeautifulSoup(requests.get(page_url).text, 'html.parser').center
        return [img.get('src') for img in center.find_all('img')]

    # The first page has no page suffix in its address.
    imglist = page_images(url)

    max_page = get_pages(url)

    # Remove the literal '.html' suffix. str.rstrip('.html') strips a
    # *set of characters* and could eat the tail of the gallery id.
    suffix = '.html'
    base = url[:-len(suffix)] if url.endswith(suffix) else url

    # Build each page address directly instead of chaining str.replace
    # on the previous page's URL.
    for i in range(2, max_page + 1):
        imglist.extend(page_images(f'{base}_{i}{suffix}'))
    return imglist
def save_img(imglist, url):
    """Download every image in ``imglist`` and write it to disk.

    Files go to ``E://meitulu//<folder>//<name>``, where ``<folder>`` is
    the second-to-last '/'-separated segment of ``url`` and ``<name>`` is
    the last two segments of the image URL joined with '-'.

    Args:
        imglist: iterable of image URLs.
        url: model-page URL that determines the target folder.

    Returns:
        None.
    """
    for i in imglist:
        segments = i.split('/')
        img_name = segments[-2] + '-' + segments[-1]
        # Download before opening the file so that a failed request does
        # not leave an empty file (or an unclosed handle) behind.
        r = requests.get(i)
        target = r"E://meitulu//%s//%s" % (url.split('/')[-2], img_name)
        with open(target, 'wb') as f:
            f.write(r.content)
def get_pages(url):
    """Return the number of pages of the gallery at ``url``.

    The page is downloaded and decoded with the encoding detected from
    its content. Every run of digits in the text of the second
    ``<center>`` element is read as a number, and the largest is returned.

    Args:
        url: address of the gallery's first page.

    Returns:
        The largest integer found in the second ``<center>`` element.

    Raises:
        IndexError: if the page has fewer than two ``<center>`` elements.
        ValueError: if that element contains no digits.
    """
    res = requests.get(url)
    res.encoding = res.apparent_encoding
    centers = BeautifulSoup(res.text, "html.parser").find_all('center')
    # Raw string and '\d+': the previous "\d\d" + "\d" pair used invalid
    # escape sequences and split numbers of three or more digits.
    numbers = [int(text) for text in re.findall(r"\d+", centers[1].text)]
    return max(numbers)
def main(url):
    """Download every gallery reachable from the model page ``url``.

    A folder ``E://meitulu//<name>`` is created first, where ``<name>`` is
    the second-to-last '/'-separated segment of ``url``. ``os.mkdir``
    raises if that folder already exists or its parent is missing.

    Each listing page returned by ``get_url_page_list`` is scanned for
    galleries, and each gallery's images are fetched and saved. A gallery
    that fails is reported on stdout and skipped.

    Args:
        url: address of the model's page (expected to end with '/').
    """
    file_name = url.split('/')[-2]
    os.mkdir(f"E://meitulu//{file_name}")
    for page_url in get_url_page_list(url):
        for count3 in get_url_list(page_url):
            try:
                presave_img_list = get_img_list(count3)
                save_img(presave_img_list, url)
            except Exception:
                # Not a bare ``except``: Ctrl-C (KeyboardInterrupt) and
                # SystemExit must still be able to stop the crawl.
                print('爬虫失败 ' + url + ' ' + count3)
# Script entry point: start crawling from this model page.
main("https://www.meitulu.com/t/nixiaoyao/")

复制代码

神盾369 · 发表于 2020-4-11 00:06:37

为什么我复制楼主的代码总是会出现各种各样的Bug

乘号 · 发表于 2020-4-11 09:03:37

为什么是新人报道。。。不应该是Python交流吗

乘号 · 发表于 2020-4-11 09:04:09

神盾369 发表于 2020-4-11 00:06
为什么我复制楼主的代码总是会出现各种各样的Bug

你应该是没有安装requests

chen971130 · 发表于 2020-4-12 09:13:22

神盾369 发表于 2020-4-11 00:06
为什么我复制楼主的代码总是会出现各种各样的Bug

我更新了代码和使用事项，你试试吧，记得给反馈哦

chen971130 · 发表于 2020-4-12 09:14:42

乘号发表于 2020-4-11 09:03
为什么是新人报道。。。不应该是Python交流吗

哈哈哈我也不知道在哪儿发，就随便找了个新手区，python交流那儿图片没有审核过

zcyr1121 · 发表于 2020-4-12 09:53:59

为什么学习python，除了脱发，还影响身体？

chen971130 · 发表于 2020-4-13 10:40:37

zcyr1121 发表于 2020-4-12 09:53
为什么学习python，除了脱发，还影响身体？

双重打击

账号		自动登录	找回密码
密码			立即注册

[萌新报道] python爬取某网站图片

马上注册，结交更多好友，享用更多功能^_^

浏览过的版块