python爬取某网站图片,新手乐园,技术交流,鱼C论坛

chen971130 发表于 2020-4-10 22:03:53

python爬取某网站图片

import requests
from bs4 import BeautifulSoup
import re
import os

def get_url_list(url):
    """Return the album links found on one listing page.

    Downloads ``url``, collects the ``href`` of every ``<a>`` tag whose link
    matches the album ("item") URL pattern, drops repeated links while keeping
    first-seen order, prints each link on its own line and returns the list.
    """
    page_html = requests.get(url).text
    document = BeautifulSoup(page_html, 'html.parser')
    album_pattern = re.compile('https://www.meitulu.com/item/')
    anchors = document.find_all('a', href=album_pattern)

    # dict.fromkeys keeps insertion order, so duplicates vanish but the
    # first occurrence of every link stays in place.
    url_list = list(dict.fromkeys(anchor.get('href') for anchor in anchors))

    # Echo the collected album links for the user.
    for album_url in url_list:
        print(album_url)

    return url_list

def img_num_list(url_list):
    """Return the picture count of every album in ``url_list``.

    For each album URL the page is downloaded and the first text node that
    contains "图片数量" is searched for the number that follows that label.

    Args:
        url_list: Album page URLs.

    Returns:
        A list of ints, one per entry of ``url_list`` and in the same order.
        An album whose count cannot be found contributes ``0`` so the result
        stays index-aligned with ``url_list``.
    """
    label_pattern = re.compile('图片数量')
    # Capture the first run of digits that follows the label.
    count_pattern = re.compile(r'图片数量\D*(\d+)')

    num_list = []
    for name in url_list:
        response = requests.get(name)
        # Let requests guess the charset so the Chinese label decodes properly.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # find() yields a single string (or None); the previous code handed the
        # whole find_all() result list to re.findall, which raises TypeError.
        label = soup.find(string=label_pattern)
        match = count_pattern.search(label) if label else None
        num_list.append(int(match.group(1)) if match else 0)
    return num_list

def img_list(url, num, per_page=4):
    """Return the image URLs of one album.

    The first album page is ``url`` itself; page ``i`` (``i >= 2``) is reached
    by inserting ``_{i}`` in front of the ``.html`` suffix.

    Args:
        url: URL of the album's first page.
        num: Total number of pictures in the album.
        per_page: Pictures shown per page; defaults to 4, the value that was
            previously hard-coded.

    Returns:
        A list with the ``src`` of every ``<img>`` inside the ``<center>``
        element of each page, in page order. Pages without a ``<center>``
        element and images without a ``src`` are skipped.
    """
    # Ceiling division: a partially filled last page still counts as a page.
    last_page = (num + per_page - 1) // per_page

    # str.rstrip('.html') strips any trailing '.', 'h', 't', 'm', 'l'
    # characters rather than the suffix, so cut the suffix off explicitly.
    suffix = '.html'
    base_url = url[:-len(suffix)] if url.endswith(suffix) else url

    page_urls = [url]
    page_urls.extend(f'{base_url}_{i}{suffix}' for i in range(2, last_page + 1))

    imglist = []
    for page_url in page_urls:
        page_html = requests.get(page_url).text
        center = BeautifulSoup(page_html, 'html.parser').center
        if center is None:
            # Nothing to harvest on this page; keep what the others provide.
            continue
        for img in center.find_all('img'):
            src = img.get('src')
            if src:
                imglist.append(src)
    return imglist

def save_image_list(imglist, url):
    """Download every image in ``imglist`` into the folder derived from ``url``.

    Args:
        imglist: Image URLs to download.
        url: Listing-page URL; its second-to-last ``/``-separated segment is
            used as the sub-folder name under ``E://meitulu``.

    Raises:
        requests.HTTPError: If an image request returns an error status.
        OSError: If the target file cannot be written.
    """
    folder = url.split('/')[-2]
    for image_url in imglist:
        parts = image_url.split('/')
        # Prefix the file name with its parent path segment so that files
        # from different albums do not collide.
        img_name = parts[-2] + '-' + parts[-1]

        # Download before opening the file so that a failed request does not
        # leave an empty file behind, and do not store an error page as image.
        response = requests.get(image_url)
        response.raise_for_status()

        with open(r"E://meitulu//%s//%s" % (folder, img_name), 'wb') as f:
            f.write(response.content)

def other_pages(url):
    """Tell whether ``url`` points to an existing page.

    The page is downloaded and its ``<title>`` text is compared with the
    title the site uses for deleted or missing pages.

    Returns:
        ``True`` when the title differs from the "not found" title,
        ``False`` otherwise.
    """
    response = requests.get(url)
    # Use the detected charset so the Chinese title is decoded correctly.
    response.encoding = response.apparent_encoding
    page_title = BeautifulSoup(response.text, 'html.parser').title.string
    return page_title != "美图录-您访问的信息已删除或不存在"

def papapa(url):
    """Download every album reachable from a listing page and its follow-ups.

    Creates ``E://meitulu//<name>`` (``<name>`` is the second-to-last segment
    of ``url``), then walks the listing pages ``url``, ``url`` + ``2.html``,
    ``3.html``, ... until ``other_pages`` reports a page as missing. A failure
    in one album is reported and the crawl continues with the next album.

    Args:
        url: URL of the first listing page; expected to end with ``/``.
    """
    file_name = url.split('/')[-2]
    # makedirs also creates the parent folder and tolerates an existing target.
    os.makedirs(f"E://meitulu//{file_name}", exist_ok=True)

    count2 = 2  # number of the next listing page to visit
    while other_pages(url):
        url_list = get_url_list(url)
        num_list = img_num_list(url_list)
        # Pair each album with its own picture count; passing the whole
        # num_list to img_list made every album fail.
        for count3, num in zip(url_list, num_list):
            try:
                presave_img_list = img_list(count3, num)
                save_image_list(presave_img_list, url)
            except Exception as exc:
                # Best effort: report the failing album and keep going.
                print('爬虫失败 ' + url + ' ' + count3, exc)

        current_suffix = f"{count2}.html"
        if url.split('/')[-1] == current_suffix:
            # Already on a numbered page: swap only the trailing page number.
            url = url[:-len(current_suffix)] + f"{count2 + 1}.html"
            count2 += 1
        else:
            # Still on the first (unnumbered) page: move on to page 2.
            url = url + current_suffix

# Script entry point: start the crawl from this listing-page URL.
papapa("https://www.meitulu.com/t/xiameijiang/")

菜鸟写了一个小爬虫,请各位大佬指教
食用方法:进入例子中的网站,进入某一位模特的主页,将其主页中第一页的url输入函数papapa(),就可以获得她的所有图片.
注意,会在E盘创建一个文件夹,不能有重名文件夹哦.

chen971130 发表于 2020-4-12 08:54:16

本帖最后由 chen971130 于 2020-4-12 09:11 编辑

更新了一下代码，这次不会每次都判断该页url是否存在了，效率应该会高一些。
使用注意事项:
（1）要安装所有使用的库
（2）要先手动在E盘下新建名为meitulu的文件夹。每次执行程序会在该文件夹下自动创建模特名的文件夹，每次运行要查看是否有重名文件，有的话要删除。如例中为“E://meitulu//nixiaoyao”，如果中途失败，再次爬取同一模特图片的话，要保证‘E://meitulu//’下没有名为‘nixiaoyao’的文件夹。
（3）使用方法：进入https://www.meitulu.com/，选择一位喜欢的模特，点击她的名字进入她的主页，将该页的网址填入主函数（注意是字符串）。
import requests
from bs4 import BeautifulSoup
import re
import os

def get_url_page_list(url):
    """Return the URLs of all listing pages linked from ``url``.

    The links are read from the ``<a>`` tags inside the page's ``<center>``
    element, de-duplicated in first-seen order.

    Args:
        url: URL of the first listing page.

    Returns:
        The list of linked page URLs, or ``[url]`` when the page has no
        ``<center>`` element or that element contains no links.
    """
    page_html = requests.get(url).text
    center = BeautifulSoup(page_html, 'html.parser').center

    url_list = []
    if center is not None:
        for anchor in center.select('a'):
            url1 = anchor.get('href')
            # Skip anchors without an href and links already collected.
            if url1 is not None and url1 not in url_list:
                url_list.append(url1)

    if not url_list:
        # Single-page listing: the page itself is the only one to crawl.
        url_list.append(url)
    return url_list

def get_url_list(url):
    """Return the URLs of all albums linked from one listing page.

    Every ``<a>`` tag whose ``href`` matches the album ("item") URL pattern is
    considered; each distinct link is kept once, in first-seen order. The
    links are printed one per line before being returned.
    """
    document = BeautifulSoup(requests.get(url).text, 'html.parser')
    album_pattern = re.compile('https://www.meitulu.com/item/')

    url_list = []
    for anchor in document.find_all('a', href=album_pattern):
        link = anchor.get('href')
        if link in url_list:
            # Already collected from an earlier anchor.
            continue
        url_list.append(link)

    # Show the albums found on this page.
    for link in url_list:
        print(link)
    return url_list

def get_img_list(url):
    """Return the URLs of all images in one album.

    The first album page is ``url``; page ``i`` (``i >= 2``) is reached by
    inserting ``_{i}`` in front of the ``.html`` suffix. The number of pages
    comes from ``get_pages``.

    Args:
        url: URL of the album's first page.

    Returns:
        A list with the ``src`` of every ``<img>`` inside the ``<center>``
        element of each page, in page order. Pages without a ``<center>``
        element and images without a ``src`` are skipped.
    """
    max_page = get_pages(url)

    # str.rstrip('.html') strips any trailing '.', 'h', 't', 'm', 'l'
    # characters rather than the suffix, so cut the suffix off explicitly.
    suffix = '.html'
    base_url = url[:-len(suffix)] if url.endswith(suffix) else url

    # Build each page URL directly instead of rewriting the previous one with
    # str.replace, which would also touch any other matching substring.
    page_urls = [url]
    page_urls.extend(f'{base_url}_{i}{suffix}' for i in range(2, max_page + 1))

    imglist = []
    for page_url in page_urls:
        page_html = requests.get(page_url).text
        center = BeautifulSoup(page_html, 'html.parser').center
        if center is None:
            # Nothing to harvest on this page; keep what the others provide.
            continue
        for img in center.find_all('img'):
            src = img.get('src')
            if src:
                imglist.append(src)
    return imglist

def save_img(imglist, url):
    """Download every image in ``imglist`` into the folder derived from ``url``.

    Args:
        imglist: Image URLs to download.
        url: Listing-page URL; its second-to-last ``/``-separated segment is
            used as the sub-folder name under ``E://meitulu``.

    Raises:
        requests.HTTPError: If an image request returns an error status.
        OSError: If the target file cannot be written.
    """
    folder = url.split('/')[-2]
    for image_url in imglist:
        parts = image_url.split('/')
        # Prefix the file name with its parent path segment so that files
        # from different albums do not collide.
        img_name = parts[-2] + '-' + parts[-1]

        # Download before opening the file so that a failed request does not
        # leave an empty file behind, and do not store an error page as image.
        response = requests.get(image_url)
        response.raise_for_status()

        with open(r"E://meitulu//%s//%s" % (folder, img_name), 'wb') as f:
            f.write(response.content)

def get_pages(url):
    """Return the number of pages of one album.

    The page count is taken to be the largest integer that appears in the
    text of the page's ``<center>`` elements.

    Args:
        url: URL of the album's first page.

    Returns:
        The largest integer found, or ``1`` when the text contains no digits.
    """
    res = requests.get(url)
    res.encoding = res.apparent_encoding
    centers = BeautifulSoup(res.text, "html.parser").find_all('center')
    # find_all() returns a list-like ResultSet without a .text attribute, so
    # gather the text tag by tag. The ' ' separator keeps numbers that sit in
    # neighbouring tags from running together.
    text = ' '.join(tag.get_text(' ') for tag in centers)
    # r"\d+" reads each number whole, whatever its length.
    page_numbers = [int(digits) for digits in re.findall(r"\d+", text)]
    return max(page_numbers, default=1)

def main(url):
    """Download every album of the listing that starts at ``url``.

    Creates ``E://meitulu//<name>`` (``<name>`` is the second-to-last segment
    of ``url``), resolves all listing pages with ``get_url_page_list`` and
    saves the images of every album found on them. A failure in one album is
    reported and the crawl continues with the next album.

    Args:
        url: URL of the first listing page; expected to end with ``/``.
    """
    file_name = url.split('/')[-2]
    # makedirs also creates the parent folder and tolerates an existing target,
    # so an interrupted run can simply be started again.
    os.makedirs(f"E://meitulu//{file_name}", exist_ok=True)

    for page_url in get_url_page_list(url):
        for count3 in get_url_list(page_url):
            try:
                presave_img_list = get_img_list(count3)
                save_img(presave_img_list, url)
            except Exception as exc:
                # Best effort: report the failing album and keep going.
                print('爬虫失败 ' + url + ' ' + count3, exc)

# Script entry point: start the crawl from this listing-page URL.
main("https://www.meitulu.com/t/nixiaoyao/")

神盾369 发表于 2020-4-11 00:06:37

为什么我复制楼主的代码总是会出现各种各样的Bug

乘号 发表于 2020-4-11 09:03:37

为什么是新人报道。。。不应该是Python交流吗

乘号 发表于 2020-4-11 09:04:09

神盾369 发表于 2020-4-11 00:06
为什么我复制楼主的代码总是会出现各种各样的Bug

你应该是没有安装requests

chen971130 发表于 2020-4-12 09:13:22

神盾369 发表于 2020-4-11 00:06
为什么我复制楼主的代码总是会出现各种各样的Bug

我更新了代码和使用事项，你试试吧，记得给反馈哦

chen971130 发表于 2020-4-12 09:14:42

乘号 发表于 2020-4-11 09:03
为什么是新人报道。。。不应该是Python交流吗

哈哈哈我也不知道在哪儿发，就随便找了个新手区，python交流那儿图片没有审核过

zcyr1121 发表于 2020-4-12 09:53:59

为什么学习python，除了脱发，还影响身体？

chen971130 发表于 2020-4-13 10:40:37

zcyr1121 发表于 2020-4-12 09:53
为什么学习python，除了脱发，还影响身体？

双重打击

页: [1]

鱼C论坛's Archiver

python爬取某网站图片