Background: I am trying to scrape the images from https://pic.yxdown.com/. The goal is that each gallery series gets its own folder holding all of that theme's images. But when I run the script, nothing happens at all, and I can't tell where the problem is.
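Put differently, the goal is one folder per gallery series with that series' images saved inside it. A minimal sketch of the intended layout (the series names and image URLs here are made-up examples, only to show the target structure; the full attempt follows below):

import os

# hypothetical example data: one entry per gallery/series
series = {
    "SeriesA": ["https://example.com/a1.jpg", "https://example.com/a2.jpg"],
    "SeriesB": ["https://example.com/b1.jpg"],
}
for title, image_urls in series.items():
    os.makedirs(title, exist_ok=True)  # one folder per series
    for url in image_urls:
        # where each downloaded image would end up
        print(os.path.join(title, os.path.basename(url)))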
import json
import os.path
import re
import urllib
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import logging
import random
from pathlib import Path
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
agent_list = ["Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"]
MAX_FILENAME_LENGTH = 255
def sanitize_path(path):
    # return os.path.abspath(os.path.expanduser(path))
    return Path(path).expanduser().resolve()
def sanitize_directory_name(name):
    name = re.sub(r'[\\/*? :"<>|]', '', name)
    # max_length = 255
    if len(name) > MAX_FILENAME_LENGTH:
        name = name[:MAX_FILENAME_LENGTH]
    return name
def download_image(img_url, img_save_path):
    try:
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        with open(img_save_path, 'wb') as f:
            f.write(response.content)
        logging.info('Image saved: %s', img_save_path)
    except requests.RequestException as e:
        logging.error('Error while downloading image: %s', e)
def choice_agent(agent_list):
    return random.choice(agent_list)
def get_html(url, headers):
    """
    Fetch a page and return its text.
    Shared helper used for every URL request.
    :param url:
    :param headers:
    :return:
    """
    try:
        headers = {'User-Agent': choice_agent(headers)}
        html = requests.get(url, headers=headers, timeout=10)
        html.raise_for_status()
        html.encoding = 'utf-8'
        logging.info('get_html: ' + url)
        logging.info('Fetched page: ' + url)
        print(html.text)
        return html.text
    except requests.RequestException as e:
        logging.error('Request error: %s', e)
        return None
def get_all_serials_image_info(url, headers):
    """
    Collect the gallery (series) links on the current page; each one leads to a detail page.
    :param url:
    :param headers:
    :return:
    """
    try:
        headers = {'User-Agent': choice_agent(headers)}
        html = get_html(url, headers=headers)
        if not html:
            logging.error('Failed to fetch the page')
            return None
        soup = BeautifulSoup(html, 'html.parser')
        all_img_info = soup.find_all('div', class_='cbmiddle')
        all_serials_image_info = []  # the gallery (series) URLs found on the current page
        data_list = []
        for img_info in all_img_info:
            img_cover = img_info.find('img')
            img_title = img_info.find('b', class_='imagname').text
            img_viewer = img_info.find('span').text
            img_quantity = img_info.find('em').text
            img_url = "https://pic.yxdown.com/" + img_info.get('href')
            all_serials_image_info.append(img_url)
            data_list.append({
                'img_cover': img_cover,
                'img_title': img_title,
                'img_viewer': img_viewer,
                'img_quantity': img_quantity,
                'img_url': img_url
            })
        print(data_list)
        print(all_serials_image_info)
        return data_list, all_serials_image_info
    except Exception as e:
        logging.error('Error while collecting image info: %s', e)
        return None
def get_every_serial_info(url, headers):
    try:
        headers = {'User-Agent': choice_agent(headers)}
        data_list, all_serials_image_info = get_all_serials_image_info(url, headers=headers)
        if not data_list:
            logging.error('No image info found')
            return
        with ThreadPoolExecutor(max_workers=10) as executor:
            logging.info('Starting image downloads')
            futures = []
            for data in data_list:
                img_title = sanitize_directory_name(data['img_title'])
                if not os.path.exists(img_title):
                    os.makedirs(img_title)
                for i in range(1, int(len(data['img_quantity'])) + 1):
                    each_img_html = data['img_url'] + '#p=' + str(i)
                    html = get_html(each_img_html, headers=headers)
                    if not html:
                        continue
                    soup = BeautifulSoup(html, 'html.parser')
                    each_img_url = soup.find('img')['src']
                    image_name = os.path.basename(each_img_url)
                    img_save_path = os.path.join(img_title, image_name)
                    futures.append(executor.submit(download_image, each_img_html, img_save_path))
            for future in futures:
                future.result()
    except requests.RequestException as e:
        logging.error('Request error: %s', e)
    except Exception as e:
        logging.error('Other error: %s', e)
    except requests.HTTPError as e:
        logging.error('HTTP error: %s, status code: %d', e, e.response.status_code)
    except requests.ConnectionError as e:
        logging.error('Connection error: %s', e)
if __name__ == ' main ':
    url = "https://pic.yxdown.com/"
    headers = agent_list
    data_list, all_serials_image_info = get_all_serials_image_info(url, headers=headers)
    for data in data_list:
        get_every_serial_info(data['img_url'], agent_list)
The problems are fairly significant (just my opinion). Not sure whether this is still in time; I saw the post at 10 a.m. and rushed as much as I could. I've made some fixes and it now runs and downloads images normally. I'm a beginner myself, so this probably isn't the best possible code.
A few notes on my understanding: get_every_serial_info has the biggest problems. While testing I found that appending #p= to the URL never changes the image URL that comes back; looking at the page source, it doesn't match what the browser shows, probably because the gallery is rendered with JavaScript. However, there is a script block, <script>var images = ..., that contains every image link for the current page, so I extract the URLs directly from that script (see the short sketch below). The overall approach also has a flaw: the same get_all_serials_image_info parsing was reused both for collecting the gallery URLs and for the later image downloads, but the list page and the detail pages don't have the same layout, so many of the lookups fail.
That's my take, hope it helps. The corrected code is below. Good luck to both of us~
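To make the fix concrete, here is a minimal, self-contained sketch of the idea: fetch one gallery page and pull the full-size image URLs out of the embedded <script>var images = ... block. The gallery URL below is only a hypothetical example; the "big" key is the one observed in the page source, as described above.

import re
import requests

# hypothetical example gallery page, for illustration only
gallery_url = "https://pic.yxdown.com/html/12345.html"
resp = requests.get(gallery_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
resp.raise_for_status()
# each entry of the embedded images array carries a "big" field with the original-size URL
big_urls = re.findall(r'"big":"(.*?)"', resp.text, re.S)
print(big_urls)

If this prints a non-empty list for a real gallery page, the regex approach used in the full script below works the same way.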
import json
import os.path
import re
import urllib
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import logging
import random
from pathlib import Path
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
agent_list = ["Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Avant Browser/1.2.789rel1 (url)",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10"]
MAX_FILENAME_LENGTH = 255
def sanitize_path(path):
# return os.path.abspath(os.path.expanduser(path))
return Path(path).expanduser().resolve()
def sanitize_directory_name(name):
name = re.sub(r'[\\/*? :"<>|]','',name)
# max_length = 255
if len(name)> MAX_FILENAME_LENGTH:
name = name[:MAX_FILENAME_LENGTH]
return name
def download_image(img_url, img_save_path):
    try:
        # print("downloading:", img_url, img_save_path)
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()
        with open(img_save_path, 'wb') as f:
            f.write(response.content)
        logging.info('Image saved: %s', img_save_path)
    except requests.RequestException as e:
        logging.error('Error while downloading image: %s', e)
def choice_agent(agent_list):
    return random.choice(agent_list)
def get_html(url, headers):
    """
    Fetch a page and return its text.
    Shared helper used for every URL request.
    :param url:
    :param headers:
    :return:
    """
    try:
        # headers = {'User-Agent': choice_agent(headers)}
        headers = headers
        # fix: the caller already passes in a complete headers dict, so there is no need to wrap it again
        # print(headers)
        html = requests.get(url, headers=headers, timeout=10)
        html.raise_for_status()
        html.encoding = 'utf-8'
        logging.info('get_html: ' + url)
        logging.info('Fetched page: ' + url)
        # print(html.text)
        return html.text
    except requests.RequestException as e:
        logging.error('Request error: %s', e)
        return None
def get_all_serials_image_info(url, headers):
    """
    Collect the gallery (series) links on the current page; each one leads to a detail page.
    :param url:
    :param headers:
    :return:
    """
    try:
        headers = {'User-Agent': choice_agent(headers)}
        # headers = headers
        html = get_html(url, headers=headers)
        if not html:
            logging.error('Failed to fetch the page')
            return None
        soup = BeautifulSoup(html, 'html.parser')
        all_img_info = soup.find_all('div', class_='cbmiddle')
        all_serials_image_info = []  # the gallery (series) URLs found on the current page
        data_list = []
        for img_info in all_img_info[1:]:
            # fix: the first cbmiddle block is the page header, not a gallery entry, so it has no img tag
            # print(img_info)
            img_cover = img_info.find('img')
            img_title = img_info.find('b', class_='imgname').text
            # fix: the class name was misspelled as 'imagname'
            img_viewer = img_info.find('span').text
            img_quantity = img_info.find('em').text
            # img_url = "https://pic.yxdown.com/" + img_info.get('href')
            img_url = "https://pic.yxdown.com/" + img_info.find("a", class_="proimg").get('href')
            # fix: img_info is the whole block of tags, so calling .get() on it directly finds nothing;
            # .get() reads a tag's attribute, so first locate the <a class="proimg"> tag and read its href
            all_serials_image_info.append(img_url)
            data_list.append({
                'img_cover': f'{img_cover}',
                'img_title': f'{img_title}',
                'img_viewer': f'{img_viewer}',
                'img_quantity': f'{img_quantity}',
                'img_url': f'{img_url}'
            })
        print(data_list)
        print(all_serials_image_info)
        return data_list, all_serials_image_info
    except Exception as e:
        logging.error('Error while collecting image info: %s', e)
        return None
def get_every_serial_info(url, headers, data):
    try:
        respon = requests.get(url, headers=headers)
        # I didn't reuse get_html here; calling that helper again felt unnecessary, though it would be fine for readability
        # idea: fetching each page with an appended page number kept returning the same image URL
        # (the rendered page shows the same), so the image URLs are taken from the embedded JS instead
        if respon.status_code == 200:
            # status code 200
            each_img_url = re.findall('\"big\":\"(.*?)\"', respon.text, re.S)
            # regex: collect every "big" image URL on the page into a list
            # print(each_img_url)
            with ThreadPoolExecutor(max_workers=10) as executor:
                logging.info('Starting image downloads')
                img_title = sanitize_directory_name(data['img_title'])
                if not os.path.exists(img_title):
                    os.makedirs(img_title)
                image_name = [os.path.basename(url) for url in each_img_url]
                # list comprehension: take the file name from each URL in the list
                img_save_path = [os.path.join(img_title, name) for name in image_name]
                executor.map(download_image, each_img_url, img_save_path)
                # map takes the two lists and feeds them to download_image one pair at a time
                # (see the small executor.map example after this listing)
                # the original code passed each_img_html here by mistake; it should be each_img_url
                # the thread pool is optional here; it may end up behaving much like a plain for loop
                # futures.append(executor.submit(download_image, each_img_html, img_save_path))
                # for future in futures:
                #     future.result()
    except requests.RequestException as e:
        logging.error('Request error: %s', e)
    except Exception as e:
        logging.error('Other error: %s', e)
    except requests.HTTPError as e:
        logging.error('HTTP error: %s, status code: %d', e, e.response.status_code)
    except requests.ConnectionError as e:
        logging.error('Connection error: %s', e)
if __name__ == '__main__':
    url = "https://pic.yxdown.com/"
    headers = agent_list
    data_list, all_serials_image_info = get_all_serials_image_info(url, headers=headers)
    for data in data_list:
        # print(data)
        get_every_serial_info(data['img_url'], {"User-Agent": choice_agent(agent_list)}, data)
        # data is passed straight in here so it can be used directly inside the function
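One small note on the executor.map call in get_every_serial_info: it walks the two lists in step, like zip, calling download_image(url, path) once per pair, with the calls distributed over the worker threads. A tiny stand-alone illustration, using a stand-in function and made-up values instead of the real downloader:

from concurrent.futures import ThreadPoolExecutor

def fake_download(img_url, img_save_path):
    # stand-in for download_image: just show which (url, path) pair each call receives
    print(f"would download {img_url} -> {img_save_path}")

urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]  # made-up URLs
paths = ["demo/a.jpg", "demo/b.jpg"]                               # made-up save paths
with ThreadPoolExecutor(max_workers=2) as executor:
    # one element is taken from each list per call, i.e. fake_download(urls[i], paths[i])
    list(executor.map(fake_download, urls, paths))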