[已解决]Mac下python爬虫多次失败

ten$1 · 发表于 2020-7-11 20:54:45

本帖最后由 ten$1 于 2020-7-11 22:21 编辑

第一次实验：

import urllib.request
from bs4 import BeautifulSoup
import os
def Download(url, picAlt, name, base_dir='D:\\pythonD爬虫妹子图\\'):
    """Download one picture into a per-album directory.

    The picture is stored as ``<base_dir>/<picAlt>/<name>.jpg``. The album
    directory is created on demand.

    Args:
        url: Address of the picture to fetch.
        picAlt: Album title; used as the name of the sub-directory.
        name: File name without the ``.jpg`` extension.
        base_dir: Root directory that holds all album directories. The
            default is the original hard-coded Windows location; on macOS
            or Linux pass a real directory, because backslashes are not
            path separators there.
    """
    # os.path.join keeps the directory that is created and the file that is
    # written in agreement on every platform; gluing the parts together with
    # '\\' only works on Windows.
    album_dir = os.path.join(base_dir, picAlt)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(album_dir, exist_ok=True)
    target = os.path.join(album_dir, '{0}.jpg'.format(name))
    urllib.request.urlretrieve(url, target)
# Browser-like HTTP request headers; run() attaches them to every page request.
header = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
    ),
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
def run(targetUrl, beginNUM, endNUM):
    """Follow a picture gallery page by page and download every picture.

    Each page is expected to contain a ``<div id="big-pic">`` whose ``<a>``
    wraps the picture and points to the next page, plus ``nowpage`` and
    ``totalpage`` spans with the position inside the current album.

    Args:
        targetUrl: Address of the first page to process.
        beginNUM: Number of pictures already processed (normally 0).
        endNUM: Stop once ``beginNUM`` has reached this value.

    The function also stops when a page has no ``big-pic`` block, no link,
    or an empty link.
    """
    base_url = "http://www.mmonly.cc/mmtp/qcmn/"

    # A loop replaces the original self-recursion (one stack frame per
    # picture), so large endNUM values cannot hit the recursion limit.
    # The stop condition is checked before the request, so no page is
    # fetched just to be thrown away.
    while beginNUM != endNUM:
        req = urllib.request.Request(url=targetUrl, headers=header)
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('gb2312', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        nowpage = soup.find('span', attrs={'class': 'nowpage'}).get_text()
        totalpage = soup.find('span', attrs={'class': 'totalpage'}).get_text()

        div = soup.find('div', attrs={'id': 'big-pic'})
        if div is None:
            return
        beginNUM += 1

        link = div.find("a")
        if link is None:
            print("没有下一张了")
            return
        # .get() returns None for a missing attribute; indexing would raise
        # KeyError and make the "is None" check unreachable.
        href = link.get('href')
        if not href:
            print("没有下一张了None")
            return

        print("下载信息：总进度：", beginNUM, "/", endNUM, " ，正在下载套图：(", nowpage, "/", totalpage, ")")
        if int(nowpage) < int(totalpage):
            # Before the last page of an album the href is appended to the
            # gallery base address.
            nextPageLink = base_url + href
        else:
            # On the last page the href is used unchanged. The plain else
            # also covers nowpage > totalpage, which previously left
            # nextPageLink undefined.
            nextPageLink = href

        img = link.find('img')
        picLink = img['src']
        picAlt = img['alt']
        print('下载的图片链接:', picLink)
        print('套图名：[ ', picAlt, ' ] ')
        print('开始下载...........')
        Download(picLink, picAlt, nowpage)
        print("下载成功！")
        print('下一页链接:', nextPageLink)
        targetUrl = nextPageLink
if __name__ == '__main__':
    # Script entry point: start at one gallery page and stop after 70 pictures.
    targetUrl = "http://www.mmonly.cc/mmtp/qcmn/237269.html"
    run(targetUrl, beginNUM=0, endNUM=70)
    print(" OVER")

复制代码

结果

Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 59, in <module>
run(targetUrl,beginNUM=0,endNUM=70)
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 21, in run
response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>

复制代码

登录/注册后可看大图

第二次：

import urllib.request
import os
import re
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    The request carries a browser-like User-Agent and a Referer header
    (presumably needed to get past the site's hotlink protection; verify
    against the site).

    Args:
        url: Address to fetch.

    Returns:
        The undecoded response body.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        # The host used to be misspelled "wwww".
        'Referer': 'http://www.mzitu.com',
    }
    req = urllib.request.Request(url, headers=headers)
    # The with-block closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_maxpage(url):
    """Return the highest page number of the picture group at *url*.

    The page is scanned for ``<span>`` tags whose text starts with digits;
    the digits of the last such tag are returned.

    Args:
        url: Address of the picture group.

    Returns:
        The page number as a string of digits.

    Raises:
        IndexError: If the page contains no ``<span>`` followed by digits.
    """
    html = url_open(url).decode('utf-8')
    # \d+ instead of \d{1,2}: groups with 100 or more pages used to be
    # truncated to their first two digits. The capture group replaces the
    # manual slicing past the '<span>' prefix.
    pages = re.findall(r'<span>(\d+)', html)
    if not pages:
        raise IndexError('no page number found at {0}'.format(url))
    return pages[-1]
def find_imgs(url):
    """Return the picture-group links found on the listing page at *url*.

    Args:
        url: Address of a listing page.

    Returns:
        A list of group addresses in order of first appearance, without
        duplicates.
    """
    html = url_open(url).decode('utf-8')
    # Dots are escaped so they match only a literal '.', and https links are
    # accepted as well as http ones.
    links = re.findall(r'https?://www\.mzitu\.com/\d{6}', html)
    # The same link can occur several times in one page; dict.fromkeys drops
    # the repeats while keeping order, so no group is downloaded twice.
    return list(dict.fromkeys(links))
def find_img(url, page):
    """Return the address of the first ``.jpg`` picture on one group page.

    The page at ``url + '/' + str(page)`` is searched for the first
    ``img src=`` attribute whose value ends in ``.jpg`` and is directly
    followed by an ``alt`` attribute.

    Args:
        url: Address of the picture group.
        page: Page number inside the group.

    Returns:
        The picture address as a string (a single address, not a list).

    Raises:
        IndexError: If the page contains no matching picture.
    """
    html = url_open(url + '/' + str(page)).decode('utf-8')
    marker = 'img src='
    start = html.find(marker)
    while start != -1:
        # Only look a limited distance ahead so the end marker belongs to
        # this tag and not to a later one.
        end = html.find('.jpg" alt="', start, start + 255)
        if end != -1:
            # +9 skips 'img src=' plus the opening quote; +4 keeps '.jpg'.
            # Returning here avoids scanning the rest of the page for
            # matches that were never used.
            return html[start + 9:end + 4]
        start = html.find(marker, start + 9)
    raise IndexError('no .jpg picture found at {0}/{1}'.format(url, page))
def save_img(folder, img_addrs):
    """Download every picture in *img_addrs* into the current directory.

    Each file is named after the last path component of its address, and
    the name is printed before the download starts.

    Args:
        folder: Not used by this function; files go to the current working
            directory, which the caller is expected to have set.
        img_addrs: Iterable of picture addresses.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        print(filename)
        # Fetch first, then open the file: a failed download no longer
        # leaves an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download(folder='meizi', *pages):
    """Download all picture groups listed on the given listing pages.

    Resulting layout:
    ``<folder>/page-<page>/<page>-<group>/<picture files>``.

    Args:
        folder: Root output directory; created if missing.
        *pages: Listing page numbers to crawl.

    The working directory is changed while downloading (save_img writes to
    the current directory) and is restored when the function returns or
    raises.
    """
    url = 'http://www.mzitu.com'
    start_dir = os.getcwd()
    os.makedirs(folder, exist_ok=True)
    try:
        os.chdir(folder)
        for page in pages:
            page_url = url + '/page/' + str(page) + '/'
            # One directory per listing page.
            pagefolder = "page-" + str(page)
            os.makedirs(pagefolder, exist_ok=True)
            os.chdir(pagefolder)
            # Addresses of all picture groups on this listing page.
            img_group_addrs = find_imgs(page_url)
            for group, addr in enumerate(img_group_addrs, start=1):
                # Group pages are numbered 1..max; range(max) asked for
                # page 0 and never reached the last page.
                last_page = int(get_maxpage(addr))
                img_addrs = [find_img(addr, x) for x in range(1, last_page + 1)]
                # One directory per picture group.
                groupfolder = str(page) + "-" + str(group)
                os.makedirs(groupfolder, exist_ok=True)
                os.chdir(groupfolder)
                save_img(groupfolder, img_addrs)
                os.chdir(os.pardir)
            os.chdir(os.pardir)
    finally:
        # Without this a second call, or a failure half way through, would
        # start from inside the output tree and nest the folders.
        os.chdir(start_dir)
if __name__ == '__main__':
    # First argument: output folder name; further arguments: listing page numbers to crawl.
    download('meizi', 1)

复制代码

结果

Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
self._sslobj.do_handshake()
ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 83, in <module>
download('meizi',1)#第一个参数为文件夹名，第二个参数为要爬取的页码
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 67, in download
img_group_addrs = find_imgs(page_url)
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 22, in find_imgs
html = url_open(url).decode('utf-8')
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 10, in url_open
response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>
>>>

复制代码

最佳答案

月排行榜 / 总排行榜

Twilight6

2020-7-11 20:54:46

ten$1 发表于 2020-7-11 22:23
就是原厂封装的IDE啊。。证书错误怎么解决？

安装了不会报没有模块 bs4 的错误才对呀，你是不是电脑不止一个版本的 Python 呢？安装的时候看见 Successfully 才算是成功的哈

证书错误加上这串代码试试看，取消证书验证：

# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate checking is switched off instead of the
# missing issuer certificate being fixed. Use for quick experiments only.
# The traceback paths suggest a python.org macOS build; presumably running
# its "Install Certificates.command" addresses the root cause - TODO confirm.
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
复制代码

跳转到最佳答案楼层

Twilight6 · 发表于 2020-7-11 20:54:46

这个最佳答案由 Twilight6 给出，感谢 Twilight6 的回答。

单击隐藏图章

ten$1 发表于 2020-7-11 22:23
就是原厂封装的IDE啊。。证书错误怎么解决？

安装了不会报没有模块 bs4 的错误才对呀，你是不是电脑不止一个版本的 Python 呢？安装的时候看见 Successfully 才算是成功的哈

证书错误加上这串代码试试看，取消证书验证：

# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate checking is switched off instead of the
# missing issuer certificate being fixed. Use for quick experiments only.
# The traceback paths suggest a python.org macOS build; presumably running
# its "Install Certificates.command" addresses the root cause - TODO confirm.
import ssl

ssl._create_default_https_context = ssl._create_unverified_context
复制代码

ten$1 · 发表于 2020-7-11 20:55:45

这是怎么回事啊？

Twilight6 · 发表于 2020-7-11 21:08:40

第一个报错是你没安装 BeautifulSoup 模块吧？

第二个报错是证书错误吧？SSL

ten$1 · 发表于 2020-7-11 22:20:54

Twilight6 发表于 2020-7-11 21:08
第一个报错是你没安装 BeautifulSoup 模块吧？

第二个报错是证书错误吧？SSL

BeautifulSoup装好了

Twilight6 · 发表于 2020-7-11 22:22:09

ten$1 发表于 2020-7-11 22:20
BeautifulSoup装好了

你用的是 PyCharm ?

ten$1 · 发表于 2020-7-11 22:23:23

Twilight6 发表于 2020-7-11 22:22
你用的是 PyCharm ?

就是原厂封装的IDE啊。。证书错误怎么解决？

ten$1 · 发表于 2020-7-11 22:28:26

Twilight6 发表于 2020-7-11 22:26
安装了不会报没有模块 bs4 的错误才对呀，你是不是电脑不止一个版本的 Python 呢？安装的时候看见 Suc ...

好了，谢谢谢谢

账号		自动登录	找回密码
密码			立即注册

[已解决]Mac下python爬虫多次失败

最佳答案

评分

浏览过的版块