ten$1 posted on 2020-7-11 20:54:45

Python web scraper fails repeatedly on Mac

This post was last edited by ten$1 on 2020-7-11 22:21

First attempt:
import urllib.request
from bs4 import BeautifulSoup
import os

def Download(url, picAlt, name):
    # save under a relative folder (a Windows drive path like 'D:\\...' will not work on macOS)
    path = os.path.join('pythonD爬虫妹子图', picAlt)
    if not os.path.exists(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url, os.path.join(path, '{0}.jpg'.format(name)))

header = {
    "User-Agent":'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
    }

def run(targetUrl, beginNUM, endNUM):
    req = urllib.request.Request(url=targetUrl, headers=header)
    response = urllib.request.urlopen(req)
    html = response.read().decode('gb2312', 'ignore')
    soup = BeautifulSoup(html, 'html.parser')
    Divs = soup.find_all('div', attrs={'id': 'big-pic'})
    nowpage = soup.find('span', attrs={'class': 'nowpage'}).get_text()
    totalpage = soup.find('span', attrs={'class': 'totalpage'}).get_text()
    if beginNUM == endNUM:
        return
    for div in Divs:
        beginNUM = beginNUM + 1

        if div.find("a") is None:
            print("No next image")
            return
        elif div.find("a")['href'] is None or div.find("a")['href'] == "":
            print("No next image (empty link)")
            return
        print("Overall progress:", beginNUM, "/", endNUM, ", downloading set page (", nowpage, "/", totalpage, ")")

        if int(nowpage) < int(totalpage):
            nextPageLink = "http://www.mmonly.cc/mmtp/qcmn/" + (div.find('a')['href'])
        elif int(nowpage) == int(totalpage):
            nextPageLink = (div.find('a')['href'])

        picLink = div.find('a').find('img')['src']
        picAlt = div.find('a').find('img')['alt']
        print('Image URL:', picLink)
        print('Set name: [', picAlt, ']')
        print('Downloading...........')
        Download(picLink, picAlt, nowpage)
        print("Download finished!")
        print('Next page link:', nextPageLink)
        run(nextPageLink, beginNUM, endNUM)  # recurse into the next page of the set
        return


if __name__ == '__main__':
    targetUrl = "http://www.mmonly.cc/mmtp/qcmn/237269.html"
    run(targetUrl, beginNUM=0, endNUM=70)
    print(" OVER")
   

Result:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
    self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
    self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
    self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
    self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
    self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLCertVerificationError: certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 59, in <module>
    run(targetUrl,beginNUM=0,endNUM=70)
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 21, in run
    response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
    response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
    response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
    result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>



Second attempt:
import urllib.request
import os
import re

# open a URL and return the raw response body
def url_open(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
               'Referer': 'http://www.mzitu.com'}
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

# get the highest page number of the current image set
def get_maxpage(url):
    html = url_open(url).decode('utf-8')
    pages = re.findall(r'<span>\d{1,2}', html)
    return pages[-1][6:]  # drop the leading '<span>' (the return expression was garbled in the post; this is a plausible reconstruction)

# given a listing-page URL, return the URLs of every image set on that page
def find_imgs(url):
    html = url_open(url).decode('utf-8')
    imgs_url = re.findall(r'http://www.mzitu.com/\d{6}', html)
    return imgs_url

# given an image-set URL and a page number, return the image URLs on that page
def find_img(url, page):
    html = url_open(url + '/' + str(page)).decode('utf-8')
    img_addrs = []

    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg" alt="', a, a + 255)
        if b != -1:
            # the slice was lost when the code was pasted; take everything between
            # the opening quote of src=" and the end of '.jpg'
            img_addrs.append(html[a + 9:b + 4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs


# save every image in img_addrs into folder
def save_img(folder, img_addrs):
    for each in img_addrs:
        filename = each.split('/')[-1]
        print(filename)
        with open(filename, 'wb') as f:
            img = url_open(each)
            f.write(img)

def download(folder='meizi', *pages):
    if not os.path.exists(folder):
        os.mkdir(folder)
    os.chdir(folder)

    url = 'http://www.mzitu.com'
    # page_num = int(get_page(url))      # get the current page count

    for page in pages:
        page_url = url + '/page/' + str(page) + '/'
        # create a folder for this page
        pagefolder = "page-" + str(page)
        if not os.path.exists(pagefolder):
            os.mkdir(pagefolder)
        os.chdir(pagefolder)
        # get the list of image-set URLs on this page
        img_group_addrs = find_imgs(page_url)
        # for each image set, collect the image URLs and save them
        group = 0
        for addr in img_group_addrs:
            group += 1
            # (reconstructed; the original line was cut off in the post)
            # walk every page of this set and collect all image URLs
            img_addrs = []
            for p in range(1, int(get_maxpage(addr)) + 1):
                img_addrs += find_img(addr, p)
            # create a folder for this set
            groupfolder = str(page) + "-" + str(group)
            if not os.path.exists(groupfolder):
                os.mkdir(groupfolder)
            os.chdir(groupfolder)
            save_img(groupfolder, img_addrs)
            os.chdir(os.pardir)
        os.chdir(os.pardir)

if __name__ == '__main__':
    download('meizi', 1)  # first argument is the folder name, second is the page number to crawl

Result:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
    self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
    self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
    self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
    self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
    self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLCertVerificationError: certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 83, in <module>
    download('meizi',1)  # first argument is the folder name, second is the page number to crawl
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 67, in download
    img_group_addrs = find_imgs(page_url)
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 22, in find_imgs
    html = url_open(url).decode('utf-8')
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 10, in url_open
    response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
    response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
    response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
    result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>
>>>

Twilight6 posted on 2020-7-11 20:54:46

ten$1 posted on 2020-7-11 22:23
It's just the stock IDE that ships with Python... how do I fix the certificate error?

If it was installed, you wouldn't be getting a "no module named bs4" error at all. Could it be that you have more than one version of Python on your machine? The install only counts as successful if you saw "Successfully" at the end.
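
One quick way to check which interpreter is running and whether bs4 is visible to it (a minimal sketch of my own, not part of the original reply):

import sys

# Show which Python binary is actually executing this script;
# pip may have installed beautifulsoup4 for a different interpreter.
print(sys.executable)

# If bs4 is installed for this interpreter, this prints its version;
# otherwise it raises ModuleNotFoundError.
import bs4
print(bs4.__version__)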

For the certificate error, try adding these two lines to skip certificate verification:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
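
To put that in context, here is a minimal sketch (my own, assuming the rest of the script stays unchanged) of where those two lines go: they must run before the first urllib.request.urlopen call, because they replace the default HTTPS context for the whole process.

import ssl
import urllib.request

# Workaround: skip certificate verification for every HTTPS request made via urllib
# in this process. (On a python.org install of Python 3.8 for macOS, the cleaner fix
# is to run the bundled "Install Certificates.command" once.)
ssl._create_default_https_context = ssl._create_unverified_context

req = urllib.request.Request('https://www.mzitu.com',
                             headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read()
print(len(html))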

ten$1 posted on 2020-7-11 20:55:45

What is going on here?

Twilight6 posted on 2020-7-11 21:08:40

The first error looks like you haven't installed the BeautifulSoup module?

The second one looks like a certificate error, an SSL issue.

ten$1 posted on 2020-7-11 22:20:54

Twilight6 posted on 2020-7-11 21:08
The first error looks like you haven't installed the BeautifulSoup module?

The second one looks like a certificate error, an SSL issue.

BeautifulSoup is installed now.

Twilight6 posted on 2020-7-11 22:22:09

ten$1 posted on 2020-7-11 22:20
BeautifulSoup is installed now.

Are you using PyCharm?

ten$1 posted on 2020-7-11 22:23:23

Twilight6 posted on 2020-7-11 22:22
Are you using PyCharm?

It's just the stock IDE that ships with Python... how do I fix the certificate error?

ten$1 posted on 2020-7-11 22:28:26

Twilight6 posted on 2020-7-11 22:26
If it was installed, you wouldn't be getting a "no module named bs4" error at all. Could it be that you have more than one version of Python on your machine? The install only counts as successful if you saw Suc ...

It works now, thank you so much!