ten$1 posted on 2020-7-11 20:54:45

Python web scraper fails repeatedly on Mac

This post was last edited by ten$1 on 2020-7-11 22:21

First attempt:
import urllib.request
from bs4 import BeautifulSoup
import os

def Download(url, picAlt, name):
    # save under a relative folder (a Windows drive path like 'D:\\...' will not work on macOS)
    path = os.path.join('pythonD爬虫妹子图', picAlt)
    if not os.path.exists(path):
        os.makedirs(path)
    urllib.request.urlretrieve(url, os.path.join(path, '{0}.jpg'.format(name)))

header = {
    "User-Agent":'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
    }

def run(targetUrl, beginNUM, endNUM):
    req = urllib.request.Request(url=targetUrl, headers=header)
    response = urllib.request.urlopen(req)
    html = response.read().decode('gb2312', 'ignore')
    soup = BeautifulSoup(html, 'html.parser')
    Divs = soup.find_all('div', attrs={'id': 'big-pic'})
    nowpage = soup.find('span', attrs={'class': 'nowpage'}).get_text()
    totalpage = soup.find('span', attrs={'class': 'totalpage'}).get_text()
    if beginNUM == endNUM:
        return
    for div in Divs:
        beginNUM = beginNUM + 1

        if div.find("a") is None:
            print("No next image")
            return
        elif div.find("a")['href'] is None or div.find("a")['href'] == "":
            print("No next image (empty link)")
            return
        print("Overall progress:", beginNUM, "/", endNUM, ", downloading set page (", nowpage, "/", totalpage, ")")

        if int(nowpage) < int(totalpage):
            nextPageLink = "http://www.mmonly.cc/mmtp/qcmn/" + (div.find('a')['href'])
        elif int(nowpage) == int(totalpage):
            nextPageLink = (div.find('a')['href'])

        picLink = div.find('a').find('img')['src']
        picAlt = div.find('a').find('img')['alt']
        print('Image URL:', picLink)
        print('Set name: [', picAlt, ']')
        print('Downloading...........')
        Download(picLink, picAlt, nowpage)
        print("Download finished!")
        print('Next page link:', nextPageLink)
        run(nextPageLink, beginNUM, endNUM)  # recurse into the next page of the set
        return


if __name__ == '__main__':
    targetUrl = "http://www.mmonly.cc/mmtp/qcmn/237269.html"
    run(targetUrl, beginNUM=0, endNUM=70)
    print(" OVER")
   

Result:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
    self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
    self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
    self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
    self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
    self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLCertVerificationError: certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 59, in <module>
    run(targetUrl,beginNUM=0,endNUM=70)
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 21, in run
    response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
    response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
    response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
    result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>



Second attempt:
import urllib.request
import os
import re

# open a URL and return the raw response body
def url_open(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
               'Referer': 'http://www.mzitu.com'}
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

# get the highest page number of the current image set
def get_maxpage(url):
    html = url_open(url).decode('utf-8')
    pages = re.findall(r'<span>\d{1,2}', html)
    return pages[-1][6:]  # drop the leading '<span>' (the return expression was garbled in the post; this is a plausible reconstruction)

# given a listing-page URL, return the URLs of every image set on that page
def find_imgs(url):
    html = url_open(url).decode('utf-8')
    imgs_url = re.findall(r'http://www.mzitu.com/\d{6}', html)
    return imgs_url

# given an image-set URL and a page number, return the image URLs on that page
def find_img(url, page):
    html = url_open(url + '/' + str(page)).decode('utf-8')
    img_addrs = []

    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg" alt="', a, a + 255)
        if b != -1:
            # the slice was lost when the code was pasted; take everything between
            # the opening quote of src=" and the end of '.jpg'
            img_addrs.append(html[a + 9:b + 4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs


# save every image in img_addrs into folder
def save_img(folder, img_addrs):
    for each in img_addrs:
        filename = each.split('/')[-1]
        print(filename)
        with open(filename, 'wb') as f:
            img = url_open(each)
            f.write(img)

def download(folder='meizi', *pages):
    if not os.path.exists(folder):
        os.mkdir(folder)
    os.chdir(folder)

    url = 'http://www.mzitu.com'
    # page_num = int(get_page(url))      # get the current page count

    for page in pages:
        page_url = url + '/page/' + str(page) + '/'
        # create a folder for this page
        pagefolder = "page-" + str(page)
        if not os.path.exists(pagefolder):
            os.mkdir(pagefolder)
        os.chdir(pagefolder)
        # get the list of image-set URLs on this page
        img_group_addrs = find_imgs(page_url)
        # for each image set, collect the image URLs and save them
        group = 0
        for addr in img_group_addrs:
            group += 1
            # (reconstructed; the original line was cut off in the post)
            # walk every page of this set and collect all image URLs
            img_addrs = []
            for p in range(1, int(get_maxpage(addr)) + 1):
                img_addrs += find_img(addr, p)
            # create a folder for this set
            groupfolder = str(page) + "-" + str(group)
            if not os.path.exists(groupfolder):
                os.mkdir(groupfolder)
            os.chdir(groupfolder)
            save_img(groupfolder, img_addrs)
            os.chdir(os.pardir)
        os.chdir(os.pardir)

if __name__ == '__main__':
    download('meizi', 1)  # first argument is the folder name, second is the page number to crawl

Result:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
    h.request(req.get_method(), req.selector, req.data, headers,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
    self._send_request(method, url, body, headers, encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
    self.send(msg)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
    self.connect()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
    self.sock = self._context.wrap_socket(self.sock,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
    return self.sslsocket_class._create(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
    self.do_handshake()
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
    self._sslobj.do_handshake()
ssl.SSLCertVerificationError: certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 83, in <module>
    download('meizi',1)  # first argument is the folder name, second is the page number to crawl
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 67, in download
    img_group_addrs = find_imgs(page_url)
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 22, in find_imgs
    html = url_open(url).decode('utf-8')
File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 10, in url_open
    response = urllib.request.urlopen(req)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
    response = meth(req, response)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
    response = self.parent.error(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
    result = self._call_chain(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_open(http.client.HTTPSConnection, req,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
    raise URLError(err)
urllib.error.URLError: <urlopen error certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>
>>>

Twilight6 posted on 2020-7-11 20:54:46

ten$1 posted on 2020-7-11 22:23
It's just the stock IDE that ships with Python... how do I fix the certificate error?

If it was installed, you wouldn't be getting a "no module named bs4" error at all. Could it be that you have more than one version of Python on your machine? The install only counts as successful if you saw "Successfully" at the end.
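
One quick way to check which interpreter is running and whether bs4 is visible to it (a minimal sketch of my own, not part of the original reply):

import sys

# Show which Python binary is actually executing this script;
# pip may have installed beautifulsoup4 for a different interpreter.
print(sys.executable)

# If bs4 is installed for this interpreter, this prints its version;
# otherwise it raises ModuleNotFoundError.
import bs4
print(bs4.__version__)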

For the certificate error, try adding these two lines to skip certificate verification:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
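
To put that in context, here is a minimal sketch (my own, assuming the rest of the script stays unchanged) of where those two lines go: they must run before the first urllib.request.urlopen call, because they replace the default HTTPS context for the whole process.

import ssl
import urllib.request

# Workaround: skip certificate verification for every HTTPS request made via urllib
# in this process. (On a python.org install of Python 3.8 for macOS, the cleaner fix
# is to run the bundled "Install Certificates.command" once.)
ssl._create_default_https_context = ssl._create_unverified_context

req = urllib.request.Request('https://www.mzitu.com',
                             headers={'User-Agent': 'Mozilla/5.0'})
html = urllib.request.urlopen(req).read()
print(len(html))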

ten$1 posted on 2020-7-11 20:55:45

What is going on here?

Twilight6 posted on 2020-7-11 21:08:40

The first error looks like you haven't installed the BeautifulSoup module?

The second one looks like a certificate error, an SSL issue.

ten$1 posted on 2020-7-11 22:20:54

Twilight6 posted on 2020-7-11 21:08
The first error looks like you haven't installed the BeautifulSoup module?

The second one looks like a certificate error, an SSL issue.

BeautifulSoup is installed now.

Twilight6 posted on 2020-7-11 22:22:09

ten$1 posted on 2020-7-11 22:20
BeautifulSoup is installed now.

Are you using PyCharm?

ten$1 posted on 2020-7-11 22:23:23

Twilight6 posted on 2020-7-11 22:22
Are you using PyCharm?

It's just the stock IDE that ships with Python... how do I fix the certificate error?

ten$1 posted on 2020-7-11 22:28:26

Twilight6 posted on 2020-7-11 22:26
If it was installed, you wouldn't be getting a "no module named bs4" error at all. Could it be that you have more than one version of Python on your machine? The install only counts as successful if you saw Suc ...

It works now, thank you so much!