|
10鱼币
本帖最后由 ten$1 于 2020-7-11 22:21 编辑
第一次实验:
- import urllib.request
- from bs4 import BeautifulSoup
- import os
-
def Download(url, picAlt, name, base_dir='D:\\pythonD爬虫妹子图\\'):
    """Download one picture to ``<base_dir>/<picAlt>/<name>.jpg``.

    Args:
        url: Address of the picture to fetch.
        picAlt: Album title; used as the name of the sub-directory.
        name: File name without extension; ``.jpg`` is appended.
        base_dir: Root directory that holds all album directories. The
            default is the original hard-coded Windows location; pass a
            different directory when running on macOS or Linux.

    Raises:
        urllib.error.URLError: If the picture cannot be fetched.
        OSError: If the directory cannot be created or the file written.
    """
    # os.path.join uses the separator of the running platform instead of a
    # hard-coded backslash.
    path = os.path.join(base_dir, picAlt)
    # exist_ok=True replaces the racy "check, then create" sequence.
    os.makedirs(path, exist_ok=True)
    urllib.request.urlretrieve(url, os.path.join(path, '{0}.jpg'.format(name)))
-
# HTTP request headers passed to urllib.request.Request when a page is
# fetched; the User-Agent identifies the client as a desktop Chrome browser.
header = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
    ),
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}
def run(targetUrl, beginNUM, endNUM):
    """Download pictures page by page, following each page's "next" link.

    Starting at ``targetUrl``, each page is fetched, the picture inside the
    ``<div id="big-pic">`` element is saved with ``Download`` (file name =
    the page's ``nowpage`` counter, directory = the picture's ``alt`` text)
    and the link wrapped around that picture is followed to the next page.

    Args:
        targetUrl: URL of the first picture page.
        beginNUM: Number of pictures already downloaded (normally 0).
        endNUM: Stop as soon as ``beginNUM`` reaches this value.

    Raises:
        ValueError: If a page lacks the ``nowpage``/``totalpage`` counters
            or the picture element.
        urllib.error.URLError: If a page cannot be fetched.
    """
    # Function-scope import so the block does not rely on a file-level one.
    from urllib.parse import urljoin

    # A loop replaces the original "one recursive call per picture", and the
    # stop condition is tested before a page is fetched, not after.
    while beginNUM != endNUM:
        req = urllib.request.Request(url=targetUrl, headers=header)
        # The context manager closes the connection deterministically.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('gb2312', 'ignore')
            # Relative links must be resolved against the final URL, which
            # differs from targetUrl when the server redirected the request.
            page_url = response.geturl()
        soup = BeautifulSoup(html, 'html.parser')

        div = soup.find('div', attrs={'id': 'big-pic'})
        if div is None:
            # No picture container on this page: nothing left to do.
            return

        nowpage_tag = soup.find('span', attrs={'class': 'nowpage'})
        totalpage_tag = soup.find('span', attrs={'class': 'totalpage'})
        if nowpage_tag is None or totalpage_tag is None:
            raise ValueError('page counters not found on ' + page_url)
        nowpage = nowpage_tag.get_text()
        totalpage = totalpage_tag.get_text()

        beginNUM = beginNUM + 1

        link = div.find('a')
        if link is None:
            print("没有下一张了")
            return
        # .get() returns None for a missing attribute, whereas ['href']
        # raises KeyError and never reached the check below.
        href = link.get('href')
        if not href:
            print("没有下一张了None")
            return
        print("下载信息:总进度:",beginNUM,"/",endNUM," ,正在下载套图:(",nowpage,"/",totalpage,")")

        img = link.find('img')
        if img is None:
            raise ValueError('picture element not found on ' + page_url)

        # urljoin handles relative links (inside an album) and absolute
        # links (presumably the jump to the next album on the last page) in
        # one step, so no hard-coded base URL or page-count branch is needed.
        nextPageLink = urljoin(page_url, href)
        picLink = urljoin(page_url, img['src'])
        picAlt = img['alt']
        print('下载的图片链接:',picLink)
        print('套图名:[ ', picAlt , ' ] ')
        print('开始下载...........')
        Download(picLink, picAlt, nowpage)
        print("下载成功!")
        print('下一页链接:',nextPageLink)

        targetUrl = nextPageLink
-
if __name__ == '__main__':
    # Script entry point: start at one album page and stop after 70 pictures.
    targetUrl = "http://www.mmonly.cc/mmtp/qcmn/237269.html"
    run(targetUrl, 0, 70)
    print(" OVER")
-
复制代码
结果
- Traceback (most recent call last):
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
- h.request(req.get_method(), req.selector, req.data, headers,
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
- self._send_request(method, url, body, headers, encode_chunked)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
- self.endheaders(body, encode_chunked=encode_chunked)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
- self._send_output(message_body, encode_chunked=encode_chunked)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
- self.send(msg)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
- self.connect()
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
- self.sock = self._context.wrap_socket(self.sock,
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
- return self.sslsocket_class._create(
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
- self.do_handshake()
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
- self._sslobj.do_handshake()
- ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)
- During handling of the above exception, another exception occurred:
- Traceback (most recent call last):
- File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 59, in <module>
- run(targetUrl,beginNUM=0,endNUM=70)
- File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 21, in run
- response = urllib.request.urlopen(req)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
- return opener.open(url, data, timeout)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
- response = meth(req, response)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
- response = self.parent.error(
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
- result = self._call_chain(*args)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
- result = func(*args)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
- return self.parent.open(new, timeout=req.timeout)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
- response = self._open(req, data)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
- result = self._call_chain(self.handle_open, protocol, protocol +
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
- result = func(*args)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
- return self.do_open(http.client.HTTPSConnection, req,
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
- raise URLError(err)
- urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>
复制代码
第二次:
- import urllib.request
- import os
- import re
def url_open(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        # The host name used to be misspelled with four w's. Presumably the
        # site checks the Referer to block hot-linking - TODO confirm.
        'Referer': 'http://www.mzitu.com',
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_maxpage(url):
    """Return the highest page number of the picture group at ``url``.

    The page is downloaded and searched for ``<span>`` tags that are
    directly followed by one or two digits; the digits of the last such tag
    are returned.

    Args:
        url: Address of the picture group.

    Returns:
        The page number as a string of one or two digits.

    Raises:
        IndexError: If the page contains no matching ``<span>`` tag.
    """
    page_source = url_open(url).decode('utf-8')
    # The capture group yields just the digits, so no slicing is needed.
    page_numbers = re.findall(r'<span>(\d{1,2})', page_source)
    return page_numbers[-1]
def find_imgs(url):
    """Return the picture-group links found on the listing page ``url``.

    Args:
        url: Address of a listing page.

    Returns:
        A list of group addresses in order of first appearance, without
        duplicates.
    """
    html = url_open(url).decode('utf-8')
    # Dots are escaped so they match only a literal '.', and "https" is
    # accepted as well because the site redirects to HTTPS.
    matches = re.findall(r'https?://www\.mzitu\.com/\d{6}', html)
    # The same group link can occur more than once on a page; dict.fromkeys
    # drops the repeats while keeping the original order.
    imgs_url = list(dict.fromkeys(matches))
    return imgs_url
def find_img(url, page):
    """Return the address of the first ``.jpg`` picture on one group page.

    The page ``url + '/' + str(page)`` is downloaded and scanned for the
    first ``img src=`` attribute whose value ends in ``.jpg`` and is
    directly followed by an ``alt`` attribute.

    Args:
        url: Address of the picture group.
        page: Page number inside the group.

    Returns:
        The picture address as a string.

    Raises:
        IndexError: If the page contains no such picture.
    """
    page_url = url + '/' + str(page)
    html = url_open(page_url).decode('utf-8')
    start = html.find('img src=')
    while start != -1:
        # The end marker must lie within 255 characters of the tag start.
        end = html.find('.jpg" alt="', start, start + 255)
        if end != -1:
            # Skip the 9 characters of 'img src="' and keep the 4 of '.jpg'.
            # Only the first hit is needed, so the scan stops here.
            return html[start + 9:end + 4]
        start = html.find('img src=', start + 9)
    # Same exception type as before, but with a message naming the page.
    raise IndexError('no .jpg picture found on ' + page_url)
def save_img(folder, img_addrs):
    """Download every picture in ``img_addrs`` into the current directory.

    Each file is named after the last path component of its address.

    Args:
        folder: Not used by this function; files are written to the current
            working directory. Kept so existing callers keep working.
        img_addrs: Iterable of picture addresses.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        print(filename)
        # Download first, so a failed request does not leave an empty file.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download(folder='meizi', *pages):
    """Download all picture groups listed on the given listing pages.

    Creates the layout ``<folder>/page-<page>/<page>-<group>/`` and stores
    each group's pictures in its own directory. On return the current
    working directory is ``folder``.

    Args:
        folder: Root output directory; created when missing.
        *pages: Listing page numbers to crawl.
    """
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    # Absolute paths from here on, so a failure inside the loops cannot
    # leave the process stranded in a nested directory.
    root = os.getcwd()
    url = 'http://www.mzitu.com'
    try:
        for page in pages:
            page_url = url + '/page/' + str(page) + '/'
            # One directory per listing page.
            page_dir = os.path.join(root, "page-" + str(page))
            os.makedirs(page_dir, exist_ok=True)
            # Each listed group gets a 1-based running number.
            for group, addr in enumerate(find_imgs(page_url), start=1):
                last_page = int(get_maxpage(addr))
                # Group pages are presumably numbered 1..last_page; the
                # previous range(last_page) requested page 0 and skipped the
                # last page - verify against the site.
                img_addrs = [find_img(addr, x) for x in range(1, last_page + 1)]
                # One directory per group.
                groupfolder = str(page) + "-" + str(group)
                group_dir = os.path.join(page_dir, groupfolder)
                os.makedirs(group_dir, exist_ok=True)
                # save_img writes into the current working directory.
                os.chdir(group_dir)
                save_img(groupfolder, img_addrs)
    finally:
        os.chdir(root)
if __name__ == '__main__':
    # The first argument is the output folder name; the remaining arguments
    # are the listing page numbers to crawl.
    download('meizi',1)
复制代码
结果
- Traceback (most recent call last):
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1350, in do_open
- h.request(req.get_method(), req.selector, req.data, headers,
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1240, in request
- self._send_request(method, url, body, headers, encode_chunked)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1286, in _send_request
- self.endheaders(body, encode_chunked=encode_chunked)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1235, in endheaders
- self._send_output(message_body, encode_chunked=encode_chunked)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1006, in _send_output
- self.send(msg)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 946, in send
- self.connect()
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/http/client.py", line 1409, in connect
- self.sock = self._context.wrap_socket(self.sock,
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 500, in wrap_socket
- return self.sslsocket_class._create(
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1040, in _create
- self.do_handshake()
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/ssl.py", line 1309, in do_handshake
- self._sslobj.do_handshake()
- ssl.SSLCertVerificationError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)
- During handling of the above exception, another exception occurred:
- Traceback (most recent call last):
- File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 83, in <module>
- download('meizi',1)#第一个参数为文件夹名,第二个参数为要爬取的页码
- File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 67, in download
- img_group_addrs = find_imgs(page_url)
- File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 22, in find_imgs
- html = url_open(url).decode('utf-8')
- File "/Users/xiaojiayudeapple/Desktop/My Code/python/A.py", line 10, in url_open
- response = urllib.request.urlopen(req)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 222, in urlopen
- return opener.open(url, data, timeout)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 531, in open
- response = meth(req, response)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 640, in http_response
- response = self.parent.error(
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 563, in error
- result = self._call_chain(*args)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
- result = func(*args)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 755, in http_error_302
- return self.parent.open(new, timeout=req.timeout)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 525, in open
- response = self._open(req, data)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 542, in _open
- result = self._call_chain(self.handle_open, protocol, protocol +
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 502, in _call_chain
- result = func(*args)
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1393, in https_open
- return self.do_open(http.client.HTTPSConnection, req,
- File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/urllib/request.py", line 1353, in do_open
- raise URLError(err)
- urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1108)>
- >>>
复制代码
如果已经安装成功,就不应该再报“没有模块 bs4”的错误才对。你的电脑上是不是装了不止一个版本的 Python 呢?安装的时候看见 Successfully 才算是安装成功哈。
证书错误的话,加上下面这串代码试试看,它会取消证书验证(注意:这样会关闭 HTTPS 的证书校验,只建议临时使用):
# WARNING: this replaces the default HTTPS context factory for the whole
# process with one that does not verify certificates, so every later
# urllib HTTPS request in the process skips certificate validation.
# Use it only as a temporary workaround.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
复制代码
|
最佳答案
查看完整内容
如果已经安装成功,就不应该再报“没有模块 bs4”的错误才对。你的电脑上是不是装了不止一个版本的 Python 呢?安装的时候看见 Successfully 才算是安装成功哈。
证书错误的话,加上这串代码试试看,取消证书验证:
|