|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 fendow 于 2019-7-25 16:58 编辑
- import urllib.request
- import os
def url_open(url, timeout=30):
    """Fetch *url* and return the raw response body.

    A desktop-browser ``User-Agent`` header is attached to the request
    before it is sent.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``urlopen`` as its blocking-operation
            timeout. Defaults to 30.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Raised by ``urlopen`` when the request
            cannot be completed.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    )
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()
-
def get_page(url):
    """Return the current comment-page number found on the page at *url*.

    The page is fetched with ``url_open``, decoded as UTF-8 and searched
    for the ``current-comment-page`` marker. The text between that marker
    (plus three further characters) and the next ``]`` is returned.

    Args:
        url: Address of the page to inspect.

    Returns:
        The extracted page number as a ``str``; callers convert it with
        ``int``.

    Raises:
        ValueError: If the marker or the closing ``]`` is not present in
            the page, instead of returning an arbitrary slice of the HTML.
    """
    marker = 'current-comment-page'
    html = url_open(url).decode('utf-8')

    marker_pos = html.find(marker)
    if marker_pos == -1:
        raise ValueError(
            'marker %r not found in page %s' % (marker, url))

    # Skip the marker itself plus the three characters that follow it
    # (presumably `">[` -- TODO confirm against the live markup).
    start = marker_pos + len(marker) + 3
    end = html.find(']', start)
    if end == -1:
        raise ValueError(
            "closing ']' not found after %r in page %s" % (marker, url))

    return html[start:end]
-
def find_imgs(url):
    """Collect the ``.jpg`` image addresses referenced on the page at *url*.

    The page is fetched with ``url_open``, decoded as UTF-8 and scanned
    for every ``img src=`` occurrence. An address is recorded only when a
    ``.jpg`` suffix appears within 255 characters of the tag start.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of address strings in page order; empty if none are found.
    """
    tag = 'img src='
    html = url_open(url).decode('utf-8')
    img_addrs = []

    start = html.find(tag)
    while start != -1:
        # Limit the search window so an unrelated '.jpg' far away from
        # this tag is not mistaken for the end of its address.
        end = html.find('.jpg', start, start + 255)
        if end != -1:
            # +9 skips 'img src=' and the opening quote; +4 keeps '.jpg'.
            img_addrs.append(html[start + 9:end + 4])
            resume = end
        else:
            resume = start + 9
        start = html.find(tag, resume)

    # The list must be handed back, otherwise callers receive None.
    return img_addrs
-
def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    Each file is named after the last ``/``-separated component of its
    address.

    Args:
        folder: Not used by this function; files are written relative to
            the current working directory. Kept so existing callers that
            pass it continue to work.
        img_addrs: Iterable of image address strings.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download first so a failed request does not leave an empty file.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='ooxx', pages=18):
    """Download images from consecutive comment pages into *folder*.

    The starting page number is read from the base URL with ``get_page``.
    Pages are then visited counting down from it, one step per iteration:
    each page is scanned with ``find_imgs`` and its images are stored
    with ``save_imgs``.

    Args:
        folder: Directory to create (if needed) and switch into before
            downloading. Note that this changes the process's working
            directory.
        pages: Maximum number of pages to visit.
    """
    # exist_ok lets a repeated run reuse the folder instead of failing
    # with FileExistsError.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)

    url = 'http://jandan.net/ooxx/'
    start_page = int(get_page(url))

    for offset in range(pages):
        # Compute each page from the fixed starting page. Subtracting the
        # loop index from a running counter would visit n, n-1, n-3, n-6,
        # ... and skip pages.
        page_num = start_page - offset
        if page_num < 1:
            # Assumes page numbering starts at 1 -- TODO confirm.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
复制代码
最后报错
- TimeoutError Traceback (most recent call last)
- C:\ProgramData\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
- 1316 h.request(req.get_method(), req.selector, req.data, headers,
- -> 1317 encode_chunked=req.has_header('Transfer-encoding'))
- 1318 except OSError as err: # timeout error
- C:\ProgramData\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
- 1228 """Send a complete request to the server."""
- -> 1229 self._send_request(method, url, body, headers, encode_chunked)
- 1230
- C:\ProgramData\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
- 1274 body = _encode(body, 'body')
- -> 1275 self.endheaders(body, encode_chunked=encode_chunked)
- 1276
- C:\ProgramData\Anaconda3\lib\http\client.py in endheaders(self, message_body, encode_chunked)
- 1223 raise CannotSendHeader()
- -> 1224 self._send_output(message_body, encode_chunked=encode_chunked)
- 1225
- C:\ProgramData\Anaconda3\lib\http\client.py in _send_output(self, message_body, encode_chunked)
- 1015 del self._buffer[:]
- -> 1016 self.send(msg)
- 1017
- C:\ProgramData\Anaconda3\lib\http\client.py in send(self, data)
- 955 if self.auto_open:
- --> 956 self.connect()
- 957 else:
- C:\ProgramData\Anaconda3\lib\http\client.py in connect(self)
- 927 self.sock = self._create_connection(
- --> 928 (self.host,self.port), self.timeout, self.source_address)
- 929 self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
- C:\ProgramData\Anaconda3\lib\socket.py in create_connection(address, timeout, source_address)
- 726 if err is not None:
- --> 727 raise err
- 728 else:
- C:\ProgramData\Anaconda3\lib\socket.py in create_connection(address, timeout, source_address)
- 715 sock.bind(source_address)
- --> 716 sock.connect(sa)
- 717 # Break explicitly a reference cycle
- TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。
- During handling of the above exception, another exception occurred:
- URLError Traceback (most recent call last)
- <ipython-input-20-bcc7d98d73ed> in <module>
- 45
- 46 if __name__== '__main__':
- ---> 47 download_mm()
- <ipython-input-20-bcc7d98d73ed> in download_mm(folder, pages)
- 36
- 37 url = 'http://jandan.net/ooxx/'
- ---> 38 page_num = int(get_page(url))
- 39
- 40 for i in range(pages):
- <ipython-input-20-bcc7d98d73ed> in get_page(url)
- 9 print(url)
- 10 def get_page(url):
- ---> 11 html = url_open(url).decode('utf-8')
- 12 a = html.find('current-comment-page') + 23
- 13 b = html.find(']',a)
- <ipython-input-20-bcc7d98d73ed> in url_open(url)
- 5 req = urllib.request.Request(url)
- 6 req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
- ----> 7 response = urllib.request.urlopen(url)
- 8 html = response.read()
- 9 print(url)
- C:\ProgramData\Anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
- 220 else:
- 221 opener = _opener
- --> 222 return opener.open(url, data, timeout)
- 223
- 224 def install_opener(opener):
- C:\ProgramData\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
- 523 req = meth(req)
- 524
- --> 525 response = self._open(req, data)
- 526
- 527 # post-process response
- C:\ProgramData\Anaconda3\lib\urllib\request.py in _open(self, req, data)
- 541 protocol = req.type
- 542 result = self._call_chain(self.handle_open, protocol, protocol +
- --> 543 '_open', req)
- 544 if result:
- 545 return result
- C:\ProgramData\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
- 501 for handler in handlers:
- 502 func = getattr(handler, meth_name)
- --> 503 result = func(*args)
- 504 if result is not None:
- 505 return result
- C:\ProgramData\Anaconda3\lib\urllib\request.py in http_open(self, req)
- 1343
- 1344 def http_open(self, req):
- -> 1345 return self.do_open(http.client.HTTPConnection, req)
- 1346
- 1347 http_request = AbstractHTTPHandler.do_request_
- C:\ProgramData\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
- 1317 encode_chunked=req.has_header('Transfer-encoding'))
- 1318 except OSError as err: # timeout error
- -> 1319 raise URLError(err)
- 1320 r = h.getresponse()
- 1321 except:
- URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。>
复制代码
请问大佬们,这是什么问题?是 notebook 不支持吗?
煎蛋网,在小甲鱼老师的带领下,加上了不弱的反爬功能,换一个网站吧,这个对身体也不好。
|
|