鱼C论坛

 找回密码
 立即注册
查看: 1796|回复: 5

[已解决]关于爬虫的一些看不懂后台文件

[复制链接]
发表于 2020-7-21 18:21:17 | 显示全部楼层 |阅读模式

马上注册,结交更多好友,享用更多功能^_^

您需要 登录 才可以下载或查看,没有账号?立即注册

x
放暑假了 最近闲的蛋疼 写了个爬虫结果一堆bug 想改成多线程加速吧 bug更多了
单线程版:
  1. #导包
  2. import requests
  3. import parsel
  4. import os
  5. import time


  6. #准备工作
  7. if not os.path.exists('image'):
  8.     os.mkdir('image')

  9. headers = {
  10.     'User-Agent':
  11.         'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
  12. }

  13. #爬虫
  14. for page in range(0,6500):
  15.     #url,headers准备
  16.     print("=================/正在保存第{}页数据=================".format(page))
  17.     base_url = 'https://anime-pictures.net/pictures/view_posts/{}?lang=en'.format(page)

  18.     if page>0:
  19.         time.sleep(10)
  20.         print('太累了,休息下|・ω・` )')

  21.     #请求数据
  22.     response = requests.get(url = base_url,headers = headers)
  23.     html_data = response.text


  24.     #筛选数据
  25.     selector = parsel.Selector(html_data)
  26.     result_list = selector.xpath('//span[@class="img_block_big"]')

  27.     for result in result_list:
  28.         image_url = result.xpath('./a/picture/source/img/@src').extract_first()
  29.         image_id = result.xpath('./a/picture/source/img/@id').extract_first()

  30.         img_url = 'https:' + image_url  # 手动拼接完整url

  31.         all_title = image_id + '.' + img_url.split('.')[-1]

  32.         img_data = requests.get(url = img_url,headers = headers).content


  33.         #保存数据
  34.         try:
  35.             with open('image\\' + all_title, mode='wb') as f:
  36.                 print('保存成功:', image_id)
  37.                 f.write(img_data)

  38.         except:
  39.             pass
  40.             print('保存失败:', image_id,'(•́へ•́╬)')
  41.            



复制代码

异常内容:
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe E:/python_fruit/滑稽图/爬虫.py
=================/正在保存第0页数据=================
保存成功: common_preview_img_655082
保存成功: common_preview_img_655084
保存成功: common_preview_img_655085
保存成功: common_preview_img_654687
保存成功: common_preview_img_655061
保存成功: common_preview_img_655062
保存成功: common_preview_img_655063
保存成功: common_preview_img_655064
保存成功: common_preview_img_654930
保存成功: common_preview_img_654929
保存成功: common_preview_img_654927
保存成功: common_preview_img_654850
保存成功: common_preview_img_654926
Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\contrib\pyopenssl.py", line 488, in wrap_socket
    cnx.do_handshake()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\OpenSSL\SSL.py", line 1934, in do_handshake
    self._raise_ssl_error(self._ssl, result)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\OpenSSL\SSL.py", line 1663, in _raise_ssl_error
    raise SysCallError(errno, errorcode.get(errno))
OpenSSL.SSL.SysCallError: (10054, 'WSAECONNRESET')

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 677, in urlopen
    chunked=chunked,
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
    conn.connect()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connection.py", line 370, in connect
    ssl_context=context,
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\util\ssl_.py", line 377, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\contrib\pyopenssl.py", line 494, in wrap_socket
    raise ssl.SSLError("bad handshake: %r" % e)
ssl.SSLError: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\adapters.py", line 449, in send
    timeout=timeout
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 725, in urlopen
    method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\util\retry.py", line 439, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='cdn.anime-pictures.net', port=443): Max retries exceeded with url: /jvwall_images/956/9566c4f4db6d8c928273bc192fae2a40_cp.jpg (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')",),))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:/python_fruit/滑稽图/爬虫.py", line 44, in <module>
    img_data = requests.get(url = img_url,headers = headers).content
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 643, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\adapters.py", line 514, in send
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='cdn.anime-pictures.net', port=443): Max retries exceeded with url: /jvwall_images/956/9566c4f4db6d8c928273bc192fae2a40_cp.jpg (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')",),))

Process finished with exit code 1



多线程版(多线程还没写好,先测试函数版单线程)
  1. #导包
  2. import requests
  3. import time
  4. import os
  5. import threading
  6. import parsel


  7. if not os.path.exists('image'):
  8.     os.mkdir('image')


  9. base_url = 'https://anime-pictures.net/pictures/view_posts/0?lang=en'

  10. headers = {
  11.     'User-Agent':
  12.         'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
  13. }

  14. cookie = {
  15.     'GA1.2.1560346118.1594984735'
  16. }





  17. def get(url,headers):
  18.     '''请求数据'''
  19.     response = requests.get(url,headers)
  20.     html_data = response.text
  21.     return html_data


  22. def parsel_data(html_data):
  23.     '''筛选数据'''
  24.     selector = parsel.Selector(html_data)
  25.     result_list = selector.xpath('//span[@class="img_block_big"]')

  26.     for result in result_list:
  27.         image_url = result.xpath('./a/picture/source/img/@src').extract_first()
  28.         image_id = result.xpath('./a/picture/source/img/@id').extract_first()

  29.         img_url = 'https:' + image_url #手动拼url

  30.         all_title = img_url

  31.         img_data = requests.get(url = all_title,headers = headers).content


  32.         yield all_title,image_id,img_data


  33. def save(all_title,image_id,img_data):
  34.     '''保存数据'''

  35.     with open('image\\' + all_title, mode='wb') as f:
  36.         print('保存成功:', image_id)
  37.         #f.write(img_data)

  38.     #except:
  39.         #pass
  40.         #print('保存失败:', img_id,'(&#8226;&#769;へ&#8226;&#769;╬)')


  41. def sleep(time):
  42.     '''休眠'''
  43.     time.sleep(time)




  44. if __name__ == '__main__':
  45.     html_data = get(url=base_url, headers=headers)
  46.     for image_data in parsel_data(html_data):
  47.         all_title = image_data[0]  # url https://xxxxxxx...
  48.         img_id = image_data[1]  # ID号
  49.         img_data = image_data[2]  # 数据

  50.         #print(all_title,img_id,img_data)

  51.         save(all_title = all_title, image_id = img_id, img_data = img_data)

  52. 异常内容:
  53. (异常处理被我注释掉了 好弄出来报错内容 如果把注释去掉 则会一直显示保存失败)
  54. C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe E:/python_fruit/滑稽图/爬虫_多线程.py
  55. Traceback (most recent call last):
  56.   File "E:/python_fruit/滑稽图/爬虫_多线程.py", line 82, in <module>
  57.     save(all_title = all_title, image_id = img_id, img_data = img_data)
  58.   File "E:/python_fruit/滑稽图/爬虫_多线程.py", line 57, in save
  59.     with open('image\\' + all_title, mode='wb') as f:
  60. OSError: [Errno 22] Invalid argument: 'image\\https://cdn.anime-pictures.net/jvwall_images/000/00064ebef030944d326648e00ba8aa07_cp.png'

  61. Process finished with exit code 1


  62. 还有 帮我修bug时 请注意带好纸巾 避免失血过多{:10_256:}
  63. 最后 感谢各位鱼油的帮助{:10_334:}



































复制代码
















最佳答案
2020-8-7 07:06:09
本帖最后由 1q23w31 于 2020-8-7 07:11 编辑
风尘岁月 发表于 2020-7-21 20:32
ssl异常是没了 但是还有两个
:C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python ...

  1. #导包
  2. import requests
  3. import parsel
  4. import os
  5. import time


  6. #准备工作
  7. if not os.path.exists('image'):
  8.     os.mkdir('image')

  9. headers = {
  10.     'User-Agent':
  11.         'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
  12. }

  13. #爬虫
  14. for page in range(0,6500):
  15.     #url,headers准备
  16.     print("=================/正在保存第{}页数据=================".format(page))
  17.     base_url = 'https://anime-pictures.net/pictures/view_posts/{}?lang=en'.format(page)

  18.     if page>0:
  19.         time.sleep(10)
  20.         print('太累了,休息下|&#65381;ω&#65381;` )')

  21.     #请求数据
  22.     response = requests.get(url = base_url,headers = headers,verify = False)
  23.     html_data = response.text


  24.     #筛选数据
  25.     selector = parsel.Selector(html_data)
  26.     result_list = selector.xpath('//span[@class="img_block_big"]')

  27.     for result in result_list:
  28.         image_url = result.xpath('./a/picture/source/img/@src').extract_first()
  29.         image_id = result.xpath('./a/picture/source/img/@id').extract_first()

  30.         img_url = 'https:' + image_url  # 手动拼接完整url

  31.         all_title = image_id + '.' + img_url.split('.')[-1]

  32.         img_data = requests.get(url = img_url,headers = headers,verify = False).content


  33.         #保存数据
  34.         try:
  35.             with open('image\\' + all_title, mode='wb') as f:
  36.                 print('保存成功:', image_id)
  37.                 f.write(img_data)

  38.         except:
  39.             pass
  40.             print('保存失败:', image_id,'(&#8226;&#769;へ&#8226;&#769;╬)')
  41.            
复制代码

单线程版,测试60张无报错,程序运行时有警告,忽略即可
如图
2020-8-7 7-6-51.png
小甲鱼最新课程 -> https://ilovefishc.com
回复

使用道具 举报

发表于 2020-7-21 19:18:21 | 显示全部楼层

试试在代码最前面加上 :
# Globally disable HTTPS certificate verification by replacing the ssl
# module's default-context factory with the unverified one.
# NOTE(review): this changes the default for urllib-style clients;
# requests/urllib3 build their own SSL contexts, so it may not affect
# them -- confirm (the accepted answer later used requests' verify=False
# instead).
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
复制代码
小甲鱼最新课程 -> https://ilovefishc.com
回复 支持 反对

使用道具 举报

 楼主| 发表于 2020-7-21 20:32:18 | 显示全部楼层
Twilight6 发表于 2020-7-21 19:18
试试在代码最前面加上 :

ssl异常是没了 但是还有两个
:C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe E:/python_fruit/滑稽图/爬虫.py
=================/正在保存第0页数据=================
保存成功: common_preview_img_654943
保存成功: common_preview_img_655117
保存成功: common_preview_img_655118
保存成功: common_preview_img_654877
保存成功: common_preview_img_655119
保存成功: common_preview_img_654876
保存成功: common_preview_img_654875
Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\contrib\pyopenssl.py", line 488, in wrap_socket
    cnx.do_handshake()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\OpenSSL\SSL.py", line 1934, in do_handshake
    self._raise_ssl_error(self._ssl, result)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\OpenSSL\SSL.py", line 1663, in _raise_ssl_error
    raise SysCallError(errno, errorcode.get(errno))
OpenSSL.SSL.SysCallError: (10054, 'WSAECONNRESET')

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 677, in urlopen
    chunked=chunked,
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
    self._validate_conn(conn)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
    conn.connect()
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connection.py", line 370, in connect
    ssl_context=context,
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\util\ssl_.py", line 377, in ssl_wrap_socket
    return context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\contrib\pyopenssl.py", line 494, in wrap_socket
    raise ssl.SSLError("bad handshake: %r" % e)
ssl.SSLError: ("bad handshake: SysCallError(10054, 'WSAECONNRESET')",)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\adapters.py", line 449, in send
    timeout=timeout
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py", line 725, in urlopen
    method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\util\retry.py", line 439, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='cdn.anime-pictures.net', port=443): Max retries exceeded with url: /jvwall_images/25f/25f5a4e05c882ff0f6673432040a1328_cp.png (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')",),))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "E:/python_fruit/滑稽图/爬虫.py", line 47, in <module>
    img_data = requests.get(url = img_url,headers = headers).content
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 530, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 643, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\adapters.py", line 514, in send
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='cdn.anime-pictures.net', port=443): Max retries exceeded with url: /jvwall_images/25f/25f5a4e05c882ff0f6673432040a1328_cp.png (Caused by SSLError(SSLError("bad handshake: SysCallError(10054, 'WSAECONNRESET')",),))

Process finished with exit code 1
小甲鱼最新课程 -> https://ilovefishc.com
回复 支持 反对

使用道具 举报

发表于 2020-7-21 20:33:52 | 显示全部楼层
风尘岁月 发表于 2020-7-21 20:32
ssl异常是没了 但是还有两个
:C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python ...



其他的帮不了你了,没怎么遇到过~
小甲鱼最新课程 -> https://ilovefishc.com
回复 支持 反对

使用道具 举报

 楼主| 发表于 2020-7-21 20:36:28 | 显示全部楼层
Twilight6 发表于 2020-7-21 20:33
其他的帮不了你了,没怎么遇到过~

好吧
小甲鱼最新课程 -> https://ilovefishc.com
回复 支持 反对

使用道具 举报

发表于 2020-8-7 07:06:09 | 显示全部楼层    本楼为最佳答案   
本帖最后由 1q23w31 于 2020-8-7 07:11 编辑
风尘岁月 发表于 2020-7-21 20:32
ssl异常是没了 但是还有两个
:C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python ...

  1. #导包
  2. import requests
  3. import parsel
  4. import os
  5. import time


  6. #准备工作
  7. if not os.path.exists('image'):
  8.     os.mkdir('image')

  9. headers = {
  10.     'User-Agent':
  11.         'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
  12. }

  13. #爬虫
  14. for page in range(0,6500):
  15.     #url,headers准备
  16.     print("=================/正在保存第{}页数据=================".format(page))
  17.     base_url = 'https://anime-pictures.net/pictures/view_posts/{}?lang=en'.format(page)

  18.     if page>0:
  19.         time.sleep(10)
  20.         print('太累了,休息下|&#65381;ω&#65381;` )')

  21.     #请求数据
  22.     response = requests.get(url = base_url,headers = headers,verify = False)
  23.     html_data = response.text


  24.     #筛选数据
  25.     selector = parsel.Selector(html_data)
  26.     result_list = selector.xpath('//span[@class="img_block_big"]')

  27.     for result in result_list:
  28.         image_url = result.xpath('./a/picture/source/img/@src').extract_first()
  29.         image_id = result.xpath('./a/picture/source/img/@id').extract_first()

  30.         img_url = 'https:' + image_url  # 手动拼接完整url

  31.         all_title = image_id + '.' + img_url.split('.')[-1]

  32.         img_data = requests.get(url = img_url,headers = headers,verify = False).content


  33.         #保存数据
  34.         try:
  35.             with open('image\\' + all_title, mode='wb') as f:
  36.                 print('保存成功:', image_id)
  37.                 f.write(img_data)

  38.         except:
  39.             pass
  40.             print('保存失败:', image_id,'(&#8226;&#769;へ&#8226;&#769;╬)')
  41.            
复制代码

单线程版,测试60张无报错,程序运行时有警告,忽略即可
如图
2020-8-7 7-6-51.png
小甲鱼最新课程 -> https://ilovefishc.com
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

小黑屋|手机版|Archiver|鱼C工作室 ( 粤ICP备18085999号-1 | 粤公网安备 44051102000585号)

GMT+8, 2025-6-24 20:38

Powered by Discuz! X3.4

© 2001-2023 Discuz! Team.

快速回复 返回顶部 返回列表