|
楼主 |
发表于 2020-5-31 15:08:21
|
显示全部楼层
目前写出来的最新的代码 import requests
import parsel
import re
import concurrent.futures
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4043.400'}
def send_request(url):
'''请求数据'''
response = requests.get(url = url, headers = headers, verify = False)
return(response)
def parse_data(data):
'''数据解析'''
selector = parsel.Selector(data)
result_list = selector.xpath('//a[@class="col-xs-6 col-sm-3"]')
for result in result_list:
title = result.xpath('./img/@data-original').extract_first()
src_url = result.xpath('./img/@alt').extract_first()
#准备文件后缀名
all_title = title + '.' + src_url.split('.')[-1]
yield all_title, src_url
def sava_data(file_name,data):
'''数据保存'''
with open('img\\' + file_name, mode = 'wb') as f:
f.write(data)
print('保存完成:', file_name)
def main(page):
'''实现翻页的效果'''
for page in range(1,page + 1):
print('============正在爬取第{}页数据============'.format(page))
thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=3)
res = send_request('https://www.doutula.com/photo/list/?page={}'.format(str(page)))
src_url = parse_data(res.text)
for file, url in src_url:
image_response = send_request(url)
thread_pool.submit(save_data, file, image_response.content)
thread_pool.shutdown()
if __name__ == '__main__':
main(10)
各位鱼油可以自己尝试哈
有个异常:祖安异常
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\python.exe E:/python_fruit/表情包/表情包_ronot-多线程.py
============正在爬取第1页数据============
C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\urllib3\connectionpool.py:986: InsecureRequestWarning: Unverified HTTPS request is being made to host 'www.doutula.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/e ... e.html#ssl-warnings
InsecureRequestWarning,
Traceback (most recent call last):
File "E:/python_fruit/表情包/表情包_ronot-多线程.py", line 58, in <module>
main(10)
File "E:/python_fruit/表情包/表情包_ronot-多线程.py", line 48, in main
image_response = send_request(url)
File "E:/python_fruit/表情包/表情包_ronot-多线程.py", line 16, in send_request
response = requests.get(url = url, headers = headers, verify = False)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 516, in request
prep = self.prepare_request(req)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\sessions.py", line 459, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\models.py", line 314, in prepare
self.prepare_url(url, params)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python36\lib\site-packages\requests\models.py", line 388, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL '一群渣渣': No schema supplied. Perhaps you meant http://一群渣渣?
Process finished with exit code 1
希望各位鱼油能够告诉我为什么会出现这个异常 |
|