|
发表于 2020-7-23 09:15:29
|
显示全部楼层
本楼为最佳答案
本帖最后由 nahongyan1997 于 2020-7-23 09:20 编辑
@风尘岁月 我修改了你的代码,解决了你提出的问题:
- #导包
- import requests
- # 这里要这样导入最好
- from time import sleep
- import os
- import threading
- import parsel
# Make sure the download directory exists. exist_ok=True avoids the
# check-then-create race of the original exists()/mkdir() pair.
os.makedirs('image', exist_ok=True)

# Listing URL for page 0, kept for reference:
# base_url = 'https://anime-pictures.net/pictures/view_posts/0?lang=en'

# HTTP headers sent with every request; the User-Agent mimics a desktop Chrome.
headers = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}
def get(url, headers):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers (e.g. the User-Agent).

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    # The second positional parameter of requests.get() is ``params`` (the
    # query string), so the headers must be passed by keyword or they are
    # never sent as HTTP headers.
    # The timeout keeps a stalled connection from hanging the caller forever.
    response = requests.get(url, headers=headers, timeout=30)
    return response.text
def parsel_data(html_data):
    """Extract every image on a listing page and download its bytes.

    Args:
        html_data: HTML text of a listing page.

    Yields:
        Tuples ``(all_title, image_id, img_data)`` where ``all_title`` is the
        absolute image URL, ``image_id`` is the ``id`` attribute of the
        ``<img>`` element (may be ``None`` if absent) and ``img_data`` is the
        downloaded image content as bytes.

    Raises:
        requests.RequestException: If an image download fails or times out.
    """
    selector = parsel.Selector(html_data)
    for result in selector.xpath('//span[@class="img_block_big"]'):
        image_url = result.xpath('./a/picture/source/img/@src').extract_first()
        if image_url is None:
            # No src attribute found: skip this entry instead of crashing on
            # 'https:' + None and losing the rest of the page.
            continue
        image_id = result.xpath('./a/picture/source/img/@id').extract_first()
        # The src value is used without a scheme, so prepend it by hand.
        all_title = 'https:' + image_url
        # Uses the module-level ``headers``; the timeout avoids hanging forever.
        img_data = requests.get(url=all_title, headers=headers, timeout=30).content
        yield all_title, image_id, img_data
def save(all_title, image_id, img_data):
    """Write one downloaded image into the ``image`` directory.

    Failures are reported on stdout and never raised, so one bad image does
    not abort the rest of the download.

    Args:
        all_title: Source URL of the image (unused here, kept for callers).
        image_id: File name to store the image under, inside ``image``.
        img_data: Raw image content as bytes.

    Returns:
        None.
    """
    try:
        # os.path.join picks the right separator on every platform; the
        # original hard-coded 'image\\' only worked on Windows.
        target = os.path.join('image', image_id)
        with open(target, mode='wb') as f:
            f.write(img_data)
    except (OSError, TypeError):
        # OSError: the file cannot be created or written.
        # TypeError: image_id or img_data is None or of the wrong type.
        print('保存失败:', image_id, '(•́へ•́╬)')
    else:
        # Report success only after the data has actually been written.
        print('保存成功:', image_id)
- # 这段代码不能用:参数名 time 遮蔽了 time 模块,调用 time.sleep(time) 会报 AttributeError
- # def sleep(time):
- # '''休眠'''
- # time.sleep(time)
- # 使用多线程需要把你重复执行的部分单独写成一个函数
def start_save(base_url):
    """Download and store every image of one listing page (thread target).

    Args:
        base_url: URL of the listing page to process.

    The whole page is processed while holding the module-level ``lock``, so
    pages handled by different threads do not interleave.
    """
    # ``with`` releases the lock even if a request or the parser raises.
    # The original acquire()/release() pair left the lock held forever on
    # error, blocking every later thread.
    with lock:
        html_data = get(url=base_url, headers=headers)
        for all_title, img_id, img_data in parsel_data(html_data):
            # all_title: image URL, img_id: image id, img_data: image bytes
            save(all_title=all_title, image_id=img_id, img_data=img_data)
def main(page):
    """Download listing pages ``0..page`` (inclusive), one thread per page.

    Args:
        page: Index of the last listing page to download.

    A 10-second pause is inserted before every page after the first. The
    function returns only after all worker threads have finished.
    """
    workers = []
    for page_no in range(page + 1):
        print('###############正在下载第{}页数据###############'.format(page_no))
        if page_no > 0:
            print('休息一下')
            sleep(10)
        # The original URL had no '{}' placeholder, so .format() was a no-op
        # and page 0 was downloaded on every iteration.
        base_url = 'https://anime-pictures.net/pictures/view_posts/{}?lang=en'.format(page_no)
        print(base_url)
        # daemon=True replaces the deprecated setDaemon(True); the program
        # can still exit if the main thread is interrupted.
        worker = threading.Thread(target=start_save, args=(base_url,), daemon=True)
        worker.start()
        workers.append(worker)
    # Daemon threads are killed when the main thread exits, so wait for them;
    # otherwise the last pages' downloads are lost.
    for worker in workers:
        worker.join()
# Lock shared by all download threads. It is created at module level (not
# under the __main__ guard) so start_save() also works when this file is
# imported instead of run as a script.
lock = threading.RLock()

if __name__ == '__main__':
    # Download listing pages 0 through 6300.
    main(6300)
复制代码
|
|