|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
如下所示,close(), time.sleep(),socket.setdefaulttimeout()都用过了,搞不明白
- import os
- import re
- import urllib
- import json
- import socket
- import urllib.request
- import urllib.parse
- import urllib.error
- import time
- import _thread
- timeout = 20
- socket.setdefaulttimeout(timeout)
- def SaveImage(url='https://original.img.cheerfun.dev/img-original/img/2019/03/05/03/09/55/73518990_p0.png',path='test.png'):
- rq = urllib.request.Request(url)
- rq.add_header('accept','image/webp,image/apng,image/*,*/*;q=0.8')
- rq.add_header('accept-encoding','gzip, deflate, br')
- rq.add_header('accept-languag','zh-CN,zh;q=0.9,en;q=0.8,pl;q=0.7')
- rq.add_header('sec-fetch-dest','image')
- rq.add_header('sec-fetch-mode','no-cors')
- rq.add_header('sec-fetch-site','cross-site')
- rq.add_header('referer','https://pixivic.com/popSearch')
- rq.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36')
- rp = urllib.request.urlopen(rq)
- #print(rp.getcode())
- s=rp.read()
- rq.close()
- #print(len(s))
- f=open(path,'wb')
- f.write(s)
- f.close()
- time.sleep(10)
- class Crawler:
- __time_sleep = 10
- __amount = 0
- __start_amount = 0
- __counter = 0
- headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}
- def __init__(self, t=0.1, word='白上吹雪 '):
- self.time_sleep = t
- if not os.path.exists("./" + word):
- os.mkdir("./" + word)
- self.__counter = len(os.listdir('./' + word)) + 1
- def get_suffix(self, name):
- m = re.search(r'\.[^\.]*$', name)
- if m.group(0) and len(m.group(0)) <= 5:
- return m.group(0)
- else:
- return '.jpeg'
- def save_image(self, rsp_data, word):
-
- for image_info in rsp_data['imageUrls']:
- try:
- pps=image_info['original'].replace('https://i.pximg.net/','https://original.img.cheerfun.dev/')
- print(pps)
- suffix = self.get_suffix(pps)
- # 保存图片
- _thread.start_new_thread(SaveImage,(pps,'./' + word + '/' + str(self.__counter) + str(suffix)))
- except urllib.error.HTTPError as urllib_err:
- print(urllib_err)
- continue
- except Exception as err:
- time.sleep(10)
- print(err)
- print("产生未知错误,放弃保存")
- continue
- else:
- sum=len(os.listdir('./' + word))
- print(f'第{str(self.__counter)}张涩图正在保存,已有{str(sum)}张涩图')
- self.__counter += 1
- time.sleep(self.time_sleep)
- return
- def get_images(self, word='白上吹雪'):
- search = urllib.parse.quote(word)
- pn = self.__start_amount
- while pn <= self.__amount:
- url='https://api.pixivic.com/illustrations?keyword='+search+'&page='+str(pn)
- print(url)
- try:
- req = urllib.request.Request(url=url, headers=self.headers)
- page = urllib.request.urlopen(req)
- rsp = page.read().decode('utf-8')
- except UnicodeDecodeError as e:
- print(e)
- print('-----UnicodeDecodeErrorurl:', url)
- except urllib.error.URLError as e:
- print(e)
- print("-----urlErrorurl:", url)
- except socket.timeout as e:
- print(e)
- print("-----socket timout:", url)
- else:
- rsp_data = json.loads(rsp)
- tmp=rsp_data['data']
- for ele in tmp:
- self.save_image(ele, word)
- print("下载下一页")
- pn += 1
-
- page.close()
- print("下载任务结束")
- return
- def start(self, word, spider_page_num=1, start_page=1):
- self.__start_amount = start_page
- self.__amount = spider_page_num
- self.get_images(word)
- if __name__ == '__main__':
- word=input('你想要谁的涩图?请输入:')
- crawler = Crawler(1.3,word)
- crawler.start(word, 10, 1)
复制代码
|
|