|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
程序执行完后,打开文件夹会发现图片全部损坏,有没有哪位知道原因的,最好有解决方法。
- import requests
- import os, sys, stat
- from lxml import etree
- import time
- class HuangMan():
- def __init__(self):
- #设置请求头
- self.headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
- self.url_list = []
- self.Hman_url_list = []
- self.Hman_name_list = []
-
- def get_url_list(self):
- #这里是设置爬取到第3页,想要更多的话请自行更改
- url = "http://www.93qoqo.com/artlist/27-%d.html"
- for i in range(2,4):
- #获取第2到3页的url
- self.url_list.append(url % i)
-
- #由于第一页网址不同,所以单独写一个获取函数
- def start_1(self):
- url_1 = "http://www.93qoqo.com/artlist/27.html"
- response = requests.get(url_1, headers = self.headers).content.decode('utf-8')
- html = etree.HTML(response)
- result = html.xpath('//ul/li[@class="name"]/a')
- for i in range(len(result)):
- #获取result的属性
- shuxing = result[i].attrib
- #获取url
- self.Hman_url_list.append("http://www.93qoqo.com" + shuxing.get('href'))
- #获取名字
- self.Hman_name_list.append(shuxing.get('title'))
- print("正在爬取第1页的网站信息")
- time.sleep(2)
- #从第二页开始爬取信息
- def start(self):
- self.start_1()
- self.get_url_list()
- for url in self.url_list:
- response = requests.get(url, headers = self.headers).content.decode('utf-8')
- html = etree.HTML(response)
- result = html.xpath('//ul/li[@class="name"]/a')
- for i in range(len(result)):
- #获取result的属性
- shuxing = result[i].attrib
- #获取url
- self.Hman_url_list.append("http://www.93qoqo.com" + shuxing.get('href'))
- #获取名字
- self.Hman_name_list.append(result[i].text)
-
- print("正在爬取第%d页的网站信息" % (self.url_list.index(url) + 2))
- time.sleep(2)
- self.wenjian()
- self.main()
- def wenjian(self):
- #创建文件夹
- if os.path.exists("photo") == False:
- os.makedirs("photo")
- os.chmod("photo", stat.S_IWRITE)
- os.chdir("photo")
-
- else:
- print("文件已经存在")
-
- def main(self):
- for url in self.Hman_url_list:
- response = requests.get(url, headers = self.headers).content.decode('utf-8')
- html = etree.HTML(response)
- result = html.xpath('//center/div[@class="t_msgfont"]/img')
- print(self.Hman_name_list[self.Hman_url_list.index(url)])
- for i in range(len(self.Hman_name_list)):
- #创建次级文件夹用于存放图片
- f = os.makedirs(self.Hman_name_list[i])
- os.chmod(self.Hman_name_list[i], stat.S_IWRITE)
- print("正在创建文件夹")
- for e in range(len(result)):
- print("下载图片中。。。请稍等片刻")
- name ="%d.jpg" % e
- #请求图片
- respomse=requests.get(url,headers=self.headers)
- #下载
- with open(os.path.join(self.Hman_name_list[i],name),mode='wb') as f:
- f.write(respomse.content)
-
-
- if __name__ == "__main__":
- h = HuangMan()
- h.start()
复制代码 |
|