简单的多线程图片爬虫
本帖最后由 lj2122 于 2021-3-13 08:04 编辑
import requests, os
from lxml import etree
from requests.adapters import HTTPAdapter
from threading import Thread
class DownLoad:
    """Multithreaded image crawler for a forum-style thread listing.

    Constructing an instance immediately starts the crawl (``__init__`` calls
    ``get_man``). Every image found in a thread is fetched on its own thread
    and written to ``./img/<thread title>/<file name>``.
    """

    # Prefix prepended to the relative thread links found on a listing page.
    BASE_URL = 'https://m1.p990.xyz/2048/'

    # Listing-page URL with a ``{page}`` placeholder. The original script
    # never defined this URL, so it has to be supplied by the caller.
    list_url_template = None

    # Characters stripped from a thread title before it becomes a directory
    # name: the original three ('@', '/', '?') plus the remaining characters
    # that are not allowed in Windows file names.
    _FORBIDDEN = str.maketrans('', '', '@/?\\:*"<>|')

    def __init__(self, list_url_template=None, page=2):
        """Set up the HTTP session and start crawling.

        Args:
            list_url_template: URL of a listing page containing a ``{page}``
                placeholder that is filled with the 1-based page number.
            page: Number of listing pages to walk (the original hard-coded 2).

        Raises:
            ValueError: If no listing URL template is available.
        """
        super().__init__()
        self.headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3861.400 QQBrowser/10.7.4313.400'}
        self.s = requests.Session()
        # Title of the thread currently being downloaded; down_load_pic()
        # reads it to build the target directory.
        self.man_name = ''
        if list_url_template is not None:
            self.list_url_template = list_url_template
        # max_retries=2: retry a failed connection up to two times.
        self.s.mount('http://', HTTPAdapter(max_retries=2))
        self.s.mount('https://', HTTPAdapter(max_retries=2))
        self.get_man(page)

    @staticmethod
    def _safe_name(title_nodes, fallback='untitled'):
        """Turn the ``//h1/text()`` xpath result into a usable directory name.

        ``xpath`` returns a list, so the first text node is used. An empty
        result, or a title that is empty after cleaning, yields *fallback*.
        """
        title = title_nodes[0] if title_nodes else ''
        cleaned = title.translate(DownLoad._FORBIDDEN).strip()
        return cleaned or fallback

    def _fetch_tree(self, url):
        """Download *url* and return the page parsed as an lxml HTML tree."""
        response = self.s.request("GET", url=url, headers=self.headers, timeout=(10, 20))
        # A stray invalid byte should not abort the whole crawl.
        return etree.HTML(response.content.decode('utf-8', errors='replace'))

    def get_man(self, page, skip_first_page=True):
        """Walk the listing pages and download every thread linked from them.

        Args:
            page: Number of listing pages to walk, starting at page 1.
            skip_first_page: When true, the links of page 1 are not processed.

        Raises:
            ValueError: If ``list_url_template`` has not been set.
        """
        if not self.list_url_template:
            raise ValueError(
                "list_url_template is not set; pass the listing URL "
                "(with a '{page}' placeholder) to DownLoad()")
        for page_no in range(1, page + 1):
            # NOTE(review): the original deleted the collected links on page 1
            # and then crashed iterating them. Discarding page 1 is the
            # literal reading; presumably only pinned entries were meant to
            # be dropped - confirm and pass skip_first_page=False if so.
            if skip_first_page and page_no == 1:
                continue
            tree = self._fetch_tree(self.list_url_template.format(page=page_no))
            hrefs = tree.xpath("//tr[@align='center']/td[@class='tal']/a/@href")
            for href in hrefs:
                thread_url = self.BASE_URL + href
                print(thread_url)
                self.get_pic(thread_url)

    def get_pic(self, url):
        """Download all images of the thread at *url*, one thread per image.

        Sets ``self.man_name`` from the page's ``<h1>`` text and blocks until
        every worker thread has finished.
        """
        tree = self._fetch_tree(url)
        pic = tree.xpath("//div[@class='f14']//img/@src")
        self.man_name = self._safe_name(tree.xpath("//h1/text()"))
        threads = [Thread(target=self.down_load_pic, args=(src,)) for src in pic]
        for thread in threads:
            thread.start()
        # Wait for all workers so man_name is not changed while they run.
        for thread in threads:
            thread.join()

    def down_load_pic(self, res):
        """Fetch the image at URL *res* into ``./img/<man_name>/``.

        An image whose target file already exists is skipped. A response
        with an error status is reported and not written to disk.
        """
        file_name = res.split('/')[-1]
        target_dir = './img/' + self.man_name
        pic_name = target_dir + '/' + file_name
        if os.path.isfile(pic_name):
            print(file_name, '.....picture exist.....')
            return
        response = self.s.request("GET", url=res, headers=self.headers, timeout=(10, 20))
        if not response.ok:
            # Do not save an error page under an image file name.
            print('Download failed', response.status_code, res)
            return
        print('Download....', "picture")
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(target_dir, exist_ok=True)
        with open(pic_name, "wb") as f:
            print('save picture', pic_name)
            f.write(response.content)
if __name__ == '__main__':
    # Instantiating DownLoad runs the whole crawl from its constructor,
    # so the instance itself is not kept.
    DownLoad()
    print('download finished')
页:
[1]