|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import urllib.request
import urllib.error
from http import server, client
import os, re, math
# Request headers sent with every fetch; a desktop-browser User-Agent string
# is supplied instead of urllib's default one.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/44.0.2403.157 Safari/537.36'
    ),
}
def urlOpen(url, timeout=30):
    """Fetch *url* with the module-level ``header`` and return the raw body.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait on blocking socket operations before
            giving up. Without a timeout a stalled server would block the
            script forever and the caller's retry logic would never run.

    Returns:
        The response body as ``bytes``.

    Raises:
        urllib.error.URLError: On connection or HTTP-level failures
            (``HTTPError`` is a subclass).
        OSError: On socket errors, including timeouts.
        ValueError: If *url* is malformed or has an unknown scheme.
    """
    req = urllib.request.Request(url, headers=header)
    # The context manager closes the underlying connection even when read()
    # raises, so repeated failures do not leak sockets.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read()
def tryToGet(url, retries=5):
    """Fetch *url* via ``urlOpen``, retrying on failure.

    Args:
        url: Absolute URL to request.
        retries: Maximum number of attempts (default 5, the original
            hard-coded value).

    Returns:
        The response body as ``bytes``, or ``None`` if every attempt failed.
    """
    for _ in range(retries):
        try:
            return urlOpen(url)
        # Only network/URL failures are treated as retryable: URLError,
        # HTTPError, socket and timeout errors are all OSError subclasses;
        # HTTPException covers protocol errors such as a truncated read;
        # ValueError covers malformed URLs. A bare ``except`` would also
        # swallow KeyboardInterrupt and hide programming errors.
        except (OSError, ValueError, client.HTTPException):
            continue
    return None
def getSubPage(url):
    """Return the sub-page paths linked from the index page at *url*.

    The page is fetched with ``tryToGet`` and scanned for
    ``<li class="sN"><a href="/tupian/...">`` entries; the part of each href
    that follows ``/tupian/`` (1 to 20 characters) is collected.

    Args:
        url: URL of the index page to scan.

    Returns:
        A list of path fragments (strings). The list is empty when the page
        could not be fetched or contains no matching links.
    """
    response = tryToGet(url)
    if response is None:
        print('**********当前页面获取失败')
        return []
    # Undecodable bytes are replaced rather than raised on, so one bad byte
    # in the page does not abort the whole crawl.
    html = response.decode('utf-8', errors='replace')
    pattern = re.compile(r'<li class="s\d{1,2}"><a href="/tupian/(.{1,20})">')
    return pattern.findall(html)
# Running 1-based index used to name the next saved image file; it is
# incremented by saveImgInPage after each successful download.
imgCount = 1


def saveImgInPage(url):
    """Download every thumbnail image found on the page at *url*.

    The page is fetched with ``tryToGet`` and scanned for ``<img src=...>``
    tags inside ``<div class="il_img">`` elements. Each image that downloads
    successfully is written to ``<imgCount>.jpg`` in the current working
    directory and the module-level ``imgCount`` is incremented. Images that
    fail to download are skipped without consuming a number.

    Args:
        url: URL of the page to scan.

    Returns:
        None. Progress and page-level failures are reported with ``print``.
    """
    global imgCount
    # Local import so this block does not depend on a new file-level import.
    from urllib.parse import urljoin

    print('********正在获取页面' + url)
    response = tryToGet(url)
    if response is None:
        print('**********当前页面获取失败!')
        return

    # Undecodable bytes are replaced so a single bad byte cannot abort the
    # page.
    html = response.decode('utf-8', errors='replace')
    # Non-greedy quantifiers keep each match within a single div/img pair;
    # the greedy form over-captured when several images shared one line.
    pattern = re.compile(r'<div class="il_img".*?<img src="(.*?)" alt.*?</div>')
    for src in pattern.findall(html):
        # Resolve relative and protocol-relative src values against the page
        # URL; absolute URLs pass through urljoin unchanged.
        data = tryToGet(urljoin(url, src))
        if data is None:
            continue
        with open(str(imgCount) + '.jpg', 'wb') as f:
            f.write(data)
        print('**********目前已成功获取%d张图片!' % imgCount)
        imgCount += 1
def work():
    """Run the crawl: collect sub-pages from the index and save their images.

    Creates the ``图片素材`` directory if needed and makes it the process's
    working directory, because ``saveImgInPage`` writes files relative to the
    current directory. Then every sub-page returned by ``getSubPage`` for the
    index URL is passed to ``saveImgInPage``.

    Raises:
        OSError: If the output directory cannot be created or entered
            (for example, a regular file with that name already exists).
    """
    # exist_ok=True avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('图片素材', exist_ok=True)
    os.chdir('图片素材')
    url = 'http://www.ivsky.com/tupian/'
    for each in getSubPage(url):
        saveImgInPage(url + each)
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    work()
|
|