来钓鱼 发表于 2017-12-12 15:18:18

图片素材(供学习交流)

import urllib.request
import urllib.error
from http import server, client
import os, re, math

# Request headers sent with every fetch; the User-Agent identifies as desktop Chrome.
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'}

def urlOpen(url, timeout=30):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address to request.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the script forever.

    Returns:
        The undecoded response body.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError`` or a socket timeout); callers that want
        retries should go through ``tryToGet``.
    """
    req = urllib.request.Request(url, None, header)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read()

def tryToGet(url, retries=5):
    """Fetch *url* with ``urlOpen``, retrying on failure.

    Args:
        url: Address to request.
        retries: Maximum number of attempts before giving up.

    Returns:
        The response body as bytes, or ``None`` if every attempt failed.
    """
    for _ in range(retries):
        try:
            return urlOpen(url)
        except Exception:
            # Best-effort download: any ordinary error counts as one failed
            # attempt. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit still stop the script.
            continue

    return None

def getSubPage(url):
    """Return the sub-page paths linked from the index page at *url*.

    The page is scanned for ``<li class="sN"><a href="/tupian/...">`` entries
    and the part of each href after ``/tupian/`` is collected.

    Args:
        url: Address of the index page.

    Returns:
        A list of captured path fragments (strings); empty if the page could
        not be downloaded.
    """
    response = tryToGet(url)
    if response is None:
        print('**********当前页面获取失败')
        return []

    # Undecodable bytes are replaced rather than raising, so one bad byte
    # does not abort the run; the pattern below only needs ASCII markup.
    html = response.decode('utf-8', errors='replace')

    p = re.compile(r'<li class="s\d{1,2}"><a href="/tupian/(.{1,20})">')
    return p.findall(html)

# Running sequence number; used as the file name of the next image saved.
imgCount = 1
def saveImgInPage(url):
    """Download every image found on the page at *url* into the current directory.

    Images are located via the ``<div class="il_img">`` markup and written as
    ``<imgCount>.jpg``; the module-level ``imgCount`` is incremented after
    each successful save. Images that cannot be fetched are skipped.

    Args:
        url: Address of the page to scan.
    """
    global imgCount
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    print('********正在获取页面' + url)
    response = tryToGet(url)
    if response is None:
        print('**********当前页面获取失败!')
        return

    # Undecodable bytes are replaced rather than raising; the pattern below
    # only needs the ASCII markup around the image address.
    html = response.decode('utf-8', errors='replace')

    p = re.compile(r'<div class="il_img".*<img src="(.*)" alt.*</div>')
    imgList = p.findall(html)

    for src in imgList:
        # Resolve relative or protocol-relative src values against the page
        # URL; absolute URLs pass through unchanged.
        data = tryToGet(urljoin(url, src))
        if data is None:
            continue
        with open(str(imgCount) + '.jpg', 'wb') as f:
            f.write(data)
        print('**********目前已成功获取%d张图片!' % imgCount)
        imgCount += 1
def work():
    """Run the scraper: prepare the output folder, then save images from each sub-page.

    Creates the '图片素材' directory if needed, makes it the working directory,
    reads the sub-page list from the site index and processes every entry.
    """
    targetDir = '图片素材'
    # Ensure the output folder exists, then write all files relative to it.
    os.makedirs(targetDir, exist_ok=True)
    os.chdir(targetDir)

    url = 'http://www.ivsky.com/tupian/'
    for subPage in getSubPage(url):
        # Each entry is the path fragment after /tupian/ on the index page.
        saveImgInPage(url + subPage)

# Start the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    work()





页: [1]
查看完整版本: 图片素材(供学习交流)