Python爬虫第六弹：Tkinter爬虫3合1，爬妹子图+*图+百度图片，申精

qqqq79852852 · 发表于 2015-9-29 08:03:23

厉害啊，来学习

小榆萍 · 发表于 2015-9-30 11:07:15

就怕流氓有文化

xaltin · 发表于 2015-10-1 11:12:31

点亮star在哪里？先点个赞

chuxue · 发表于 2015-10-5 17:56:36

lihais dfjado

classic64g · 发表于 2015-10-5 20:24:09

学习

航海王 · 发表于 2015-10-6 07:24:04

看看

wking · 发表于 2015-10-7 15:40:03

学习学习

不坏不修 · 发表于 2015-10-10 16:57:34

可能吗

jxj21 · 发表于 2015-10-16 10:36:06

如何爬

hongzhong · 发表于 2015-10-16 14:48:00

git是什么

NewSong · 发表于 2015-10-16 18:18:26

谢谢

嗯333 · 发表于 2015-10-17 23:10:33

66666

Prince8 · 发表于 2015-10-18 12:12:11

66666666666:lol:

qqqq79852852 · 发表于 2015-10-18 12:56:30

支持一下！

manny · 发表于 2015-10-19 10:33:27

两包烟的钱，把不了妹买不了田，不如拿来支持小甲鱼推出更多原创教学视频！

ianv · 发表于 2015-10-19 11:45:12

import urllib.request
import urllib.error
import os
import sys
import http.server
import http.client
import time
import re
import random
import math

# Module-wide settings shared by the request helpers below.
max_error_times = 5       # at most 5 failures are allowed; after that the download is abandoned
proxies = []              # proxy pool; get_proxy() adds 'a:b' strings, create_localhost() adds None
enctype = 'utf-8'         # encoding used to decode the fetched jandan.net pages
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
}
data = None               # passed as the `data` argument of urllib.request.Request

def create_localhost():
    """Pad the global ``proxies`` pool with ``None`` entries.

    ``change_proxy`` treats a ``None`` entry as "use this machine directly",
    so these entries give the local connection a chance of being picked.
    The number added is the current pool size multiplied by
    ``(sqrt(5) - 1) / 2`` (about 0.618), rounded down.
    """
    ratio = (math.sqrt(5) - 1) / 2
    # Truncate only after scaling by the pool size: int() applied to the
    # ratio alone is 0, which would mean no entries are ever added.
    number = int(ratio * len(proxies))
    proxies.extend([None] * number)

def get_result(req_or_url,is_retrieve=False,filename = None):
    """Open or download a URL, switching proxy and retrying after a failure.

    Args:
        req_or_url: A URL string or a ``urllib.request.Request``.
        is_retrieve: When true, download with ``urllib.request.urlretrieve``
            into ``filename``; otherwise open with ``urllib.request.urlopen``.
        filename: Target path handed to ``urlretrieve``.

    Returns:
        Whatever ``urlretrieve`` / ``urlopen`` returned, or ``None`` once
        ``max_error_times`` attempts have been used up.
    """
    attempts = 0
    # The limit is re-read on every pass, exactly like the per-iteration check.
    while attempts != max_error_times:
        attempts += 1
        try:
            if is_retrieve:
                return urllib.request.urlretrieve(req_or_url, filename)
            return urllib.request.urlopen(req_or_url)
        except urllib.error.URLError as err:
            if hasattr(err, 'code'):
                # Errors carrying a status code: show the code and the reason.
                print(err.code, err.reason)
            elif hasattr(err, 'reason'):
                print(err)
            else:
                # Neither attribute present: retry without switching proxy.
                continue
        except (ConnectionResetError, http.client.BadStatusLine) as err:
            print(err)
        except TimeoutError as err:
            print(err)
            print('服务器长时间无响应，自动切换代理.....')
        # Every reported failure is followed by a proxy switch before retrying.
        change_proxy()
    print('失败次数达%d次......放弃操作' % max_error_times)
    return None

def get_proxy():
    """Scrape a proxy listing page and add its HTTP entries to ``proxies``.

    Fetches ``http://www.xici.net.co`` through ``get_result`` and matches each
    table row with a verbose regex that captures the six cells after the
    first one. For every match except the first, if the fifth captured cell
    equals ``'HTTP'``, the first two captured cells are joined with ``':'``
    and appended to the global ``proxies`` list (presumably IP and port; the
    page layout is not visible here, so verify against the live page).

    If the page cannot be fetched, ``proxies`` is left unchanged.
    """
    req = urllib.request.Request('http://www.xici.net.co',None,headers)
    response = get_result(req)
    if response is None:
        # get_result gives up with None after repeated failures; without this
        # guard response.read() would raise AttributeError.
        print('获取代理列表失败.....')
        return
    html = response.read().decode('utf-8')
    p = re.compile(r'''<tr\sclass[^>]*>\s+
                                 <td>.+</td>\s+
                                 <td>(.*)?</td>\s+
                                 <td>(.*)?</td>\s+
                                 <td>(.*)?</td>\s+
                                 <td>(.*)?</td>\s+
                                 <td>(.*)?</td>\s+
                                 <td>(.*)?</td>\s+
                              </tr>''',re.VERBOSE)
    proxy_list = p.findall(html)
    # The first match is skipped, as in the original code.
    # NOTE(review): the reason is not visible here; confirm it is not a real entry.
    for each_proxy in proxy_list[1:]:
        if each_proxy[4] == 'HTTP':
            proxies.append(each_proxy[0]+':'+each_proxy[1])

def change_proxy():
    """Install a global urllib opener that uses a randomly chosen proxy.

    One entry is picked at random from the global ``proxies`` list. A ``None``
    entry, or an empty list, means no proxy is configured for the opener.
    The opener sends the ``User-Agent`` from the global ``headers`` and is
    installed with ``urllib.request.install_opener``, so later ``urlopen`` /
    ``urlretrieve`` calls go through it.
    """
    # random.choice raises IndexError on an empty sequence. The pool is still
    # empty while get_proxy() is fetching the proxy list, so fall back to a
    # direct connection instead of crashing in the retry path.
    proxy = random.choice(proxies) if proxies else None
    if proxy is None:
        proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({'http':proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',headers['User-Agent'])]
    urllib.request.install_opener(opener)
    print('智能切换代理：%s' % ('本机' if proxy is None else proxy))

def get_page():
    """Return the largest page number shown on the jandan.net/ooxx home page.

    The page is fetched through ``get_result`` and decoded with ``enctype``.
    The number is read from the text that follows the marker
    ``current-comment-page``: three characters after the marker are skipped
    and everything up to the next ``]`` is converted with ``int``.

    Raises:
        SystemExit: if the home page could not be fetched.
        ValueError: if the marker or the closing ``]`` is missing, or the
            text between them is not an integer.
    """
    home = 'http://jandan.net/ooxx'
    req = urllib.request.Request(home,data,headers)
    response = get_result(req)
    if response is None:
        # Same failure handling as get_pic(): get_result returns None after
        # giving up, and response.read() would otherwise raise AttributeError.
        print('获取页面失败.....')
        sys.exit()
    html = response.read().decode(enctype)
    find_string = 'current-comment-page'
    # +3 skips the characters between the marker and the first digit.
    find_start = html.index(find_string) + len(find_string) + 3
    find_end = html.index(']',find_start+1)
    return int(html[find_start:find_end])
# NOTE(review): `test` is not referenced anywhere in the visible code; confirm before removing.
test = None
def get_pic(page):
    """Yield picture URLs, walking from ``page`` down to page 1.

    For each page, ``http://jandan.net/ooxx/page-<n>`` is fetched through
    ``get_result`` and every ``<img src="...">`` whose URL starts with
    ``http://`` and ends in ``.jpg``, ``.jpeg`` or ``.gif`` is yielded.
    There is a 5 second pause between pages.

    Args:
        page: Page number to start from. Values below 1 yield nothing.

    Raises:
        SystemExit: if a page could not be fetched.
    """
    # Compile once instead of once per page.
    pic_pattern = re.compile(r'<img\s+src="(http://.+?\.(?:jpg|jpeg|gif))"')
    # Checking before the first fetch means a start page below 1 requests nothing.
    while page >= 1:
        url = 'http://jandan.net/ooxx/page-%d' % page
        print('当前页面：%d' % page)
        req = urllib.request.Request(url,data,headers)
        response = get_result(req)
        if response is None:
            print('获取页面失败.....')
            sys.exit()
        html = response.read().decode(enctype)
        for match in pic_pattern.finditer(html):
            yield match.group(1)
        page -= 1
        if page >= 1:
            # Pause only when another page follows.
            time.sleep(5)

# Directory that download() writes the pictures into (Windows-style path).
save_path = 'D:\\图片\\妹子图'

def download():
    """Download every picture yielded by ``get_pic`` into ``save_path``.

    The target directory is created if it does not exist. Each picture is
    saved under the last path component of its URL. A picture counts as
    downloaded only when ``get_result`` returns something other than ``None``;
    failed ones are reported and skipped.
    """
    count = 1
    # Create the directory once instead of checking for it on every picture.
    os.makedirs(save_path, exist_ok=True)
    # Replace get_page() with a page number (e.g. 1000) to start from that page.
    for pic_url in get_pic(get_page()):
        file_name = os.path.basename(pic_url)
        target = os.path.join(save_path, file_name)
        if get_result(pic_url,True,target) is None:
            # get_result gave up on this picture; do not report it as a success.
            print('下载失败，跳过该图片: %s' % pic_url)
            continue
        print('本次成功下载第%d个图片! %s' % (count , pic_url))
        count += 1

if __name__ == '__main__':
    # Script entry point: build the proxy pool, pad it with local entries,
    # then start downloading. The order matters, so run the steps in sequence.
    for step in (get_proxy, create_localhost, download):
        step()

Anson.zeng · 发表于 2015-10-20 08:27:50

看看，，，，，，

ipenny · 发表于 2015-10-22 17:53:02

收点儿鱼币不过分吧

lyman2501 · 发表于 2015-10-26 16:09:11

五道杠 · 发表于 2015-10-27 12:49:33

看看

账号		自动登录	找回密码
密码			立即注册

[作品展示] Python爬虫第六弹：Tkinter爬虫3合1，爬妹子图+*图+百度图片，申精