The error and my code are below. Any help would be much appreciated.
Traceback (most recent call last):
  File "C:\Python27\test\download_mm.py", line 138, in <module>
    download_ip()
  File "C:\Python27\test\download_mm.py", line 26, in download_ip
    str_ip=re.search(r'(([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5])',html).group()
AttributeError: 'NoneType' object has no attribute 'group'
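From what I can tell, re.search returns None when the pattern matches nothing in the string, and calling .group() on that None raises exactly this AttributeError. A minimal standalone demonstration (separate from the script below):

import re

text = 'no address here at all'
match = re.search(r'(([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5])', text)
print(match)             # None -- the pattern found no match
if match is not None:    # guard before calling .group()
    print(match.group())
# calling match.group() without the guard raises:
# AttributeError: 'NoneType' object has no attribute 'group'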
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib2
import os
import os.path
import random
import datetime
import re

# Fetch proxy IPs and save them to a local .ini file
def download_ip(folder='AGENT_IP'):
    if os.path.exists('C:\\Python27\\test\\'+folder)==False:
        os.mkdir(folder)  # create the folder
    os.chdir(folder)  # change into that folder
    # the .ini file is named after today's date
    today=datetime.date.today()
    if os.path.isfile('C:\\Python27\\test\\'+folder+'\\'+str(today)+'.ini')==False:
        url='http://www.kuaidaili.com/'
        html=url_open(url).decode('utf-8','ignore')

        # match IP addresses
        ip_addrs=[]
        a=0
        while a!=-1:
            str_ip=re.search(r'(([01]?\d?\d|2[0-4]\d|25[0-5])\.){3}([01]?\d?\d|2[0-4]\d|25[0-5])',html).group()
            print str_ip
            a=html.find(str_ip)
            str_port=re.search(r'[0-9]{2,4}',html[a+20:]).group()
            ip_addrs.append(str_ip+':'+str_port)
            html=html[a+80:]
        for each_ip in ip_addrs:
            print each_ip
        # open the file and write the results
        '''
        f=open(file_name,'wb')
        f.write(ip_addrs)
        f.close
        '''

def url_open_agent(url):
    req=urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')  # pretend to be a browser
    # use a proxy IP
    '''
    proxy=['120.195.202.189:80']
    proxy=random.choice(proxy)
    proxy_support=urllib2.ProxyHandler({'http':proxy})
    opener=urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
    '''
    response=urllib2.urlopen(req)
    html=response.read()  # both images and text come through this function; image bytes must not be decoded, so text decoding is left to the caller
    return html

def url_open(url):
    req=urllib2.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')  # pretend to be a browser
    # use a proxy IP
    '''
    proxy=['120.195.202.189:80']
    proxy=random.choice(proxy)
    proxy_support=urllib2.ProxyHandler({'http':proxy})
    opener=urllib2.build_opener(proxy_support)
    urllib2.install_opener(opener)
    '''
    response=urllib2.urlopen(req)
    html=response.read()  # both images and text come through this function; image bytes must not be decoded, so text decoding is left to the caller
    return html

def get_page(url):
    #pass
    html=url_open(url).decode('utf-8')
    a=html.find('current-comment-page')+23
    b=html.find(']',a)  # position of the first ']' after a
    #print(html[a:b])
    return html[a:b]

def find_imgs(url):  # how can I skip ad images when collecting addresses???
    #pass
    html=url_open(url).decode('utf-8')
    img_addrs=[]
    a=html.find('img src=')
    while a!=-1:
        #b=html.find('.jpg',a,a+255)

        c=html.find('sinaimg',a,a+50)
        if c!=-1:
            b=html.find('.jpg',a,a+255)  # a regular expression would handle this better and could match any image format
        else:
            b=-1

        if b!=-1:
            img_addrs.append(html[a+9:b+4])
        else:
            b=a+9
        a=html.find('img src=',b)

    '''
    for each in img_addrs:
        print(each)
    '''
    return img_addrs

def save_imgs(folder,img_addrs):
    #pass
    for each in img_addrs:
        filename=each.split('/')[-1]  # split off the path, keep only the file name
        with open(filename,'wb') as f:
            img=url_open(each)
            f.write(img)

def download_mm(folder='OOXX',pages=10):
    if os.path.exists('C:\\Python27\\test\\'+folder)==False:
        os.mkdir(folder)  # create the folder
    os.chdir(folder)  # change into that folder

    url='http://jandan.net/ooxx/'
    page_num=int(get_page(url))  # grab the current page number from the page
    for i in range(pages):
        page_num-=i
        page_url=url+'page-'+str(page_num)+'#comments'
        img_addrs=find_imgs(page_url)  # collect image addresses into a list
        save_imgs(folder,img_addrs)

if __name__=='__main__':
    # fetch proxy IPs and save them to a local .ini file
    download_ip()
    #download_mm()
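From the traceback, my guess is that the while loop keeps slicing html until a chunk with no IP address left is reached, at which point re.search returns None and the next .group() call crashes. A sketch of that part as a separate helper, just to show the None guards (the function name extract_ip_addrs is mine, and the a+20 / a+80 offsets are copied from the original without re-checking the current page layout):

import re

def extract_ip_addrs(html):
    # Collect 'ip:port' strings from the page text, stopping cleanly when nothing matches
    ip_pattern = re.compile(r'(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d?\d|2[0-4]\d|25[0-5])')
    ip_addrs = []
    while True:
        match = ip_pattern.search(html)
        if match is None:          # no more IPs in the remaining text: stop instead of crashing
            break
        str_ip = match.group()
        a = html.find(str_ip)
        port_match = re.search(r'[0-9]{2,4}', html[a+20:])
        if port_match is None:     # no port found after this IP: stop as well
            break
        ip_addrs.append(str_ip + ':' + port_match.group())
        html = html[a+80:]         # skip past this entry, same offset as the original
    return ip_addrs

Another option would be a single re.findall over the whole page that captures the IP and port together, which avoids the manual find/slice bookkeeping entirely.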