# Forum paste header: posted on 2016-9-15 15:08:10 (forum UI text "show all replies" omitted).
import os
import re
import urllib.request
# Open a web page and fetch its source.
def get_html(url):
    """Download *url* and return the response body decoded as UTF-8.

    A browser-like ``User-Agent`` header is attached to the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page source. If the page cannot be opened, read or
        decoded, a failure notice is printed and an empty string is
        returned so callers can keep going with "no content".
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import block.
    from http.client import HTTPException

    try:
        req = urllib.request.Request(url)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36',
        )
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8')
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError and socket failures; ValueError
        # covers malformed URLs and UnicodeDecodeError.
        print('网页打开失败.....')
        return ''
# Collect the content-page addresses from a post listing page.
def get_page(html):
    """Return the absolute URLs of the posts linked from a listing page.

    Args:
        html: Source text of the listing page.

    Returns:
        A list of ``http://tt.mop.com/<digits>.html`` URLs in document
        order; empty when *html* is empty or contains no post links.
    """
    if not html:
        return []
    # The dot before "html" is escaped so only real ".html" links match, and
    # the title is matched non-greedily so several posts on one line are
    # each found instead of being swallowed into a single match.
    pattern = (
        r'<div class="postTitle"><a target="_blank" '
        r'href="/(\d+\.html)">.+?</a></div>'
    )
    return ['http://tt.mop.com/' + path for path in re.findall(pattern, html)]
# Download the images found on a content page.
def get_img(html):
    """Download every ``.jpg`` image referenced on a content page.

    Each image is saved into the current working directory under the last
    path segment of its URL. After all downloads finish, the number of
    images downloaded is printed.

    Args:
        html: Source text of the content page.
    """
    pattern = r'<p class="tc mb10"><img src="([^"]+\.jpg)"></p>'
    img_urls = re.findall(pattern, html) if html else []
    for img_url in img_urls:
        filename = img_url.split('/')[-1]
        urllib.request.urlretrieve(img_url, filename)
    # Report the real number of downloads; a counter that starts at 1 would
    # overstate it by one (and claim one download for an empty page).
    print('成功下载%d张图片' % len(img_urls))
if __name__ == "__main__":
    # Listing page whose posts are scraped for images.
    url = 'http://tt.mop.com/c35.html'
    urllist = get_page(get_html(url))

    # get_img() saves files by bare filename, so switch into the target
    # folder first. exist_ok avoids a separate exists-then-create check.
    os.makedirs('猫扑女郎', exist_ok=True)
    os.chdir('猫扑女郎')

    print('一共%d个图片帖子' % len(urllist))
    # enumerate supplies the 1-based page number used in the progress line.
    for y, each in enumerate(urllist, start=1):
        print('--------进入第%d页--------' % y)
        get_img(get_html(each))
    # Printed once every post has been processed, including when none
    # were found.
    print('抓取完毕!')