import urllib.request
import re
import os
# Create the download directory "tupian" under the current working directory.
path = os.getcwd()
new_path = os.path.join(path, 'tupian')
# makedirs(exist_ok=True) replaces the original exists()/mkdir() pair:
# it is atomic with respect to the check, so there is no race window.
os.makedirs(new_path, exist_ok=True)
def open_url(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent header is sent so the server does not
    reject the request as coming from a script.
    """
    request = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'},
    )
    with urllib.request.urlopen(request) as response:
        return response.read()
def get_p_addrs(url):
    """Return the .jpg file names linked from the directory page at *url*.

    The page is expected to contain anchors like ``<a href="...">name.jpg</a>``;
    the visible link text (the file name) is returned for each match.
    """
    html = open_url(url).decode('utf-8', 'ignore')
    # The original pattern demanded exactly one literal space between the tag
    # and the file name ('"> ('), which silently matched nothing when the
    # server's listing used different whitespace.  Allow any amount of
    # whitespace (including none) on either side of the name instead.
    pattern = r'<a href="[^"]*">\s*([^<"]+?\.jpg)\s*</a>'
    return re.findall(pattern, html)
def find_img(url):
    """Fetch the page at *url* and return the list of image URLs found.

    Fixes two bugs in the original:
    - it stripped every space and newline from the HTML, but then searched
      for the pattern ``src="..." width=`` which *contains* a space — so it
      could never match anything; whitespace is now handled in the pattern;
    - it only printed ``img_addrs`` and never returned it, so the caller
      (save_img) received None and crashed when iterating.
    """
    html = open_url(url).decode('utf-8', 'ignore')
    # \s* tolerates newlines/extra spaces between the src attribute and width.
    p = r'src="(.*?)"\s*width='
    img_addrs = re.findall(p, html)
    if img_addrs:
        print('此次链接爬取中,共获得%d张图片' % len(img_addrs))
    else:
        print('# -*-没有找到任何图片')
    print(img_addrs)
    return img_addrs
def save_img(img_addrs):
    """Download every image URL in *img_addrs* into the 'tupian' directory.

    The file name is taken from the last path component of each URL.
    """
    os.chdir(new_path)
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download first, then open the file: if open_url raises we do not
        # leave an empty stub file behind.
        img = open_url(each)
        with open(filename, 'wb') as f:
            # Original had 'f,write(img)' — a comma-for-dot typo that raised
            # NameError at runtime and never wrote any data.
            f.write(img)


# 1-based counter for the progress messages printed by downdloading().
x = 1
def downdloading():
    """Crawl the image index page and download every picture it links to.

    Uses the module-level counter ``x`` to number the progress messages.
    """
    global x
    url = 'http://i.niupic.com/images/2016/01/01/'
    page_addrs = get_p_addrs(url)
    print('共获得%d个链接' % len(page_addrs))
    for each in page_addrs:
        new_page_url = url + each
        print('正在爬取第%d个链接' % x)
        print(new_page_url)
        x += 1
        save_img(find_img(new_page_url))
# Script entry point: crawl only when run directly, not when imported.
if __name__ == '__main__':
    downdloading()
# NOTE(review): trailing author remark kept as a comment so the file stays
# valid Python — "就这样子 老是匹配不出图片" ("like this, it never matches any images").