|

楼主 |
发表于 2017-5-10 01:51:10
|
显示全部楼层
- import urllib.request
- import re
- import os
# Build the download directory <cwd>/tupian once at import time.
path = os.getcwd()
new_path = os.path.join(path, 'tupian')
# makedirs(..., exist_ok=True) is atomic w.r.t. the existence check:
# the original "if not exists: mkdir" could race and raise FileExistsError.
os.makedirs(new_path, exist_ok=True)
-
def open_url(url):
    """Fetch *url* with a browser User-Agent and return the raw body as bytes.

    The fake Chrome User-Agent avoids the site rejecting the default
    urllib client string.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
    req = urllib.request.Request(url, headers=headers)
    # BUG FIX: the original never closed the response object (socket leak
    # across the many calls this crawler makes); "with" closes it reliably.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_p_addrs(url):
    """Return the .jpg filenames linked from the listing page at *url*."""
    pattern = r'<a href=".*?"> ([^"]+\.jpg)</a>'
    page_html = open_url(url).decode('utf-8', 'ignore')
    # Each match is the anchor text, e.g. "something.jpg".
    return re.findall(pattern, page_html)
-
def find_img(url):
    """Scrape *url* and return the list of image src URLs found on it.

    Returns an empty list when nothing matches.
    """
    html = open_url(url).decode('utf-8', 'ignore')
    # All spaces and newlines are stripped from the html below, so the
    # pattern must NOT contain a space either.
    # BUG FIX: the original pattern was r'src="(.*?)" width=' — the space
    # before "width" could never match the space-stripped html, so the
    # scraper always reported "no images found".
    p = r'src="(.*?)"width='
    img_addrs = re.findall(p, html.replace("\n", "").replace(" ", ""))  # 避免换行符和空格干扰,直接替换掉
    if img_addrs != []:
        print('此次链接爬取中,共获得%d张图片' % len(img_addrs))
    else:
        print('# -*-没有找到任何图片')
    print(img_addrs)
    # BUG FIX: the original had no return, so save_img(find_img(...))
    # received None and crashed when iterating it.
    return img_addrs
def save_img(img_addrs):
    """Download every image URL in *img_addrs* into new_path.

    Each file is named after the last path component of its URL.
    """
    os.chdir(new_path)
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            img = open_url(each)
            # BUG FIX: the original read "f,write(img)" — a comma, which
            # builds a throwaway tuple instead of calling f.write(), so
            # every saved file was empty.
            f.write(img)

# Module-level counter of crawled links; incremented in downdloading().
x = 1
-
def downdloading():
    """Crawl the fixed listing page and download every image it links to."""
    global x  # hoisted out of the loop; one declaration is enough
    url = 'http://i.niupic.com/images/2016/01/01/'

    page_addrs = get_p_addrs(url)
    print('共获得%d个链接' % len(page_addrs))
    for each in page_addrs:
        new_page_url = url + each
        print('正在爬取第%d个链接' % x)
        print(new_page_url)
        x += 1
        # find_img scrapes the page; save_img writes the results to disk.
        save_img(find_img(new_page_url))
-
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    downdloading()
-
复制代码
就这样子 老是匹配不出图片 |
|