|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
import bs4
import re
import os
def open_url(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the response object.

    A desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page or image to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response returned by ``requests.get``; callers read ``.text``
        for decoded page source or ``.content`` for raw bytes.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    return requests.get(url, headers=headers, timeout=timeout)
def find_img_url(html):
    """Extract the ``.jpg`` image addresses from a fetched page.

    Args:
        html: Response-like object whose ``text`` attribute holds the
            decoded page source.

    Returns:
        A list of the ``src`` values of ``img`` tags that end in ``.jpg``,
        in document order (duplicates are kept).
    """
    # Restrict the capture to a single quoted attribute value. A greedy
    # ``.+`` would run on to the last ``.jpg"`` on the line and glue
    # several tags together into one bogus address.
    pattern = r'img src="([^"\n]+\.jpg)"'
    return re.findall(pattern, html.text)
def main(num=10):
    """Download the images linked from the first ``num`` listing pages.

    Creates an ``OOXX`` directory under the current working directory,
    changes into it, and saves every distinct image found as ``0.jpg``,
    ``1.jpg``, ... in the order the addresses were first seen.

    Args:
        num: Number of listing pages (``/a/more_1.html`` up to
            ``/a/more_<num>.html``) to scan. Defaults to 10.
    """
    # exist_ok lets a re-run reuse the folder instead of raising
    # FileExistsError as a bare os.mkdir would.
    os.makedirs('OOXX', exist_ok=True)
    os.chdir('OOXX')

    urlz = 'http://www.meizitu.com'
    img_list = []
    for n in range(1, num + 1):
        url = urlz + '/a/more_' + str(n) + '.html'
        img_list.extend(find_img_url(open_url(url)))

    # dict.fromkeys drops repeated addresses while keeping first-seen order.
    unique_imgs = list(dict.fromkeys(img_list))

    for name, each in enumerate(unique_imgs):
        # Download before opening the file so a failed request does not
        # leave an empty .jpg behind.
        data = open_url(each).content
        with open(str(name) + '.jpg', 'wb') as f:
            f.write(data)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|