|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
- import urllib.request
- import os
- # 把打开网页的过程封装成函数(伪装用浏览器)
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    The request is sent with a desktop Firefox ``User-Agent`` header
    instead of urllib's default agent string.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    )
    # Open the prepared Request, not the bare URL string; passing the URL
    # would silently drop the header added above.
    with urllib.request.urlopen(req) as response:
        return response.read()
- # 将每个图片的地址封装到img_addrs列表
def find_img(url):
    """Collect the ``.jpg`` addresses stored in ``data-imgurl`` attributes.

    The page at *url* is fetched with ``url_open`` and decoded as UTF-8.
    For every ``data-imgurl=`` occurrence, the text between the attribute's
    opening quote and the next ``.jpg`` (inclusive) is taken as an address.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of address strings in page order; empty when the page
        contains no ``data-imgurl=`` attribute followed by ``.jpg``.

    Raises:
        UnicodeDecodeError: If the page body is not valid UTF-8.
    """
    marker = 'data-imgurl='
    suffix = '.jpg'
    # Skip the attribute name plus its opening quote to reach the address.
    value_offset = len(marker) + 1

    html = url_open(url).decode('utf-8')
    img_addrs = []
    start = html.find(marker)
    while start != -1:
        # Look only a bounded distance ahead so an attribute that does not
        # hold a .jpg address cannot swallow a later, unrelated ".jpg".
        end = html.find(suffix, start, start + 255)
        if end != -1:
            img_addrs.append(html[start + value_offset:end + len(suffix)])
            resume = end
        else:
            resume = start + value_offset
        start = html.find(marker, resume)
    return img_addrs
- #将图片保存到文件中
def save_imgs(folder, img_addrs):
    """Download every address in *img_addrs* into the current directory.

    Args:
        folder: Not used by this function; files are written relative to
            the current working directory. Kept so existing callers work.
        img_addrs: Iterable of image URLs. Each file is named after the
            text following the last ``/`` of its URL.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
- #主函数,保存图片
def download_mm(folder='hawaii'):
    """Create *folder*, switch into it and download the search results.

    The Baidu image-search page for the hard-coded query is scanned with
    ``find_img`` and every address found is saved with ``save_imgs``.

    Args:
        folder: Directory to create (if missing) and download into. The
            process's working directory is changed to it.
    """
    # exist_ok lets the script be re-run; os.mkdir raised FileExistsError
    # whenever the folder was already there.
    os.makedirs(folder, exist_ok=True)
    # save_imgs writes relative to the working directory, so move into it.
    os.chdir(folder)
    url = 'http://image.baidu.com/search/index?ct=201326592&cl=2&lm=-1&tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E5%A8%81%E5%A4%B7'
    img_addrs = find_img(url)
    save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
复制代码
运行后文件夹创建成功,但里面没有导出任何图片。在 find_img 函数下面用 print(img_addrs) 测试了一下,结果既没有打印内容,也没有报错。问题应该就出在这个函数里,但是不知道哪里错了,求助各位大佬。
本帖最后由 gopythoner 于 2017-4-16 14:37 编辑
我帮你改了2个地方,试过了可以得到图片
看代码
- import urllib.request
- import os
- import re #引入正则表达式--------------------改动1
- # 把打开网页的过程封装成函数(伪装用浏览器)
def url_open(url):
    """Return the body of *url* as bytes, requesting it as a browser would.

    Args:
        url: Absolute URL to fetch.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0'
    req = urllib.request.Request(url)
    req.add_header('User-Agent', user_agent)
    # Pass the Request object (not the URL string) so the User-Agent
    # header is actually sent, and close the connection when done.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    return html
- # 将每个图片的地址封装到img_addrs列表
def find_img(url):
    """Return the Baidu-hosted ``.jpg`` addresses found in the page at *url*.

    The page is fetched with ``url_open`` and decoded as UTF-8. Every
    shortest ``http:`` ... ``.jpg`` run on a single line is a candidate;
    only candidates containing ``baidu.com`` are kept.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of address strings in page order (possibly empty).

    Raises:
        UnicodeDecodeError: If the page body is not valid UTF-8.
    """
    html = url_open(url).decode('utf-8')
    # Raw string: "\." in a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    candidates = re.findall(r"(http:.*?\.jpg)", html)
    return [addr for addr in candidates if "baidu.com" in addr]
- #将图片保存到文件中
def save_imgs(folder, img_addrs):
    """Fetch each image address and store it in the working directory.

    Args:
        folder: Not used here; output paths are relative to the current
            working directory. Retained for interface compatibility.
        img_addrs: Iterable of image URLs. The part after the final ``/``
            of each URL becomes the file name.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Fetch first: if the request fails, no empty file is created.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
- #主函数,保存图片
def download_mm(folder='hawaii'):
    """Download the images of a fixed Baidu image search into *folder*.

    Args:
        folder: Target directory. It is created when missing and becomes
            the process's working directory.
    """
    # Tolerate an already existing folder so the script can be re-run;
    # os.mkdir would raise FileExistsError the second time.
    os.makedirs(folder, exist_ok=True)
    # save_imgs writes into the working directory, hence the chdir.
    os.chdir(folder)
    url = 'http://image.baidu.com/search/index?ct=201326592&cl=2&lm=-1&tn=baiduimage&ie=utf-8&word=%E5%A4%8F%E5%A8%81%E5%A4%B7'
    img_addrs = find_img(url)
    save_imgs(folder, img_addrs)


if __name__ == '__main__':
    download_mm()
复制代码
由于网页是动态生成的,所以下载到的图片数量与你在浏览器里实际能看到的会有差异。动态网页的信息爬取比较复杂,你现在这种做法实现不了。
|
|