|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
事情是这样: 我想用正则表达式爬取猫扑当前页面的图片。用小甲鱼老师的方法可以顺利下载下来, 但改用正则表达式的方法就不行了。我还不是很懂, 请大神们指点一下, 谢谢!
- import urllib.request
- import os
- import re
# url_open(url): fetch `url` with urllib and return whatever read() yields
# (callers in this script either decode it or write it to a file opened 'wb').
- def url_open(url):
- req = urllib.request.Request(url)
- req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')
# NOTE(review): urlopen() is given the plain `url`, not `req`, so the
# User-Agent header configured on `req` above is never part of the request.
- dakai = urllib.request.urlopen(url)
- html = dakai.read()
-
- return html
# find_imgs(url): download the page at `url`, decode it as UTF-8 and try to
# collect the .jpg addresses of its <img> tags.
- def find_imgs(url):
- html = url_open(url).decode('utf-8')
- img_addrs = []
# The pattern requires the tag to end right after the src value (`">`) and
# uses a greedy `.+`.
- img = r'<img src="(.+\.jpg)">'
# NOTE(review): re.findall takes (pattern, string); here the page text is
# passed as the pattern and the regex as the string being searched.
- find = re.findall(html,img)
-
# NOTE(review): `img_addrs` is still the empty list created above; the
# findall result stored in `find` is never used.
- return img_addrs
# save_img(jia, img_addrs): download each address in `img_addrs` and write it
# to a file named after the last '/'-separated component of the address.
# `jia` is accepted but not used inside this function; files are created
# relative to the current working directory.
- def save_img(jia,img_addrs):
- for each in img_addrs:
- filename = each.split('/')[-1]
# The file is opened in binary mode before the download is attempted.
- with open(filename,'wb') as f:
- img = url_open(each)
- f.write(img)
-
-
# cat_mm(jia='mao'): create the folder `jia`, change into it, then find and
# save the images of the hard-coded page below.
- def cat_mm(jia='mao'):
# os.mkdir raises FileExistsError when `jia` already exists (e.g. second run).
- os.mkdir(jia)
- os.chdir(jia)
- url = 'http://tt.mop.com/c35.html'
- img_addrs = find_imgs(url)
- save_img(jia,img_addrs)
-
-
# Run the downloader only when this file is executed as a script.
- if __name__ == '__main__':
- cat_mm()
复制代码
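1. find_imgs 函数 return 的是空列表 img_addrs, 应该 return find (也就是 re.findall 的结果)
2. re.findall 的参数顺序不对: 第一个参数应该是正则表达式, 第二个才是要搜索的字符串, 即 re.findall(img, html)
3. img 正则表达式不对: .+ 是贪婪匹配, 而且结尾的 "> 要求 .jpg 后面必须紧跟引号和 >; 改成非贪婪的 .+? 并去掉结尾的 "> (见下面的代码)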
附上修改后的代码, 推荐你先不要看我的代码, 自己一个一个改错, 有问题欢迎追问我
- import urllib.request
- import os
- import re
def url_open(url):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a desktop-Chrome ``User-Agent`` header instead
    of urllib's default one.

    Args:
        url: Address to download.

    Returns:
        The response body as ``bytes``, exactly as returned by ``read()``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36')
    # Open the Request object, not the bare URL; otherwise the header set
    # above is silently dropped. The with-block closes the connection.
    with urllib.request.urlopen(req) as dakai:
        return dakai.read()
def find_imgs(url):
    """Return the ``.jpg`` addresses of ``<img src="...">`` tags on a page.

    The page is downloaded with ``url_open`` and decoded as UTF-8.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of strings, one per match, each ending in ``.jpg``. The list
        is empty when nothing matches.

    Raises:
        UnicodeDecodeError: If the page is not valid UTF-8.
    """
    html = url_open(url).decode('utf-8')
    # [^"] keeps each capture inside a single src="..." value. With a plain
    # `.+?` a non-jpg image followed by a jpg on the same line would be
    # captured as one bogus address spanning both tags.
    img = r'<img src="([^"]+?\.jpg)'
    return re.findall(img, html)
def save_img(jia, img_addrs):
    """Download every image address and save it in the current directory.

    Each file is named after the last ``/``-separated part of its address.

    Args:
        jia: Name of the target folder. Not used here: files are written
            relative to the current working directory, which ``cat_mm``
            switches to ``jia`` before calling this function.
        img_addrs: Iterable of image addresses to download.
    """
    for each in img_addrs:
        filename = each.split('/')[-1]
        # Download before opening the file so that a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def cat_mm(jia='mao'):
    """Download the images of the hard-coded page into the folder ``jia``.

    The folder is created if needed and becomes the current working
    directory for the rest of the process.

    Args:
        jia: Folder to store the images in (default ``'mao'``).
    """
    # exist_ok=True lets the script be run again without failing on the
    # folder created by a previous run (os.mkdir raised FileExistsError).
    os.makedirs(jia, exist_ok=True)
    os.chdir(jia)
    url = 'http://tt.mop.com/c35.html'
    img_addrs = find_imgs(url)
    save_img(jia, img_addrs)
# Script entry point: start the download only when run directly, not on import.
if __name__ == '__main__':
    cat_mm()
复制代码
|
|