|
5鱼币
这是代码
#调用库
import urllib.request as u_request
import os,re,base64
import time
import requests
# Request headers that make the crawler look like a desktop Chrome browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}
def url_open(url):
    """Fetch *url* and return the response body as text.

    The request is sent with the module-level ``header`` dict. The body is
    read through ``Response.text``, so the character encoding is whatever
    requests determines for the response.
    """
    response = requests.get(url, headers=header)
    return response.text
def find_images(url):
    """Return the addresses of all ``.jpg`` images found on the page at *url*.

    The page is fetched with ``url_open`` and scanned for
    ``<img src="....jpg"`` attributes. Each address is printed as it is
    collected.

    Addresses that do not already start with ``http://`` or ``https://``
    get ``http:`` prepended, which turns protocol-relative ``//host/path``
    addresses into usable URLs. Addresses that already carry a scheme are
    returned unchanged.
    """
    html = url_open(url)
    # Capture everything between the quotes of src="..." that ends in .jpg.
    pattern = r'<img src="([^"]+\.jpg)"'
    img_addrs = []
    for src in re.findall(pattern, html):
        # Prepending blindly would turn "http://..." into "http:http://...".
        if not src.startswith(('http://', 'https://')):
            src = 'http:' + src
        print(src)
        img_addrs.append(src)
    return img_addrs
def save_images(floder, img_addrs):
    """Download every image in *img_addrs* into the current working directory.

    Each file is named after the last ``/``-separated segment of its URL.
    A download that fails with ``OSError`` or ``ValueError`` is reported
    with ``print`` and skipped, so one bad address does not stop the rest.

    Args:
        floder: Not used by this function; files are written relative to
            the current working directory. The misspelled name is kept so
            existing callers are unaffected.
        img_addrs: Iterable of image URLs.
    """
    for img_url in img_addrs:
        try:
            req = u_request.Request(img_url, headers=header)
            # The context manager closes the HTTP response even if read() fails.
            with u_request.urlopen(req) as response:
                image_bytes = response.read()
            filename = img_url.split('/')[-1]
            with open(filename, 'wb') as f:
                f.write(image_bytes)
        except (OSError, ValueError) as error:
            print(error)
def web_link_encode(url, folder, first_page=1, last_page=104):
    """Crawl a range of listing pages and download the images on each.

    For every page number the page URL is built as
    ``url + base64(str(page)) + '=#comments'``, printed, scanned with
    ``find_images`` and the results handed to ``save_images``. The loop
    sleeps two seconds after each page.

    Args:
        url: Base URL the encoded page token is appended to.
        folder: Passed through to ``save_images``.
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive). Nothing is
            fetched when it is smaller than ``first_page``.
    """
    # NOTE(review): the original first assigned '20210425_' to the page
    # string and immediately overwrote it with str(i), so only the bare
    # page number was ever encoded. That dead assignment is removed here;
    # confirm whether a date prefix was actually intended.
    for page in range(first_page, last_page + 1):
        token = base64.b64encode(str(page).encode('utf-8')).decode()
        # The literal '=' after the token is kept exactly as the original had it.
        page_url = url + token + '=#comments'
        print(page_url)
        img_addrs = find_images(page_url)
        save_images(folder, img_addrs)
        time.sleep(2)
def download_the_graph(url):
    """Create the ``graph`` directory, switch into it and run the crawl.

    The working directory is changed so that the downloaded files, which
    are written with relative names, end up inside ``graph``.
    """
    folder = 'graph'
    # exist_ok lets the script be re-run; plain os.mkdir raised
    # FileExistsError once the directory was already there.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    web_link_encode(url, folder)
# Script entry point: start the crawl from the board's base URL.
if __name__ == '__main__':
    url = 'http://jandan.net/girl/'
    download_the_graph(url)
我想问 m = r'<img src="([^"]+\.jpg)"' 这一句:为什么正则表达式这样写,
就能匹配 http://wx2.sinaimg.cn/mw600/0076BSS5ly8gqeiqg7b0fj318y0u0b29.jpg 这种网址?
最好解释得详细一点。
<img src=" 是网页源代码里本来就有的写法:图片地址前面是什么,正则里就照着写什么,只是用来定位。[^"]+ 表示一个或多个不是双引号的字符,\.jpg 匹配字面上的 ".jpg";括号是捕获组,所以 re.findall 只返回引号里面的图片地址。
|
|