|
楼主 |
发表于 2022-4-28 19:15:29
|
显示全部楼层
我最后导出来的html咋是这样的啊。。。。
import re
import urllib.request
import chardet
def get_encoding(res):
    """Guess the text encoding of a raw byte string.

    Args:
        res: Raw bytes (for example an HTTP response body) that are handed
            to ``chardet.detect``.

    Returns:
        The encoding name reported by chardet, with 'GB2312' replaced by
        'GBK'. Falls back to 'utf-8' when chardet reports no encoding.
    """
    coding_method = chardet.detect(res)['encoding']
    if not coding_method:
        # chardet reports None when it cannot identify the data (e.g. empty
        # input); returning None would make the caller's decode() raise.
        return 'utf-8'
    if coding_method.upper() == 'GB2312':
        # GBK extends GB2312, so decoding with it also accepts pages that
        # are detected as GB2312 but use characters from the wider set.
        return 'GBK'
    return coding_method
def open_url(url):
    """Fetch *url* and return its body decoded to text.

    The request is sent with a desktop-browser ``user-agent`` header. The
    body's encoding is guessed with ``get_encoding``. The decoded page is
    also printed to stdout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page as ``str``.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36')
    # The context manager closes the connection as soon as the body is read
    # instead of leaving it open until garbage collection.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
    # Guard against a missing detection result so decode() always gets a name.
    cod = get_encoding(raw) or 'utf-8'
    # The encoding is only a guess: replace undecodable bytes rather than
    # aborting the whole download on a single bad byte.
    html = raw.decode(cod, errors='replace')
    print(html)
    return html
def get_img(html):
    """Download every ``.jpg`` referenced by a ``src="..."`` attribute in *html*.

    Each image is saved in the current working directory under the last
    path segment of its URL. The list of matched URLs is printed first.

    Args:
        html: Page source to scan.

    Returns:
        None.

    Raises:
        ValueError, urllib.error.URLError: Propagated from ``urlretrieve``
            when a matched URL cannot be fetched.
    """
    # The capture group makes findall return only the URL. Without it the
    # matches include the surrounding `src="` and closing quote, which is
    # not a fetchable URL and leaves a stray quote in the file name.
    p = r'src="([^"]+\.jpg)"'
    imagelist = re.findall(p, html)
    print(imagelist)
    for each in imagelist:
        filename = each.split('/')[-1]
        urllib.request.urlretrieve(each, filename, None)
if __name__ == '__main__':
    # Script entry point: fetch the hard-coded photo page, then download
    # every .jpg it references.
    url = 'http://tieba.baidu.com/photo/p?kw=%E5%BC%A0%E9%9D%93%E9%A2%96&ie=utf-8&flux=1&tid=7679993387&pic_id=0a090ef3d7ca7bcbc5ea47fffb096b63f724a851&pn=1&fp=2&see_lz=1#!/pid0a090ef3d7ca7bcbc5ea47fffb096b63f724a851/pn1'
    page_source = open_url(url)
    get_img(page_source)
|
-
|