| 
 | 
 
 
 楼主 |
发表于 2022-4-28 19:15:29
|
显示全部楼层
 
 
 
 
我最后导出来的 HTML 怎么是这样的啊……
 
 
import re 
import urllib.request 
import chardet 
 
def get_encoding(res):
    """Guess the text encoding of a raw byte string using chardet.

    Args:
        res: Raw bytes to inspect (here, an HTTP response body).

    Returns:
        A codec name suitable for ``bytes.decode``. A detected 'GB2312' is
        reported as 'GBK', and 'utf-8' is returned when chardet cannot
        determine any encoding.
    """
    coding_method = chardet.detect(res)['encoding']
    if not coding_method:
        # chardet reports None for empty or undetectable input; returning
        # None would make the caller's bytes.decode() raise TypeError.
        return 'utf-8'
    if coding_method.upper() == 'GB2312':
        # Decode with the wider GBK codec so pages labelled GB2312 that
        # contain GBK-only characters still decode.
        return 'GBK'
    return coding_method
 
def open_url(url):
    """Fetch *url* and return its body decoded to text.

    The request carries a desktop-browser User-Agent header. The body is
    decoded with the codec reported by ``get_encoding`` and is also printed
    to stdout before being returned.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body as a ``str``.
    """
    req = urllib.request.Request(url)
    req.add_header('user-agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36')

    # The context manager closes the connection even when read() raises.
    with urllib.request.urlopen(req) as response:
        raw = response.read()

    # Guard against a missing codec name so decode() never receives None.
    cod = get_encoding(raw) or 'utf-8'
    html = raw.decode(cod)
    print(html)
    return html
 
def get_img(html):
    """Download every ``.jpg`` referenced by a ``src="..."`` attribute.

    Each image is saved in the current working directory under the last
    path segment of its URL. The list of URLs found is printed first.

    Args:
        html: Page markup to scan.

    Returns:
        None.
    """
    # The capture group keeps only the URL; without it findall() returns
    # the whole attribute text (src="..."), which is not a usable URL.
    p = r'src="([^"]+\.jpg)"'
    imagelist = re.findall(p, html)
    print(imagelist)
    for each in imagelist:
        filename = each.split('/')[-1]
        urllib.request.urlretrieve(each, filename)
 
if __name__ == '__main__':
    # Script entry point: fetch the hard-coded page, then save the JPEGs
    # it references.
    page_url = 'http://tieba.baidu.com/photo/p?kw=%E5%BC%A0%E9%9D%93%E9%A2%96&ie=utf-8&flux=1&tid=7679993387&pic_id=0a090ef3d7ca7bcbc5ea47fffb096b63f724a851&pn=1&fp=2&see_lz=1#!/pid0a090ef3d7ca7bcbc5ea47fffb096b63f724a851/pn1'
    page_html = open_url(page_url)
    get_img(page_html)
 
 |   
- 
 
 
 
 
 
 
 
 |