|
|
发表于 2016-5-14 17:51:20
|
显示全部楼层
import urllib.request
import os
import re
def url_open(url, timeout=30):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser ``User-Agent`` header is attached to the request.
    The URL is printed to stdout after the body has been read.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled connection cannot hang forever.

    Returns:
        The undecoded response body.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36',
    )
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        html = response.read()
    print(url)
    return html
def get_page(url):
    """Return the page count shown on the page at *url*, as a string.

    The page is fetched with ``url_open``, decoded as UTF-8 and searched for
    the pager fragment ``共<span class="red">N</span>页``. Every match is
    printed, and the first one is returned.

    Args:
        url: Address of the page to inspect.

    Returns:
        The digits of the first page-count match (the caller converts to int).

    Raises:
        ValueError: If the page contains no page-count fragment.
    """
    html = url_open(url).decode('utf-8')
    # Capture digits only: a greedy ``.+`` would swallow everything up to the
    # last ``</span>页`` on the line when the fragment appears more than once.
    pattern = re.compile(r'共<span class="red">(\d+)</span>页')
    pagenum = pattern.findall(html)
    print(pagenum)
    if not pagenum:
        raise ValueError('no page count found in ' + url)
    return pagenum[0]
def find_imgs(pageurl):
    """Return the ``.jpg`` image addresses found on the page at *pageurl*.

    The page is fetched with ``url_open`` and decoded as UTF-8. An address is
    collected when it is the ``src`` of an ``img`` tag carrying
    ``class="BDE_Image"``. Each address is printed with everything up to its
    last ``//`` removed.

    Args:
        pageurl: Address of the page to scan.

    Returns:
        A list of image addresses in document order (possibly empty).
    """
    html = url_open(pageurl).decode('utf-8')
    # Raw string: ``\.`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python.
    pattern = r'img.+?class="BDE_Image".+?src="([^"]+\.jpg).+?>'
    imageaddress = re.findall(pattern, html)
    for each in imageaddress:
        filename = each.split('//')[-1]
        print(filename)
    return imageaddress
def save_imgs(folder, imageaddress):
    """Download every address in *imageaddress* into the current directory.

    Each file is named after the last ``/``-separated component of its
    address, so a later address with the same final component overwrites an
    earlier one.

    Args:
        folder: Unused; files are written to the current working directory.
            Kept so existing callers continue to work.
        imageaddress: Iterable of image addresses to download.
    """
    for each in imageaddress:
        filename = each.split('/')[-1]
        urllib.request.urlretrieve(each, filename)
def download(folder='tiebapic3', url="http://tieba.baidu.com/p/3544335017"):
    """Download the images from every page of a thread into *folder*.

    The folder is created under the current working directory if it does not
    exist, and the process then changes into it, so the working directory is
    different after this call. The page count comes from ``get_page``; each
    page ``url?pn=<i>`` is scanned with ``find_imgs`` and its images are
    stored with ``save_imgs``.

    Args:
        folder: Directory name, relative to the current working directory,
            that receives the images.
        url: Address of the thread's first page. Defaults to the thread the
            script was originally written for.
    """
    new_path = os.path.join(os.getcwd(), folder)
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(new_path, exist_ok=True)
    # save_imgs writes into the current directory, so move there first.
    os.chdir(new_path)
    pagenum = int(get_page(url))
    print(pagenum)
    for i in range(1, pagenum + 1):
        pageurl = url + '?pn=' + str(i)
        imageaddress = find_imgs(pageurl)
        save_imgs(folder, imageaddress)
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    download()
这个是我看完视频后写的,保证好用。请保存成新的 .py 文件,在 IDLEX 中运行。 |
|