The cause is that one of the scraped links is invalid: paste it into a browser and the page comes up blank. I am still working out how to make the program skip such links and keep going, with no results yet... The invalid image link usually sits at the fourth- or fifth-from-last position, so simply throwing away the last 4 or 5 image links lets the program run normally (a hedged sketch of skipping them automatically follows the code). Below is the code for scraping Tieba images; it uses no regular expressions, but I have tested it myself and it works.
import urllib.request
import os
def url_open(url):
    # Send the request with a browser User-Agent so Tieba serves the normal page
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def get_page(url):
    # Slice the last page number out of the raw HTML, using fixed offsets
    # from the '下一页' (next page) and '尾页' (last page) anchors
    html = url_open(url).decode('utf-8')
    a = html.find('下一页') + 34
    b = html.find('尾页') - 2
    return html[a:b]
def find_pic(url):
    html = url_open(url).decode('utf-8')
    pic_addrs = []
    # Walk the HTML looking for src="...jpg" attributes, no regular expressions
    a = html.find('src=')
    while a != -1:
        b = html.find('jpg"', a, a + 255)
        if b != -1:
            # Keep only absolute links that start with 'http'
            if html[a+5:b+4].find('http') == 0:
                pic_addrs.append(html[a+5:b+3])
        else:
            b = a + 9
        a = html.find('src=', b)
    # Throw away the trailing invalid links;
    # if pic_addrs[:-4] errors out, change it to pic_addrs[:-5]
    pic_addrs = pic_addrs[:-4]
    #print(pic_addrs)
    return pic_addrs
def save_pic(folder, pic_addrs):
    # Files land in the current directory, which dl_tb_pic has already chdir'ed into
    for each in pic_addrs:
        filename = each.split('/')[-1]
        pic = url_open(each)
        with open(filename, 'wb') as f:
            f.write(pic)
def dl_tb_pic(folder='tiebapicture', pages=3):
    os.mkdir(folder)
    os.chdir(folder)
    url = input('Paste the Baidu Tieba thread URL here and press Enter: ')
    pages = int(input('Collection runs backwards from the last page; enter the number of pages to collect: '))
    #url = ''
    page_num = int(get_page(url))
    #print(page_num)
    for i in range(pages):
        page_url = url + '?pn=' + str(page_num)
        print('Now collecting page %d' % page_num)
        pic_addrs = find_pic(page_url)
        save_pic(folder, pic_addrs)
        page_num -= 1
    print('Collection finished!')
if __name__ == '__main__':
    dl_tb_pic()
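
As for skipping the invalid link automatically instead of trimming the last 4 or 5 entries: below is a minimal sketch of a replacement save_pic, assuming the blank-page link either makes urlopen raise or comes back with an empty body. I have not tested this against Tieba; it only uses the standard urllib.error exceptions.

import urllib.error

def save_pic(folder, pic_addrs):
    for each in pic_addrs:
        filename = each.split('/')[-1]
        try:
            pic = url_open(each)
        except urllib.error.URLError:
            # The link could not be fetched at all (HTTPError is a subclass); skip it
            print('Skipping invalid link: ' + each)
            continue
        if not pic:
            # Empty response body, i.e. the "blank page" case; skip it as well
            print('Skipping empty link: ' + each)
            continue
        with open(filename, 'wb') as f:
            f.write(pic)

If this works, the pic_addrs = pic_addrs[:-4] trim in find_pic should no longer be needed.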