|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
煎蛋网对图片有保护,只能爬到一个防伪码,请教这是哪种反爬措施,有破解方法吗?没有完美结束这一课的学习,让我强迫症很不爽,代码、运行结果如下:
- from urllib import request
- from bs4 import BeautifulSoup
- import re
- import base64
- import os
def main():
    """Crawl jandan.net/ooxx and save every image into ./OOXX.

    Downloads the images on the landing page, then walks backwards
    through the previous 5 pages via the base64-encoded page token.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
    }

    # exist_ok=True: re-running the script no longer crashes with FileExistsError
    # (the original os.mkdir('OOXX') did).
    os.makedirs('OOXX', exist_ok=True)
    os.chdir('OOXX')

    # Fetch the landing page.
    url = 'http://jandan.net/ooxx'
    req = request.Request(url, headers=headers)
    response = request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(response, 'html.parser')

    # The first anchor in the page navigation carries the current page
    # number (text) and a protocol-relative link (href).
    links_list = soup.find(attrs={'class': 'cp-pagenavi'})
    pages_list = findlinks(links_list)
    page = int(pages_list['page'])
    page_link = 'http:' + pages_list['link']
    stop_page = page - 5  # crawl 5 pages beyond the landing page

    # Download the images shown on the landing page itself.
    comment_list = soup.find(attrs={'id': 'comments'})
    img_list = comment_list.find_all('img')
    for item in img_list:
        img_url = 'http:' + item['src']  # src is protocol-relative ('//...')
        print(img_url)
        filename = item['src'].split('/')[-1]
        # The original built an identical, unused headers2 dict here; reuse headers.
        req2 = request.Request(img_url, headers=headers)
        response2 = request.urlopen(req2).read()
        with open(filename, 'wb') as f:
            f.write(response2)

    # Walk backwards through the older pages (5 iterations, same as the
    # original decrement-then-break loop).
    while page > stop_page:
        find_img(page_link)
        page_link = handle_link(page_link)
        page -= 1
def findlinks(links_list):
    """Return the first pagination anchor of *links_list*.

    The result is a dict: {'page': anchor text, 'link': anchor href}.
    """
    anchor = links_list.find('a')
    return {'page': anchor.text, 'link': anchor['href']}
def handle_link(page_link):
    """Return the URL of the page preceding *page_link*.

    jandan.net encodes the page as base64('YYYYMMDD-N') in the URL path;
    decode it, decrement N, and re-encode.
    """
    # Raw string and escaped dots: the original pattern 'jandan.net/ooxx/(.+?)#'
    # let '.' match any character.
    token = re.search(r'jandan\.net/ooxx/(.+?)#', page_link).group(1)
    decoded = str(base64.b64decode(token), 'utf-8')
    date = decoded.split('-')[0]
    num = str(int(decoded.split('-')[1]) - 1)
    new_token = str(base64.b64encode('-'.join((date, num)).encode('utf-8')), 'utf-8')
    return 'http://jandan.net/ooxx/' + new_token + '#comments'
def find_img(page_link):
    """Download every image in the comment list of *page_link* into the CWD."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
    }
    # Fetch and parse the page.
    req = request.Request(page_link, headers=headers)
    response = request.urlopen(req).read().decode('utf-8')
    soup = BeautifulSoup(response, 'html.parser')
    comment_list = soup.find(attrs={'class': 'commentlist'})
    img_list = comment_list.find_all('img')
    for item in img_list:
        img_url = 'http:' + item['src']  # src is protocol-relative ('//...')
        filename = item['src'].split('/')[-1]
        # The original built an identical, unused headers2 dict here; reuse headers.
        req2 = request.Request(img_url, headers=headers)
        response2 = request.urlopen(req2).read()
        with open(filename, 'wb') as f:
            f.write(response2)
-
# Entry point: run the crawler only when executed as a script, not on import.
if __name__=='__main__':
    main()
复制代码 运行结果:
|
|