Lesson 56 OOXX: jandan.net anti-scraping problem. The program was all written, only to trigger the anti-scraping defence at the very last step. Despair!!
jandan.net protects its images, and all I manage to scrape is an anti-counterfeit code instead of the actual pictures. Which anti-scraping measure is this, and is there a way around it? Not being able to finish this lesson cleanly really bothers me. Code and run results below:

from urllib import request
from bs4 import BeautifulSoup
import re
import base64
import os
def main():
    # Create a folder for the downloads
    os.mkdir('OOXX')
    os.chdir('OOXX')
    url = 'http://jandan.net/ooxx'
    # Fetch the front page
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
    req = request.Request(url, headers=headers)
    response = request.urlopen(req).read().decode('utf-8')
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(response, 'html.parser')
    links_list = soup.find(attrs={'class': 'cp-pagenavi'})
    # Find the starting page number and link; returns a dict {'page': '', 'link': ''}
    pages_list = findlinks(links_list)
    page = int(pages_list['page'])
    page_link = 'http:' + pages_list['link']
    stop_page = page - 5
    # Grab the images on the current page
    comment_list = soup.find(attrs={'id': 'comments'})
    print(comment_list)
    img_list = comment_list.find_all('img')
    print(img_list)
    # Download each image
    for item in img_list:
        img_url = 'http:' + item['src']
        print(img_url)
        filename = item['src'].split('/')[-1]
        headers2 = {}
        headers2['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
        req2 = request.Request(img_url, headers=headers2)
        response2 = request.urlopen(req2).read()
        with open(filename, 'wb') as f:
            f.write(response2)
    while True:
        # Download the images on this page
        find_img(page_link)
        # Work out the link of the next page to visit
        page_link = handle_link(page_link)
        page -= 1
        if page == stop_page:
            break
def findlinks(links_list):
    target = links_list.find('a')
    page = target.text
    link = target['href']
    res = {'page': page, 'link': link}
    return res
def handle_link(page_link):
    # The page URL ends in a base64 token that decodes to 'YYYYMMDD-pagenumber'
    page_num = re.search('jandan.net/ooxx/(.+?)#', page_link).group(1)
    page_num = str(base64.b64decode(page_num), 'utf-8')
    date, num = page_num.split('-')
    num = str(int(num) - 1)
    page_num = '-'.join((date, num))
    page_num = str(base64.b64encode(page_num.encode('utf-8')), 'utf-8')
    link = 'http://jandan.net/ooxx/' + page_num + '#comments'
    return link
def find_img(page_link):
    # Fetch the page
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
    req = request.Request(page_link, headers=headers)
    response = request.urlopen(req).read().decode('utf-8')
    # Pick out the image tags with BeautifulSoup
    soup = BeautifulSoup(response, 'html.parser')
    comment_list = soup.find(attrs={'class': 'commentlist'})
    img_list = comment_list.find_all('img')
    # Download each image
    for item in img_list:
        img_url = 'http:' + item['src']
        filename = item['src'].split('/')[-1]
        headers2 = {}
        headers2['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'
        req2 = request.Request(img_url, headers=headers2)
        response2 = request.urlopen(req2).read()
        with open(filename, 'wb') as f:
            f.write(response2)
if __name__ == '__main__':
    main()
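For clarity, here is a minimal sketch of the previous-page token round trip that handle_link is meant to perform. The token value is only an illustrative example (the base64 encoding of '20210114-56'); it is not taken from a live page, and the 'YYYYMMDD-pagenumber' format is assumed from the code above.

import base64

# Illustrative token: base64 of '20210114-56' (date-pagenumber).
token = 'MjAyMTAxMTQtNTY='

# Decode the token back into its readable date-pagenumber form.
plain = str(base64.b64decode(token), 'utf-8')     # '20210114-56'
date, num = plain.split('-')

# Step back one page, re-encode, and rebuild the page URL.
prev_plain = '-'.join((date, str(int(num) - 1)))  # '20210114-55'
prev_token = str(base64.b64encode(prev_plain.encode('utf-8')), 'utf-8')
print('http://jandan.net/ooxx/' + prev_token + '#comments')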
Run result:
I'm stuck on the same problem.

The page seems to just apply base64 encoding to a number. Tested it myself on 2021-02-14 and it runs:
import urllib.request
import os
import base64
def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html
def get_page(url):
    # Extract the current page number from the front page's HTML
    html = url_open(url).decode('utf-8')
    a = html.find('current-comment-page') + 23
    b = html.find(']', a)
    return html[a:b]
def find_imgs(url):
    # Collect the .jpg addresses on a page by plain string searching
    html = url_open(url).decode('utf-8')
    img_addrs = []
    a = html.find('img src=')
    while a != -1:
        b = html.find('.jpg', a, a + 255)
        if b != -1:
            img_addrs.append(html[a + 9:b + 4])
        else:
            b = a + 9
        a = html.find('img src=', b)
    return img_addrs
def save_imgs(folder, img_addrs):
    for each in img_addrs:
        filename = each.split('/')[-1]
        with open(filename, 'wb') as f:
            each = 'http:' + each
            img = url_open(each)
            f.write(img)
def download_mm(folder='ooxx1', pages=10):
    os.mkdir(folder)
    os.chdir(folder)
    url = 'http://jandan.net/ooxx'
    page_num = int(get_page(url))
    for i in range(pages):
        page_num -= 1
        # Date + base64 encoding (change the date below)
        target_num = '20210114-' + str(page_num)
        target_num = base64.b64encode(target_num.encode('utf-8'))
        page_url = url + '/' + str(target_num, 'utf-8') + '#comments'  # page link
        img_addrs = find_imgs(page_url)  # image addresses
        save_imgs(folder, img_addrs)     # download and save the images
if __name__ == "__main__":
    download_mm()
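A possible way to call the script above (a sketch only: the folder name and page count are arbitrary example values, and the hard-coded date string inside download_mm still needs to be changed to the date you are crawling, as its comment notes):

download_mm(folder='ooxx_test', pages=3)  # e.g. fetch 3 pages' images into ./ooxx_test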
qqw posted on 2021-2-14 14:03:
The page seems to just apply base64 encoding to a number. Tested it myself on 2021-02-14 and it runs.

How did you work out that it was base64 encoding?