import requests
import bs4
import os
import re


def open_url(url):
    # Request a page with a browser User-Agent so the site does not reject the bot outright
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
    response = requests.get(url, headers=headers)
    return response


def get_url(url):
    # Collect the album links from the gallery listing page
    resp = open_url(url)
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    targets = soup.find_all('li', class_='galleryli')
    uu = []
    for each in targets:
        uu.append('https://www.nvshens.net' + each.div.a['href'])
    return uu


def get_pages(response):
    # The site shows 3 images per page; derive the page count from the total
    # in the "dinfo" block (the last 3 characters are the "张照片" label)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    page1 = soup.find_all('div', id='dinfo')
    for each in page1:
        pa = int(each.span.text[:-3])
        if pa % 3:
            pag = pa // 3 + 1
        else:
            pag = pa // 3
    print(pag)
    return pag


def down_pages(url):
    # Extract the image URLs from the page source with a regex
    req = open_url(url)
    p = r"img src='(.+?\.jpg)' alt="
    tar = re.findall(p, req.text)
    return tar


def main():
    os.makedirs('ll', exist_ok=True)  # os.mkdir would crash on a second run
    os.chdir('ll')
    u = input('Enter the listing URL: ')
    tt = get_url(u)
    list1 = []
    for url in tt:
        res = open_url(url)
        pages = get_pages(res)
        for i in range(1, pages + 1):
            ur = url + '/' + str(i) + '.html'
            tar = down_pages(ur)
            for each in tar:
                list1.append(each)
    i = 0
    print(list1)
    for n in list1:
        i += 1
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
        nn = requests.get(n, headers=headers, allow_redirects=False)
        with open(str(i) + '.jpg', 'wb') as f:
            f.write(nn.content)


if __name__ == '__main__':
    main()
I'm trying to scrape images from the nvshens site. After downloading, some of the files open fine while others won't open, as shown in the attached screenshot. Why is that?
First of all, the .jpg extension is something you added yourself; the binary data you actually scraped is not necessarily a JPEG data stream. Perhaps some anti-scraping measure on the site served you non-image data when you requested the image, or the site returned obfuscated data; both are possible. So even with a .jpg extension, the file isn't a real image, and it makes sense that the system reports the image format as unsupported.
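A quick way to confirm this is to check the magic bytes of what you actually received before writing it to disk: every real JPEG starts with FF D8 FF. Below is a minimal sketch of a download helper that validates the bytes and prints diagnostics for anything that is not a JPEG; the fetch_image name is mine, and the Referer header is an assumption (many image hosts reject hotlinked requests that lack one), not something this site is confirmed to check.

import requests

JPEG_MAGIC = b'\xff\xd8\xff'  # the first three bytes of every JPEG file

def fetch_image(url, index):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36',
        # Assumption: pretend the request comes from the site itself;
        # remove this line if the host does not care about the Referer.
        'Referer': 'https://www.nvshens.net/',
    }
    resp = requests.get(url, headers=headers, allow_redirects=False)
    data = resp.content
    if data.startswith(JPEG_MAGIC):
        with open(str(index) + '.jpg', 'wb') as f:
            f.write(data)
        return True
    # Not a JPEG: likely an HTML error page, a redirect stub, or an
    # anti-scraping placeholder. Print enough to tell which one it is.
    print(url, resp.status_code,
          resp.headers.get('Content-Type'), data[:16])
    return False

Running the failing URLs through this helper and looking at the printed status code, Content-Type, and first bytes will usually tell you immediately whether the site served an error page or a placeholder instead of the real image.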