|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
import os
import time
from bs4 import BeautifulSoup
def open_url(url):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Referer": url
}
soup=requests.get(url,headers=headers)
return soup
def get_list(soup):
html=BeautifulSoup(soup.content, 'lxml')
a_list=html.find_all("ul",attrs={'id':'pins'})[0].find_all('li')
for each in a_list:
href_list.append(each.find_all('span')[0].find_all('a')[0]['href'])
alt_list.append(each.find_all('span')[0].find_all('a')[0].string)
return (alt_list,href_list)
def img_save(img):
name=img.split('/')[-1]
with open(name,'wb') as f:
f.write(open_url(img).content)
def make_dir(pwd):
os.mkdir(pwd)
print("创建文件夹:"+pwd)
os.chdir(pwd)
def get_img(each):
i = 0
while True:
page_url = each + "//" + str(i)
soup = open_url(page_url)
img_url = BeautifulSoup(soup.content, 'lxml')
img = img_url.find_all('div', attrs={'class': 'main-image'})[0].img['src']
print("正在抓取图片:" + img)
img_save(img)
time.sleep(3)
page = img_url.find_all('div', attrs={'class': 'pagenavi'})[0].find_all('span')[-2].string
i = i + 1
if i == 1:
i = 2
if i > int(page):
break
href_list = []
alt_list = []
y=0
for i in range(7,8):
h=open_url("https://www.mzitu.com/page/"+str(i))
alt_list,h_list=get_list(h)
for each in h_list:
make_dir(alt_list[y])
y = y + 1
get_img(each)
os.chdir(r"C:\Users\jok\PycharmProjects\untitled")
爬取到一半时候会报这个错误:
Traceback (most recent call last):
File "C:/Users/jok/PycharmProjects/untitled/pange.py", line 62, in <module>
get_img(each)
File "C:/Users/jok/PycharmProjects/untitled/pange.py", line 41, in get_img
img = img_url.find_all('div', attrs={'class': 'main-image'})[0].img['src']
IndexError: list index out of range
|
|