A crawler that grabs the Douban Top 250 images. Sharing as a beginner! 小甲鱼 yyds
import requests
import bs4
import re
def open_url(url):
    """Fetch a page and return its HTML text."""
    # Use a proxy if needed:
    # proxies = {'http': '176.31.154.12:80', 'https': '176.31.154.12:80'}
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    # res = requests.get(url, headers=headers, proxies=proxies)
    res = requests.get(url, headers=headers)
    return res.text

def download_img(img_url, page, number):
    """Download one image; page and number are used to build the file name."""
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    # res = requests.get(img_url, headers=headers, proxies=proxies)
    res = requests.get(img_url, headers=headers)
    name = 'D:/1/' + 'img' + str(page) + '&' + str(number) + '.jpg'
    if res.status_code == 200:
        with open(name, 'wb') as f:
            f.write(res.content)

def find_depth(html):
    """Return the number of the last page, read from the pager (not used below)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)
url = 'https://movie.douban.com/top250'
a = open_url(url)
b = bs4.BeautifulSoup(a,features = 'lxml')
count_page = []
url_all = []
imgs_all = []
# Collect the URLs of all the paging links (the hrefs that contain 'start')
for i in b('a'):
    href = i.attrs['href']
    print(href)
    if re.search(r'start', href) is not None:
        count_page.append(href)
for j in count_page:
    url_all.append(url + j)

# Collect all image addresses on the first page
# (note: these are never downloaded, because the loop below resets imgs_all
#  and only walks the 'start=' pages)
c = b('img')
for l in c:
    imgs_all.append(l.attrs['src'])
print(url_all)
# Start crawling: walk every paging URL and download each image it contains
page = 0
number = 0
for i in url_all:
    page += 1
    a = open_url(i)
    b = bs4.BeautifulSoup(a, features='lxml')
    c = b('img')
    imgs_all = []
    for l in c:
        imgs_all.append(l.attrs['src'])
    for j in imgs_all:
        number += 1
        download_img(j, page, number)
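
By the way, the find_depth() helper above is defined but never called. A minimal sketch of how it could drive the pagination instead of collecting every 'start' link from the page, assuming Douban's usual 25 movies per page and the '?start=' query parameter that the scraped pager links already use:

import requests
import bs4

def find_depth(html):
    # Read the last page number from the pager (same logic as in the script above).
    soup = bs4.BeautifulSoup(html, 'html.parser')
    return int(soup.find('span', class_='next').previous_sibling.previous_sibling.text)

url = 'https://movie.douban.com/top250'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
html = requests.get(url, headers=headers).text
depth = find_depth(html)  # total number of pages, e.g. 10
# Assumes 25 movies per page, so pages start at 0, 25, 50, ...
url_all = [url + '?start=' + str(25 * p) for p in range(depth)]
print(url_all)

This just prints the paging URLs; the same download loop as above can then walk them. One more note: the D:/1/ folder has to exist before running, because open(name, 'wb') will not create the directory and raises FileNotFoundError otherwise.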