xunyu posted on 2022-12-27 10:40:36

Scraping all the images from the Douban Top 250, a share

A crawler that grabs the images from the Douban Top 250, shared by a newbie! 小甲鱼 yyds!

import requests
import bs4
import re
def open_url(url):
    """Fetch a page and return its HTML as text."""
    # To route through a proxy, uncomment these two lines:
    # proxies = {'http': '176.31.154.12:80', 'https': '176.31.154.12:80'}
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    # res = requests.get(url, headers=headers, proxies=proxies)
    res = requests.get(url, headers=headers)
    return res.text
def download_img(img_url, page, number):
    """Download one image and save it under D:/1/ (the folder must already exist)."""
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}
    res = requests.get(img_url, headers=headers)
    name = 'D:/1/' + 'img' + str(page) + '&' + str(number) + '.jpg'
    if res.status_code == 200:
        with open(name, 'wb') as f:
            f.write(res.content)
def find_depth(html):
    """Return the total number of pages (the numbered link just before the 'next' button)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)
url = 'https://movie.douban.com/top250'
a = open_url(url)
b = bs4.BeautifulSoup(a, features='lxml')
count_page = []
url_all = [url]  # page 1 has no 'start' parameter in its link, so seed the list with it
# Find the URLs of the remaining pages from the pagination links
for i in b('a'):
    href = i.attrs.get('href', '')
    # only pagination links contain 'start'; skip duplicates (the 'next page' link repeats page 2)
    if re.search(r'start', href) is not None and href not in count_page:
        count_page.append(href)
for j in count_page:
    url_all.append(url + j)
print(url_all)
# Ready to crawl: visit each page and download every image on it
page = 0
number = 0
for i in url_all:
    page += 1
    a = open_url(i)
    b = bs4.BeautifulSoup(a, features='lxml')
    c = b('img')
    imgs_all = []
    for l in c:
        imgs_all.append(l.attrs['src'])
    for j in imgs_all:
        number += 1
        download_img(j, page, number)
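
By the way, find_depth is defined above but never actually called. A tidier way to build url_all (just a minimal sketch, assuming Douban keeps the same paginator markup and 25 films per page) is to read the page count once and compute the 'start' offsets, instead of scraping the <a> tags:

# Sketch: build the page URLs from the page count rather than parsing
# every <a> tag. Assumes open_url/find_depth above and 25 entries per page.
url = 'https://movie.douban.com/top250'
depth = find_depth(open_url(url))   # 10 pages for the Top 250
url_all = [url + '?start=' + str(25 * i) for i in range(depth)]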

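One more possible refinement: b('img') grabs every <img> on the page, not just the 25 posters. A sketch that keeps only the posters, assuming each poster <img> sits inside a <div class="pic"> (worth verifying against the live HTML):

# Sketch: collect only the poster images. The 'pic' class is an assumption
# about Douban's markup; check the current page source before relying on it.
soup = bs4.BeautifulSoup(open_url(url), features='lxml')
imgs_all = [div.img.attrs['src'] for div in soup.find_all('div', class_='pic')]
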
xunyu posted on 2022-12-27 10:46:59

[screenshot attachment: D:/1.png]

学习编程中的Ben posted on 2022-12-28 09:15:58

Came to take a look.