A crawler that downloads the Douban Top 250 poster images, shared by a beginner! 小甲鱼 yyds
import os
import re

import bs4
import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'}

def open_url(url):
    """Fetch a page and return its HTML text."""
    # To route the request through a proxy, uncomment the next line and pass proxies=proxies to requests.get().
    # proxies = {'http': '176.31.154.12:80', 'https': '176.31.154.12:80'}
    res = requests.get(url, headers=HEADERS)
    return res.text

def download_img(img_url, page, number):
    """Download one image; page and number are only used to build the file name."""
    res = requests.get(img_url, headers=HEADERS)
    name = 'D:/1/' + 'img' + str(page) + '&' + str(number) + '.jpg'
    if res.status_code == 200:
        with open(name, 'wb') as f:
            f.write(res.content)

def find_depth(html):
    """Return the total page count (the label just before the 'next' button). Defined but not used below."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    depth = soup.find('span', class_='next').previous_sibling.previous_sibling.text
    return int(depth)

url = 'https://movie.douban.com/top250'
a = open_url(url)
b = bs4.BeautifulSoup(a, features='lxml')

# Find the URLs of all the other pages: their hrefs contain 'start'.
# The first page is added up front so its posters get downloaded as well.
count_page = []
url_all = [url]
for i in b('a'):
    href = i.get('href')
    if href and re.search(r'start', href) and href not in count_page:
        count_page.append(href)
for j in count_page:
    url_all.append(url + j)
print(url_all)

# Start crawling: visit every page, collect the image addresses on it, and download them.
os.makedirs('D:/1', exist_ok=True)  # make sure the output folder exists
page = 0
number = 0
for i in url_all:
    page += 1
    a = open_url(i)
    b = bs4.BeautifulSoup(a, features='lxml')
    imgs_all = [img.attrs['src'] for img in b('img') if 'src' in img.attrs]
    for j in imgs_all:
        number += 1
        download_img(j, page, number)
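
The loop above saves every img tag on each page, which can pick up non-poster images as well. A possible refinement, assuming the Top 250 page still wraps each movie in a div with class "item" and its poster in a div with class "pic" (class names not verified here), is to narrow the selection:

# Hypothetical narrower selection: only images inside the per-movie blocks (class names are assumptions).
soup = bs4.BeautifulSoup(open_url(url), features='lxml')
poster_urls = [img['src'] for img in soup.select('div.item div.pic img')]
print(poster_urls[:5])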
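
As a quick sanity check after the script finishes, a minimal sketch (using the same D:/1/ folder hard-coded above) that counts the files it wrote:

from pathlib import Path

# Count the jpg files written by the crawler; the folder is the one hard-coded in download_img().
saved = sorted(Path('D:/1').glob('img*.jpg'))
print(len(saved), 'images downloaded')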