This post was last edited by Stubborn on 2019-1-20 07:28.
Written this way, can it fetch the images at all? Where did I go wrong? Please advise, thanks. The site responds with "You are not authorized to view this page" (403).
# -*- coding: utf-8 -*-
import requests, bs4, os
from fake_useragent import UserAgent

ua = UserAgent()
os.makedirs(r'G:\down', exist_ok=True)  # exist_ok takes a boolean
os.chdir(r'G:\down')

def get_url(url):
    """Fetch a page with a randomized User-Agent and return all of its <span> tags."""
    headers = {
        'User-Agent': ua.random
    }
    page_data = requests.get(url, headers=headers)
    soup_data = bs4.BeautifulSoup(page_data.text, 'html.parser')
    img_link = soup_data.find_all("span")
    return img_link

def get_Atlas_dict(img_link):
    """Collect each gallery's URL and title from a list page."""
    Atlas_dict = dict()
    try:
        for i in img_link:
            if i.a is None:
                continue
            # Skip the site-root link ("妹子图") that also appears on the first
            # page; keep URL -> title for every real gallery.
            if i.a.text == "妹子图":
                continue
            Atlas_dict[i.a["href"]] = i.a.text
    except:
        pass
    return Atlas_dict

def get_Atls_jpg(img_link, gallery_url):
    """Return the list of image URLs for one gallery."""
    Number_of_pages = []
    atls_jpg_uri = []  # holds every image URL
    for i in img_link:
        # The pagination numbers are the only purely numeric spans, so the
        # largest of them is the gallery's page count.
        try:
            Number_of_pages.append(int(i.text))
        except:
            pass
    # Visit each page of the gallery and capture its image URL. The gallery URL
    # must be passed in by the caller; hard-coding one page here (for example
    # https://www.mzitu.com/164996) would download the same set for every gallery.
    for each in range(1, max(Number_of_pages) + 1):
        url = gallery_url + "/" + str(each)
        page_data = requests.get(url)
        soup_data = bs4.BeautifulSoup(page_data.text, 'html.parser')
        img_link = soup_data.find_all("div", class_="main-image")
        for i in img_link:
            # e.g. <a href="https://www.mzitu.com/164996/2"><img alt="..." height="1050" src="https://i.meizitu.net/2018/12/28c01.jpg" width="700"/></a>
            atls_jpg_uri.append(i.p.a.img["src"])
    return atls_jpg_uri

def get_download(jpg_list):
    """Download every image in the list."""
    cont = 1  # keep the counter outside the loop, or every file is written as 1.jpg
    for i in jpg_list:
        # Note: this request carries no Referer header, which is what triggers
        # the 403 here; see the accepted answer below.
        response = requests.get(i)
        with open('%s.jpg' % cont, 'wb') as f:
            f.write(response.content)
        print("Image %s downloaded" % cont)
        cont += 1

if __name__ == "__main__":
    number = int(input("Number of list pages to download: "))
    dirs = r'G:\down'
    for each in range(1, number + 1):
        url = "https://www.mzitu.com/page/" + str(each) + "/"
        img_link = get_url(url)                # fetch the list page
        Atlas_dict = get_Atlas_dict(img_link)  # gallery URL is the key, title the value
        for img_url in Atlas_dict:
            img_uri = get_url(img_url)                          # fetch the gallery page
            atls_jpg_uri_list = get_Atls_jpg(img_uri, img_url)  # every image URL in the gallery
            # Create a directory named after the gallery title and download into it.
            os.chdir(dirs)  # return to the root first, or the directories nest
            if not os.path.exists(Atlas_dict[img_url]):
                os.makedirs(Atlas_dict[img_url])
            os.chdir(Atlas_dict[img_url])
            print("Downloading gallery: {}".format(Atlas_dict[img_url]))
            get_download(atls_jpg_uri_list)  # pass the list of image URLs, not the gallery page URL
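A note on the 403 itself before the accepted answer: the image requests in get_download above carry no Referer header, and the image host appears to reject such hotlink-style fetches; the accepted answer below sends one. A minimal sketch of just that fix, reusing the sample URLs quoted in the code above (the exact status behaviour is an assumption about the site, not something guaranteed):

import requests

# Sample values taken from the HTML snippet quoted in the code above.
img_url = 'https://i.meizitu.net/2018/12/28c01.jpg'
page_url = 'https://www.mzitu.com/164996'

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    'Referer': page_url,  # tells the image server which gallery page the request came from
}
response = requests.get(img_url, headers=headers)
print(response.status_code)  # expected: 200 with the Referer header, 403 without it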
Best answer:
import requests
from bs4 import BeautifulSoup as bs

def get_response(url):
    header = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    }
    response = requests.get(url=url, headers=header)
    return response

def get_link(response):
    soup = bs(response.text, 'lxml')
    links = soup.select('#pins li span a')  # one link per gallery on the list page
    print(len(links))
    for each in links:
        get_girl_data(each['href'])

def get_girl_data(url):
    response = get_response(url)
    soup = bs(response.text, 'lxml')
    title = soup.select_one('.main-title').text           # gallery title (unused here)
    category = soup.select_one('.main-meta span a').text  # gallery category (unused here)
    img_link = soup.select_one('.main-image p a img')['src']
    max_page = int(soup.select('.pagenavi a span')[-2].text) + 1
    download_img(img_link, url)
    # The files are numbered ...01.jpg, ...02.jpg, so each page's image URL can
    # be built from img_link, the first image, keeping the .jpg suffix.
    for each in range(2, max_page):
        if each < 10:
            img_url = f'{img_link[:-6]}0{each}.jpg'
        else:
            img_url = f'{img_link[:-6]}{each}.jpg'
        refer_url = f'{url}/{each}'
        download_img(img_url, refer_url)

def download_img(img_url, refer_url):
    header = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'referer': refer_url,  # HTTP spells this header "referer"; if it is misspelled it is simply not sent, hence the 403
    }
    # Write the rest yourself; it is only a matter of how to save the file.

def main():
    base_url = 'https://www.mzitu.com/'
    response = get_response(base_url)
    soup = bs(response.text, 'lxml')
    max_num = int(soup.select('.nav-links a')[-2].text) + 1  # total list pages; the page loop is left as an exercise
    get_link(response)

if __name__ == '__main__':
    main()
Only saving and page-turning are left to do.
I wrote this casually, so it isn't very tidy (because I'm lazy).
I'm also not used to the style of saving the images directly, so this is as far as it goes.
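For completeness, here is one way those two open pieces could look. This is only a sketch built on the answer's own functions (get_response, get_link, bs), not the answer author's code: the save directory name 'down' and the file-naming scheme are assumptions, and there is no retry or error handling.

import os
import requests
# assumes get_response, get_link and bs from the answer above are in scope

def download_img(img_url, refer_url, save_dir='down'):
    header = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'referer': refer_url,  # without this the image server answers 403
    }
    response = requests.get(img_url, headers=header)
    os.makedirs(save_dir, exist_ok=True)
    # Name the file after the last path segment of the image URL, e.g. 28c01.jpg.
    filename = os.path.join(save_dir, img_url.rsplit('/', 1)[-1])
    with open(filename, 'wb') as f:
        f.write(response.content)

def main():
    base_url = 'https://www.mzitu.com/'
    response = get_response(base_url)
    soup = bs(response.text, 'lxml')
    max_num = int(soup.select('.nav-links a')[-2].text) + 1  # last list page number + 1
    get_link(response)                              # page 1
    for page in range(2, max_num):                  # pages 2 .. last
        get_link(get_response(f'{base_url}page/{page}/'))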