|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
不要宣扬。回过头来看当初爬它的时候,各种百度,各种难题,哈哈;现在再看,这种静态页面已经是 so easy 了,还能变着花样爬它。
import os
import re
import time

import requests
from lxml import etree
from OpenSSL import SSL
# Default HTTP headers sent with every listing/album page request.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    'Referer': 'http://www.mzitu.com',
}

# Number of listing pages to crawl (the original note says 24 per page).
QUANTITY = 220

# Download directory. Set an absolute path, or leave as None to save next to
# this script.
FILE_PATH = None
def get_path(name, num=" "):
    """Return the ``.jpg`` path for image *num* of album *name*.

    The album directory is ``<base>/<name>``, where ``<base>`` is ``FILE_PATH``
    when it is set and the directory containing this script otherwise.

    When *num* is left at its default (a single space) the call acts as an
    existence query: it returns ``False`` without creating anything if the
    album directory does not exist yet. In every other case the album
    directory is created if needed and the image path is returned.

    :param name: album name, used as the directory name
    :param num: image number within the album, used as the file stem
    :return: ``False`` (query mode, directory missing) or the image path
    """
    base_dir = FILE_PATH
    if base_dir is None:
        # Default: save next to this script.
        base_dir = os.path.dirname(os.path.realpath(__file__))
    album_dir = os.path.join(base_dir, name)

    if num == " " and not os.path.exists(album_dir):
        return False

    # Each album gets its own directory, also under a custom FILE_PATH;
    # otherwise identically numbered images from different albums would
    # overwrite each other.
    os.makedirs(album_dir, exist_ok=True)
    return os.path.join(album_dir, num) + ".jpg"
def get_response(url, delay=2, timeout=None):
    """Fetch *url* with the shared ``HEADERS`` after a short pause.

    :param url: address to request
    :param delay: seconds to sleep before the request (throttles the crawl)
    :param timeout: passed through to ``requests.get``; ``None`` waits
        indefinitely, which is the original behaviour
    :return: the ``requests`` response object
    """
    time.sleep(delay)
    return requests.get(url=url, headers=HEADERS, timeout=timeout)
def atlas():
    """Yield one ``{"name": ..., "img_url": ...}`` dict per album found.

    Walks the listing pages, extracts album links and album names with
    regular expressions, pairs them positionally and prints each name as it
    is produced.

    :yield: dict with the album name and the album page URL
    """
    link_pattern = re.compile(r'<li><a href="(.*?)" target="_blank">')
    name_pattern = re.compile(r"alt='(.*?)' width=")

    # Crawl pages 1..QUANTITY so that exactly QUANTITY pages are requested and
    # the last one is reached; the original started at page 0.
    # NOTE(review): assumes the site numbers listing pages from 1 - confirm.
    for page in range(1, QUANTITY + 1):
        url = "https://www.mzitu.com/page/{page}/".format(page=page)
        html = get_response(url=url).text
        album_urls = link_pattern.findall(html)
        album_names = name_pattern.findall(html)
        for album_name, album_url in zip(album_names, album_urls):
            print(album_name)
            yield {"name": album_name, "img_url": album_url}
def get_download_url(item):
    """Yield one item per page of the album described by *item*.

    Fetches the album page, reads the page count from the pagination bar and
    yields a fresh copy of *item* for every page, extended with:

    * ``"download"`` - URL of that album page
    * ``"number"`` - page number as a string (used as the file name)

    *item* itself is not modified, so the yielded dicts stay distinct even
    when the caller collects them.

    :param item: dict with at least ``"img_url"`` (album page URL)
    :yield: copy of *item* with ``"download"`` and ``"number"`` set
    """
    html = get_response(url=item["img_url"]).text
    tree = etree.HTML(html)
    page_count = tree.xpath("//div[@class='pagenavi']/a[5]/span/text()")
    if not page_count:
        # Pagination bar missing or laid out differently: skip this album
        # instead of aborting the whole crawl with an IndexError.
        print("no page count found for %s" % item["img_url"])
        return

    # NOTE(review): page 0 is requested as in the original; confirm whether
    # the site serves a distinct image there.
    for page in range(0, int(page_count[0]) + 1):
        page_item = dict(item)
        page_item["download"] = item["img_url"] + "/" + str(page)
        page_item["number"] = str(page)
        yield page_item
def download(item):
    """Download the image shown on one album page and save it to disk.

    Requests ``item["download"]`` (using that URL as the Referer), extracts
    the image source from the ``main-image`` block, fetches the image and
    writes it to the path given by ``get_path(item["name"], item["number"])``.

    Failures are reported on stdout together with the item and the function
    returns normally, so one bad page does not stop the crawl.

    :param item: dict with ``"download"``, ``"name"`` and ``"number"``
    """
    # Same browser identification as the shared headers, but with the album
    # page itself as Referer.
    headers = dict(HEADERS, Referer=item["download"])
    try:
        time.sleep(2)
        page = requests.get(url=item["download"], headers=headers)
        # Do not parse error pages as if they were album pages.
        page.raise_for_status()
        tree = etree.HTML(page.text)
        download_url = tree.xpath("//div[@class='main-image']/p/a/img/@src")[0]
        image = requests.get(url=download_url, headers=headers)
        # Do not save an HTTP error body as a .jpg file.
        image.raise_for_status()
        file_path = get_path(item["name"], item["number"])
        with open(file_path, "wb") as fp:
            fp.write(image.content)
    except (SSL.SysCallError, requests.exceptions.RequestException,
            IndexError, OSError) as e:
        # IndexError: no image node on the page; OSError: file could not be
        # written.
        print("当前出现错误%s" % e)
        print(item)
def product(c):
    """Feed every album produced by ``atlas()`` into consumer *c*.

    :param c: a coroutine-style consumer exposing ``send`` and ``close``
        (here: the generator returned by ``customer()``)
    """
    # Prime the generator so it is paused at its first ``yield``.
    c.send(None)
    try:
        for img in atlas():
            c.send(img)
    finally:
        # Close the consumer even if producing or consuming raised.
        c.close()
def customer():
    """Consumer coroutine: download every page of each album sent to it.

    Yields an empty string back to the sender after each album. Receiving a
    falsy value ends the coroutine.
    """
    data = ""
    while True:
        n = yield data
        if not n:
            return
        for each in get_download_url(item=n):
            download(each)
def main():
    """Wire the consumer coroutine to the producer and run the crawl."""
    func = customer()
    product(func)


if __name__ == '__main__':
    main()
复制代码 |
|