https://www.vmgirls.com/
匹配首页上的 10 个项目并下载,自动创建目录保存图片,并跳过已下载过的项目。
还有很多地方可以改进,比如用 selenium 点击“加载更多”来下载更多项目,有兴趣的坛友可以自行添加。
# -*- coding: utf-8 -*-
# @Time : 2021/6/24 16:16
# @Author :
# @File : vmgirls.py
# @Software : PyCharm
import os
import re
import time
from tkinter import *

import requests
from tqdm import trange
def requestURL(url, timeout=10):
    """Fetch *url* with the site's referer and a desktop-browser user-agent.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the whole run indefinitely.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers decide how to treat error responses.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    headers = {
        'referer': 'https://www.vmgirls.com/',
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/91.0.4472.114 Safari/537.36'
        ),
    }
    return requests.get(url, headers=headers, timeout=timeout)
def getTitleImgUrl(html):
    """Extract the album title and the image URLs from an album page.

    Args:
        html: Page markup as text.

    Returns:
        A two-item list ``[title, img_url_list]``. ``title`` is the text in
        ``<title>`` before the " 丨 唯美女生" suffix; ``img_url_list`` holds
        one ``http://`` URL per protocol-relative ``<a href="//...">`` anchor
        that also carries ``alt`` and ``title`` attributes (possibly empty).

    Raises:
        IndexError: If the page has no ``<title>`` with the expected suffix.
    """
    title_match = re.search(r'<title>(.*?) 丨 唯美女生</title>', html, re.S)
    if title_match is None:
        # Same exception type the bare ``findall(...)[0]`` used to raise,
        # but with a message that says what is actually missing.
        raise IndexError('page has no "<title>... 丨 唯美女生</title>" element')
    title = title_match.group(1)

    # The anchors are protocol-relative ("//host/path"); the capture drops
    # the leading slashes, so the scheme is prepended here.
    bare_urls = re.findall(r'<a href="//(.*?)" alt=".*?" title=".*?">', html, re.S)
    img_url_list = ["http://" + bare for bare in bare_urls]
    return [title, img_url_list]
def mkdir(directory_name):
    """Create ``./vmgirls/<directory_name>`` if it does not exist yet.

    Best-effort: a failure is printed rather than raised, so the caller
    keeps going.

    Args:
        directory_name: Name of the album folder to create under
            ``./vmgirls``.
    """
    try:
        # makedirs also creates the parent ./vmgirls, and exist_ok makes a
        # repeat call a no-op, so no separate existence checks are needed.
        os.makedirs(f'./vmgirls/{directory_name}', exist_ok=True)
    except (OSError, ValueError) as e:
        # ValueError covers names the OS layer rejects outright
        # (e.g. an embedded null byte).
        print(e)
def save_img(url):
    """Download every image of the album page at *url* into its own folder.

    The page is fetched with ``requestURL`` and parsed with
    ``getTitleImgUrl``. A folder named after the album title is created via
    ``mkdir``, and each image is written there as ``<title><index>.jpg``.

    A failure on one image (network error, HTTP error status, or write
    error) is printed and the loop moves on to the next image.

    Args:
        url: Address of the album page.
    """
    page = requestURL(url).text
    title, img_urls = getTitleImgUrl(page)
    mkdir(title)

    target_dir = os.getcwd() + '/vmgirls/' + title + '/'
    for index, img_url in enumerate(img_urls):
        file_name = title + str(index) + ".jpg"
        try:
            res = requestURL(img_url)
            # Do not store an HTML error page under a .jpg name.
            res.raise_for_status()
            with open(target_dir + file_name, 'wb') as f:
                f.write(res.content)
        except (requests.RequestException, OSError) as e:
            print(e)
        else:
            print("保存成功:" + file_name)
def getFlist():
    """Return the base names of all directories found by walking ``vmgirls``.

    The walk starts at the relative path ``vmgirls``, so the first entry is
    ``'vmgirls'`` itself, followed by every directory nested below it.

    Returns:
        A list of directory base names; empty when ``vmgirls`` does not
        exist, because ``os.walk`` then yields nothing.
    """
    return [os.path.basename(root) for root, _dirs, _files in os.walk('vmgirls')]
def main():
    """Download every album linked from the home page that is not on disk yet.

    The home page is fetched and every anchor with the
    ``list-title text-md h-2x`` class is collected as ``(href, title)``.
    An album is skipped when its title equals the name of a directory
    already reported by ``getFlist``; otherwise it is passed to ``save_img``.
    """
    home_url = "https://www.vmgirls.com/"
    res = requestURL(home_url).text
    url_pattern = re.compile(
        r'<a href=(.*?) title="(.*?)" class="list-title text-md h-2x">.*?</a>',
        re.S,
    )
    url_list = url_pattern.findall(res)

    # Set membership keeps the "already downloaded" check O(1) per album.
    downloaded = set(getFlist())
    for i in trange(0, len(url_list)):
        href, dir_name = url_list[i]
        if dir_name in downloaded:
            continue
        # href is appended as captured to the site root.
        save_img('https://www.vmgirls.com/' + href)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
复制代码