|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 会飞的GT 于 2020-2-25 20:34 编辑
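# Scraper for one specific site; the album URL is set at the bottom of this file.
# Replace that URL with another album URL from the same site and run the script.
# Images are written under the current working directory (inside a sub-folder
# named after the album when the album details can be parsed from the page).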
import requests
import re
import os
import time
def url_open(url):
    """Fetch ``url`` with a desktop-browser User-Agent and return the response.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The ``requests.Response`` on success, with its ``encoding`` set from
        ``apparent_encoding``. ``None`` when the request fails (connection
        problem, 30-second timeout, or an HTTP error status); a message is
        printed in that case.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15'}
    try:
        r = requests.get(url, headers=head, timeout=30)
        # Turn 4xx/5xx answers into exceptions so they are reported below.
        r.raise_for_status()
        # Use the charset guessed from the body rather than the header value.
        r.encoding = r.apparent_encoding
    except requests.RequestException as exc:
        # Only request failures are handled here; a bare ``except`` would
        # also trap KeyboardInterrupt and make the script hard to stop.
        print('url_open 产生异常: {}'.format(exc))
        return None
    return r
def find_img(url):
    """Return the image URLs embedded in the gallery page at ``url``.

    Args:
        url: Address of one gallery page.

    Returns:
        A list of the ``src`` values of ``<img>`` tags hosted on
        ``ii.hywly.com``. The list is empty when the page could not be
        fetched or contains no matching tag, so callers can always iterate
        over the result.
    """
    response = url_open(url)
    if response is None:
        # url_open signals failure with None; report it and hand back an
        # empty list instead of None so the caller's loop does not crash.
        print('find_img 产生异常')
        return []
    # Dots in the host name are escaped so they match only a literal ".".
    return re.findall(r'<img src="(https://ii\.hywly\.com/.*?)" alt=', response.text)
def save_imgs(url):
    """Download the image at ``url`` into the current working directory.

    The file name is the last ``/``-separated segment of ``url``. The name is
    printed once the file has been written.

    Args:
        url: Address of the image to download.

    Returns:
        None. Failures are reported with a printed message.
    """
    img_name = url.split('/')[-1]
    response = url_open(url)
    if response is None:
        # Check before opening the file: opening first would leave an empty
        # file behind for every download that failed.
        print('save_imgs 产生异常')
        return
    try:
        with open(img_name, 'wb') as f:
            f.write(response.content)
    except OSError as exc:
        print('save_imgs 产生异常: {}'.format(exc))
        return
    print(img_name)
def folder(url):
    """Create a folder named after the album at ``url`` and make it the cwd.

    The folder name is built from two fields scraped from the page: the link
    text following the "出镜模特" label, followed by the value of the
    "相关编号" label. The folder is created in the current working directory
    if it does not exist yet, and the process then changes into it.

    Args:
        url: Address of the album page.

    Returns:
        None. When the page cannot be fetched, the fields cannot be found, or
        the directory cannot be created/entered, a message is printed and the
        working directory is left unchanged.
    """
    response = url_open(url)
    if response is None:
        print('folder 产生异常')
        return
    html = response.text
    # The raw-string pattern spans two source lines: in a raw string the
    # trailing backslash and the newline are both kept, and the regex engine
    # reads that pair as an escaped (literal) newline in the page markup.
    # NOTE(review): the second line must start at column 0, otherwise the
    # leading spaces would become part of the pattern - confirm against the
    # live page if this stops matching.
    folder_name_ = re.findall(r'<p>相关编号: (.*?)</p><p>图片数量: .*?P</p><p>发行日期: .*?</p><p>出镜模特:\
<a href=".*?" target="_blank">(.*?)</a> ', html)
    if not folder_name_:
        # Page layout did not match; report it explicitly rather than
        # relying on an IndexError.
        print('folder 产生异常')
        return
    number, model = folder_name_[0]
    folder_name = model + number
    try:
        if not os.path.exists(folder_name):
            os.mkdir(folder_name)
        os.chdir(folder_name)
    except OSError as exc:
        print('folder 产生异常: {}'.format(exc))
def down_self(url):
    """Download every image of the album that starts at ``url``.

    Steps: create and enter the album folder, save the images of the first
    page, then follow each numbered pager link found on that page and save
    the images of those pages as well.

    Args:
        url: Address of the album's first page.

    Returns:
        None. Problems are reported with a printed message.
    """
    # Function-scope import so the file's import block needs no change.
    from urllib.parse import urljoin

    try:
        folder(url)
        response = url_open(url)
        if response is None:
            print('down_self 产生异常')
            return
        html = response.text
        _download_page(url)
        # Pager links are anchors whose text is a one- or two-digit number.
        pages = re.findall(r'<a href="(.*?)">\d\d?</a>', html)
        seen = {url}
        for each in pages:
            # urljoin leaves absolute links untouched and resolves relative
            # ones against the album URL.
            page_url = urljoin(url, each)
            if page_url in seen:
                # The same link can occur more than once in the markup;
                # fetching it again would only rewrite identical files.
                continue
            seen.add(page_url)
            _download_page(page_url)
    except Exception as exc:
        # Script-level boundary: report and stop, but let KeyboardInterrupt
        # and SystemExit through (a bare ``except`` would trap them).
        print('down_self 产生异常: {}'.format(exc))


def _download_page(page_url):
    """Save every image found on one gallery page, pausing 1 s after each."""
    # ``or []`` tolerates a find_img that reports failure by returning None.
    for img_url in find_img(page_url) or []:
        save_imgs(img_url)
        time.sleep(1)
if __name__ == '__main__':
    # Album to download when the file is run as a script; swap in any other
    # album URL from the same site.
    url = 'https://www.meituri.com/a/25222/'
    down_self(url=url)
|
|