from selenium import webdriver
import requests
import os
from bs4 import BeautifulSoup
import time
pre_url = "https://prts.wiki"
PATH = os.getcwd() + "\\Arknights\\"  # Windows-style backslash paths are used throughout
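# A browser-like User-Agent is needed: some prts.wiki media endpoints return 403 Forbidden without it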
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}
print(PATH)
# Fetch the fully rendered page source with headless Chrome
def get_res(url):
option=webdriver.ChromeOptions()
    # Hide the Chrome window (headless mode)
option.add_argument('--headless')
option.add_argument("--disable-infobars")
browser = webdriver.Chrome(options=option)
browser.get(url)
    # Wait for the page to finish loading
    time.sleep(5)
    # Scroll to the bottom so lazy-loaded content is rendered
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(1)
res=browser.page_source
browser.quit()
return res
# Download character art (立绘)
def ins_pic(html):
main_path = PATH + '立绘\\'
soup=BeautifulSoup(html,'html.parser')
characters=soup.find('div',{'id':'filter-data'}).find_all('div')
for c in characters:
character_name=c.get('data-zh')
path=main_path+character_name
if not os.path.exists(path):
os.makedirs(path)
character_urls=[pre_url+'/w/文件:立绘_'+character_name+'_1.png',pre_url+'/w/文件:立绘_'+character_name+'_2.png']
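        # Not every character has a second art image; probe its file page and drop the URL if it is missing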
        _soup = BeautifulSoup(requests.get(character_urls[1]).text, 'html.parser')
        if _soup.find('div', {'class': 'fullImageLink'}) is None:
            del character_urls[1]
        for i in range(len(character_urls)):
            link = BeautifulSoup(requests.get(character_urls[i]).text, 'html.parser').find('div', {'class': 'fullImageLink'}).find('a')
            img = requests.get(pre_url + link.get('href'))
            print('Downloading: ' + character_name + '_' + str(i + 1) + '.png')
            with open(path + '\\' + character_name + '_' + str(i + 1) + '.png', 'wb') as f:
                f.write(img.content)
# Download skins
def get_skin(url):
path = PATH + '立绘\\'
if not os.path.exists(path):
os.makedirs(path)
soup = BeautifulSoup(requests.get(url).text,'html.parser')
pic_urls=soup.find_all('table',{'class':'wikitable logo nomobile'})
for pic in pic_urls:
try:
            link = pic.find_all('a', {'class': 'image'})[1]
            img = link.find('img')
name=str(img.get('alt'))
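            # Strip the leading "立绘" prefix (2 characters plus a separator) from the alt text to get the skin name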
name=name[name.find('立绘')+3:]
            print('Downloading: ' + name)
file_name=name[:name.find(' skin')]
if not os.path.exists(path+file_name):
os.makedirs(path+file_name)
            res = str(link.get('href'))
            res = pre_url + str(BeautifulSoup(requests.get(pre_url + res).text, 'html.parser').find('div', {'class': 'fullImageLink'}).find('a').get('href'))
res=requests.get(res)
            with open(path + file_name + '\\' + name, 'wb') as f:
f.write(res.content)
            print(name + ' done')
except Exception as e:
print("爬取图片出错:", str(e))
# Download voice records
def download_audio(num,url,file_name,save_path,if_again):
try:
temp=requests.get(url,headers=headers)
        if temp.status_code == 403:
            print('403 Forbidden: ' + url)
            return
        # Heuristic kept from the original: a response body containing "No" means the clip does not exist
        if str(temp.content).find('No') != -1:
            print('Voice clip not found:', url)
            return
with open(save_path+'\\'+file_name+'.wav','wb') as f:
f.write(temp.content)
print("爬取完毕")
    except Exception as e:
        print("Error:", str(e))
        flag = -1
        while flag != '1' and flag != '0':
            flag = input('Retry? (1: yes / 0: no)')
        if flag == '1':
            download_audio(num, url, file_name, save_path, if_again)
def ins_audio(html,if_again):
    print('Starting voice download')
soup=BeautifulSoup(html,'html.parser')
characters=soup.find('div',{'id':'filter-data'}).find_all('div')
for c in characters:
character_name=c.get('data-zh')
if os.path.exists(PATH+'语音\\'+character_name) and if_again=='0':
            print('Skipping')
continue
path=PATH+'语音\\'+character_name+'\\'
url=pre_url+'/w/'+character_name+'/语音记录'
res=BeautifulSoup(get_res(url),'html.parser')
        print('Got voice links for ' + character_name + ', starting download')
try:
divs=res.find('div',{'class':'table-row-group'}).find_all('div',{'class':'table-row'})
for div in divs:
audio_name=div.find('div',{'class':'table-cell text-center font-bold p-1 !bg-table border border-solid border-divider align-middle truncate'})
audio_name=audio_name.text
save_path=path+audio_name
if not os.path.exists(save_path):
os.makedirs(save_path)
tmp=div.find('a')
tmp='https:'+str(tmp.get('href'))
print(tmp)
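                # Build the four language variants (jp/cn/kr/en) of this clip's URL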
file_names=['日本語','中文','한국어','English']
urls=['jp','cn','kr','en']
                # Normalize the URL: the link may already point at a language-specific
                # variant such as /voice_cn/ (10 characters) instead of the base /voice/ (7 characters)
                if tmp.find('/voice/') == -1:
                    urls[0] = tmp[:tmp.find('/voice')] + '/voice/' + tmp[tmp.find('/voice') + 10:]
                    urls[1] = tmp[:tmp.find('/voice')] + '/voice_cn/' + tmp[tmp.find('/voice') + 10:]
                    urls[2] = tmp[:tmp.find('/voice')] + '/voice_kr/' + tmp[tmp.find('/voice') + 10:]
                    urls[3] = tmp[:tmp.find('/voice')] + '/voice_en/' + tmp[tmp.find('/voice') + 10:]
                else:
                    urls[0] = tmp
                    urls[1] = tmp[:tmp.find('/voice')] + '/voice_cn/' + tmp[tmp.find('/voice') + 7:]
                    urls[2] = tmp[:tmp.find('/voice')] + '/voice_kr/' + tmp[tmp.find('/voice') + 7:]
                    urls[3] = tmp[:tmp.find('/voice')] + '/voice_en/' + tmp[tmp.find('/voice') + 7:]
                print('Downloading voice: ' + character_name + '-' + audio_name + '\n')
for i in range(len(urls)):
download_audio(i,urls[i],file_names[i],save_path,if_again)
except Exception as e:
print("程序运行出错:", str(e))
# Download background music (BGM)
def ins_music(home_url):
path=PATH+'背景音乐'
if not os.path.exists(path):
os.makedirs(path)
    print('Collecting music URLs')
soup=BeautifulSoup(requests.get(home_url).text,'html.parser')
tmp_urls=soup.find_all('table',{'class':"wikitable mw-collapsible mw-collapsible-dark mw-collapsible-title-center mw-collapsed"})
bgms=[]
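    # Gather every row from the collapsible track tables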
for tmp in tmp_urls:
bgms+=tmp.find_all('tr')
    # Keep only rows that actually embed an audio <source> tag
    bgms = [bgm for bgm in bgms if bgm.find('source') is not None]
    print("URL collection finished")
for bgm in bgms:
bgm_name=bgm.find('div',{'class':'nodesktop'}).text
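        # Replace characters that are illegal in Windows file names with full-width or look-alike substitutes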
        for bad, good in ((':', '：'), ('<', '《'), ('>', '》'), ('?', '？'),
                          ('|', '-'), ('*', '×'), ('/', '-'), ('\\', '-')):
            bgm_name = bgm_name.replace(bad, good)
bgm_url=bgm.find('source')
bgm_url=bgm_url.get('src')
print("正在爬取"+bgm_name)
with open(path+'\\'+bgm_name+'.mp3','wb') as f:
f.write(requests.get(bgm_url,headers=headers).content)
print("完成")
# Main entry point
def main_code():
try:
tmp1=-1
while(tmp1!='1' and tmp1!='0'):
            tmp1 = input('Download character voice lines? (1: yes / 0: no): ')
tmp2=-1
while(tmp2!='1' and tmp2!='0'):
            tmp2 = input('Download character art and skins? (1: yes / 0: no): ')
tmp3=-1
while(tmp3!='1' and tmp3!='0'):
            tmp3 = input('Download the open-beta BGM and officially released related tracks? (1: yes / 0: no): ')
        if tmp1 == '1':
if_again=-1
while(if_again!='0' and if_again!='1'):
                if_again = input('Re-download already saved files? (1: yes / 0: no)')
            html = get_res('https://prts.wiki/w/%E5%B9%B2%E5%91%98%E4%B8%80%E8%A7%88')
ins_audio(html,if_again)
print("爬取语音函数运行完毕, 请于文件夹中查看\n")
        if tmp2 == '1':
            ins_pic(get_res('https://prts.wiki/w/%E5%B9%B2%E5%91%98%E4%B8%80%E8%A7%88'))
html = get_res('https://prts.wiki/w/%E6%97%B6%E8%A3%85%E5%9B%9E%E5%BB%8A')
soup=BeautifulSoup(html,'html.parser')
urls=soup.find_all('div',{'class':'skinwrapper charskinbtn-controler'})
for i in range(len(urls)):
urls[i]=str(urls[i].find('a').get('href'))
urls[i]=pre_url+urls[i][:urls[i].find('#')]
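            # Several skin buttons can link to the same operator page, so deduplicate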
urls=list(set(urls))
for url in urls:
get_skin(url)
if tmp3=='1':
ins_music('https://prts.wiki/w/%E9%9F%B3%E4%B9%90%E9%89%B4%E8%B5%8F')
print("爬取游戏公测版本BGM及在线发布的游戏相关曲目函数运行完毕, 请于文件夹中查看\n")
except Exception as e:
print("程序运行出错:", str(e))
if __name__ == "__main__":
main_code()