|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
代码如下:
- from selenium import webdriver
- import requests
- import urllib
- import os
- from bs4 import BeautifulSoup
- import time
- import threading
# Base URL of the PRTS (Arknights) wiki; every scraped page/file link is joined onto it.
pre_url = "https://prts.wiki"
# Root output directory for all downloads (Windows-style path, trailing backslash).
# BUG FIX: the original literal was "\\Arknights\" — the trailing \" escaped the
# closing quote, leaving the string unterminated (SyntaxError). Properly escaped now.
PATH = os.getcwd() + "\\Arknights\\"
# Desktop Chrome User-Agent so the wiki's CDN serves audio files instead of rejecting the bot.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"}
print(PATH)
- #动态提取网页源代码
def get_res(url):
    """Return the fully rendered HTML of *url* using a headless Chrome session.

    The page is JavaScript-driven, so a plain requests.get() would miss content;
    we load it in Chrome, wait, scroll to the bottom to trigger lazy loading,
    then grab the final DOM.
    """
    opts = webdriver.ChromeOptions()
    opts.add_argument('--headless')          # run Chrome without a visible window
    opts.add_argument("--disable-infobars")
    driver = webdriver.Chrome(options=opts)
    driver.get(url)
    time.sleep(5)                            # give dynamic content time to render
    # scroll to the bottom so lazy-loaded elements appear in the DOM
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    page = driver.page_source
    driver.quit()
    return page
- #下载立绘
def ins_pic(html):
    """Download each operator's illustration(s) (立绘) from the roster page HTML.

    Every operator has at least a base illustration (_1); elite-2 art (_2) is
    probed first and skipped when its wiki file page has no image link.
    """
    base_dir = PATH + '立绘\\'
    roster = BeautifulSoup(html, 'html.parser')
    for entry in roster.find('div', {'id': 'filter-data'}).find_all('div'):
        name = entry.get('data-zh')
        target_dir = base_dir + name
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        file_pages = [pre_url + '/w/文件:立绘_' + name + '_1.png',
                      pre_url + '/w/文件:立绘_' + name + '_2.png']
        # probe the _2 file page; drop it when the operator has no elite-2 art
        probe = BeautifulSoup(requests.get(file_pages[1]).text, 'html.parser')
        if probe.find('div', {'class': 'fullImageLink'}) is None:
            del probe
            del file_pages[1]
        for idx, page_url in enumerate(file_pages):
            link = BeautifulSoup(requests.get(page_url).text, 'html.parser') \
                .find('div', {'class': 'fullImageLink'}).find('a')
            img = requests.get(pre_url + link.get('href'))
            print('正在爬取:' + name + '_' + str(idx + 1) + '.png')
            with open(target_dir + '\\' + name + '_' + str(idx + 1) + '.png', 'wb') as f:
                f.write(img.content)
- #下载皮肤
def get_skin(url):
    """Download skin artwork from one operator's skin-gallery page at *url*.

    Each skin lives in a <table class="wikitable logo nomobile">; failures on a
    single skin are logged and skipped so the rest still download.
    """
    out_root = PATH + '立绘\\'
    if not os.path.exists(out_root):
        os.makedirs(out_root)
    page = BeautifulSoup(requests.get(url).text, 'html.parser')
    for table in page.find_all('table', {'class': 'wikitable logo nomobile'}):
        try:
            # second image anchor in the table is the full skin illustration
            anchor = table.find_all('a', {'class': 'image'})[1]
            alt = str(anchor.find('img').get('alt'))
            skin_name = alt[alt.find('立绘') + 3:]          # strip the "立绘_" prefix
            print('正在爬取:' + skin_name)
            folder = skin_name[:skin_name.find(' skin')]    # operator name before " skin"
            if not os.path.exists(out_root + folder):
                os.makedirs(out_root + folder)
            href = str(anchor.get('href'))
            file_page = BeautifulSoup(requests.get(pre_url + href).text, 'html.parser')
            img_url = pre_url + str(file_page.find('div', {'class': 'fullImageLink'})
                                    .find('a').get('href'))
            data = requests.get(img_url)
            with open(out_root + '\\' + folder + '\\' + skin_name, 'wb') as f:
                f.write(data.content)
            print(skin_name + '爬取完毕')
        except Exception as e:
            print("爬取图片出错:", str(e))
- #下载语音记录
- def download_audio(num,url,file_name,save_path,if_again):
- try:
- temp=requests.get(url,headers=headers)
- if(str(temp).find('403')!=-1):
- print('403 Forbidden:'+url)
- return
- if(str(temp.content).find('No')!=-1):
- print('语音不存在',url)
- return
- with open(save_path+'\\'+file_name+'.wav','wb') as f:
- f.write(temp.content)
- print("爬取完毕")
- except Exception as e:
- print("程序运行出错:", str(e))
- flag=-1
- while(flag!='1' and flag!='0'):
- flag=input('是否再次尝试(1:是/0:否)')
- if(flag=='1'):
- download_audio(num,url)
def ins_audio(html, if_again):
    """Scrape every operator's voice records (语音记录) from the roster HTML.

    For each operator, renders their voice-record page with get_res(), then for
    each voice line builds the four language-variant CDN URLs (jp/cn/kr/en) and
    hands them to download_audio(). if_again == '0' skips operators whose
    folder already exists.
    """
    print('开始爬取')
    roster = BeautifulSoup(html, 'html.parser')
    labels = ['日本語', '中文', '한국어', 'English']
    for entry in roster.find('div', {'id': 'filter-data'}).find_all('div'):
        character_name = entry.get('data-zh')
        if os.path.exists(PATH + '语音\\' + character_name) and if_again == '0':
            print('跳过')
            continue
        char_dir = PATH + '语音\\' + character_name + '\\'
        voice_page = BeautifulSoup(
            get_res(pre_url + '/w/' + character_name + '/语音记录'),
            'html.parser')
        print('已经获得' + character_name + '语音链接,开始下载')
        try:
            rows = voice_page.find('div', {'class': 'table-row-group'}) \
                .find_all('div', {'class': 'table-row'})
            for row in rows:
                audio_name = row.find('div', {'class': 'table-cell text-center font-bold p-1 !bg-table border border-solid border-divider align-middle truncate'}).text
                save_path = char_dir + audio_name
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                base = 'https:' + str(row.find('a').get('href'))
                print(base)
                # derive the four language URLs by splicing around '/voice';
                # the offsets (+10 vs +7) skip the existing language segment
                cut = base.find('/voice')
                head = base[:cut]
                if base.find('/voice/') == -1:
                    tail = base[cut + 10:]
                    lang_urls = [head + '/voice/' + tail,
                                 head + '/voice_cn/' + tail,
                                 head + '/voice_kr/' + tail,
                                 head + '/voice_en/' + tail]
                else:
                    tail = base[cut + 7:]
                    lang_urls = [base,
                                 head + '/voice_cn/' + tail,
                                 head + '/voice_kr/' + tail,
                                 head + '/voice_en/' + tail]
                print('正在下载' + character_name + '-' + audio_name + '语音\n')
                for i, (lang_url, label) in enumerate(zip(lang_urls, labels)):
                    download_audio(i, lang_url, label, save_path, if_again)

        except Exception as e:
            print("程序运行出错:", str(e))
- #爬取bgm
def ins_music(home_url):
    """Download every BGM track embedded on the music-appreciation page."""
    out_dir = PATH + '背景音乐'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print('获取音乐网址')
    page = BeautifulSoup(requests.get(home_url).text, 'html.parser')
    tables = page.find_all('table', {'class': "wikitable mw-collapsible mw-collapsible-dark mw-collapsible-title-center mw-collapsed"})
    rows = []
    for table in tables:
        rows += table.find_all('tr')
    # keep only table rows that actually embed an audio <source> element
    rows = [row for row in rows if row.find('source') is not None]
    print("获取完毕")
    # map characters that are illegal in Windows filenames to safe look-alikes
    # (single pass; replacements never reintroduce a forbidden character)
    safe_chars = str.maketrans({':': '：', '<': '《', '>': '》', '?': '？',
                                '|': '-', '*': '×', '/': '-', '\\': '-'})
    for row in rows:
        track_name = row.find('div', {'class': 'nodesktop'}).text.translate(safe_chars)
        src = row.find('source').get('src')
        print("正在爬取" + track_name)
        with open(out_dir + '\\' + track_name + '.mp3', 'wb') as f:
            f.write(requests.get(src, headers=headers).content)
    print("完成")
- #主函数
def main_code():
    """Interactive entry point: ask which asset types to scrape, then run them."""
    try:
        # each answer must be the string '1' (yes) or '0' (no)
        want_audio = -1
        while want_audio not in ('1', '0'):
            want_audio = input('是否爬取角色语音(1:是 0:否):')
        want_art = -1
        while want_art not in ('1', '0'):
            want_art = input('是否爬取角色立绘与皮肤(1:是 0:否):')
        want_bgm = -1
        while want_bgm not in ('1', '0'):
            want_bgm = input('是否爬取游戏公测版本BGM及在线发布的游戏相关曲目(1:是 0:否):')

        if int(want_audio) == 1:
            if_again = -1
            while if_again not in ('0', '1'):
                if_again = input('是否重复下载(1:是/0:否)')
            roster_html = get_res('http://prts.wiki/w/%E5%B9%B2%E5%91%98%E4%B8%80%E8%A7%88')
            ins_audio(roster_html, if_again)
            print("爬取语音函数运行完毕, 请于文件夹中查看\n")

        if int(want_art) == 1:
            ins_pic(get_res('http://prts.wiki/w/%E5%B9%B2%E5%91%98%E4%B8%80%E8%A7%88'))
            gallery = BeautifulSoup(
                get_res('https://prts.wiki/w/%E6%97%B6%E8%A3%85%E5%9B%9E%E5%BB%8A'),
                'html.parser')
            # collect the per-operator skin-page URLs, stripping the '#...'
            # fragment and deduplicating before visiting each one
            skin_urls = []
            for wrapper in gallery.find_all('div', {'class': 'skinwrapper charskinbtn-controler'}):
                href = str(wrapper.find('a').get('href'))
                skin_urls.append(pre_url + href[:href.find('#')])
            for skin_url in list(set(skin_urls)):
                get_skin(skin_url)

        if want_bgm == '1':
            ins_music('https://prts.wiki/w/%E9%9F%B3%E4%B9%90%E9%89%B4%E8%B5%8F')
            print("爬取游戏公测版本BGM及在线发布的游戏相关曲目函数运行完毕, 请于文件夹中查看\n")
    except Exception as e:
        print("程序运行出错:", str(e))


if __name__ == "__main__":
    main_code()
复制代码
|
评分
-
查看全部评分
|