|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
代码写得很烂,看看就行
爬虫基于明日方舟的Wiki:http://prts.wiki/w/%E9%A6%96%E9%A1%B5
import os
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup
# Base URL of the PRTS (Arknights) wiki; operator names are appended to it.
pre_url = "http://prts.wiki/w/"
# Download root. NOTE: the original literal was "\\Arknights\" — the trailing
# backslash escaped the closing quote and made the file a syntax error; fixed
# by doubling it. Windows-style separators are kept to match the rest of the
# script (ins_png concatenates '立绘\\' onto this).
PATH = os.getcwd() + "\\Arknights\\"
print(PATH)
def get_res(url):
    """Fetch *url* and return the requests.Response.

    The response is returned even for 4xx/5xx statuses; a warning is
    printed in that case (requests.Response is falsy for error codes).
    A timeout is set so one dead server cannot hang the whole crawl.
    """
    res = requests.get(url, timeout=10)
    if not res:
        print(url, "is not a correct url;")
    return res
def get_names(html):
    """Extract the operator names (the ``data-cn`` attribute of every
    ``div.smwdata`` element) from the roster page response *html*."""
    soup = BeautifulSoup(html.text, 'html.parser')
    entries = soup.find_all('div', class_='smwdata')
    return [
        entry.get("data-cn").strip()
        for entry in entries
        if entry.get("data-cn")
    ]
def get_url(name):
    """Build the wiki page URL for operator *name*, percent-encoding the
    (usually Chinese) name so it is a valid URL path segment."""
    target = pre_url + urllib.parse.quote(name)
    print("正在爬取", name, "的网页,网址为:", target)
    return target
def pic_url(url):
    """Return the list of artwork image URLs found on an operator page.

    Looks inside ``div#charimg`` for the stage-1 and stage-2 art
    containers and collects the ``src`` of the first ``<img>`` in each.
    Fix over the original: the stage-1 branch did
    ``pic1.find('img').get('src')`` with no None check, so a container
    without an <img> crashed and a missing ``src`` appended None to the
    result; both stages now share the same guarded extraction.
    """
    html = get_res(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    images = []
    for container in soup.find_all('div', id='charimg'):
        for stage_id in ('img-stage1', 'img-stage2'):
            stage = container.find('div', id=stage_id)
            if not stage:
                continue
            img = stage.find('img')
            src = img.get('src') if img else None
            if src:
                images.append(src)
    if images:
        print("成功从网页中找到图片链接!")
    else:
        print("ERROR!!!in", url)
    return images
def ins_png(urls, name):
    """Download every image URL in *urls* into ``PATH/立绘/`` as
    ``<name>_<i>.png`` (i starts at 1).

    Fixes over the original:
    - an already-existing file did ``return``, silently dropping every
      remaining image for the operator — now ``continue`` skips just it;
    - the output file is opened with ``with`` so it is closed even when
      the write raises.
    """
    path = PATH + '立绘\\'
    if not os.path.exists(path):
        os.makedirs(path)
    for i, url in enumerate(urls, start=1):
        res = requests.get(url)
        if not res:
            continue  # best-effort: skip images that fail to download
        filename = path + name + '_' + str(i) + '.png'
        if os.path.exists(filename):
            continue  # was `return` — skipped all remaining images
        with open(filename, 'wb') as img:
            img.write(res.content)
        print(name, i, "已经保存到", path)
def main_code(home_url):
    """Crawl the operator index page at *home_url* and download every
    operator's artwork via get_names / get_url / pic_url / ins_png."""
    roster_page = get_res(home_url)
    for operator in get_names(roster_page):
        page_url = get_url(operator)
        ins_png(pic_url(page_url), operator)
    print("爬取皮肤函数运行完毕, 请于文件夹中查看\n")
if __name__ == "__main__":
    # Entry point: the operator index ("干员一览") page of the PRTS wiki.
    index_url = "http://prts.wiki/w/%E5%B9%B2%E5%91%98%E4%B8%80%E8%A7%88"
    main_code(index_url)
复制代码
|
|