Narcissus1 发表于 2021-3-10 09:25:16

爬取明日方舟立绘

代码写得很烂,看看就行。


爬虫基于明日方舟的Wiki:http://prts.wiki/w/%E9%A6%96%E9%A1%B5

import requests
import urllib
import os
from bs4 import BeautifulSoup

# Base URL of the PRTS wiki; a percent-encoded page title is appended to it.
pre_url = "http://prts.wiki/w/"
# Output root: an "Arknights" directory under the current working directory.
# Built with os.path.join so the separator matches the host OS; the trailing
# empty component keeps the trailing separator, because callers append
# sub-paths to PATH by plain string concatenation.
PATH = os.path.join(os.getcwd(), "Arknights", "")
print(PATH)

def get_res(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response object.

    A message is printed when the response status indicates failure, but the
    response is returned either way, so callers can still read ``.text``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole crawl forever.

    Returns:
        The ``requests.Response`` for the request.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    res = requests.get(url, timeout=timeout)
    # Response.ok is False for 4xx/5xx status codes.
    if not res.ok:
        print(url, "is not a correct url;")
    return res


def get_names(html):
    """Collect the ``data-cn`` values from a fetched listing page.

    Args:
        html: Response object whose ``.text`` holds the page HTML.

    Returns:
        A list with the stripped ``data-cn`` attribute of every
        ``<div class="smwdata">`` element that has a non-empty one
        (presumably each operator's Chinese name -- confirm against the page).
    """
    document = BeautifulSoup(html.text, 'html.parser')
    raw_values = (
        tag.get("data-cn")
        for tag in document.find_all('div', class_='smwdata')
    )
    # Filter on the raw attribute first, then strip, so a missing attribute
    # is skipped rather than stripped.
    return [value.strip() for value in raw_values if value]


def get_url(name):
    """Build the wiki page URL for *name* and announce it on stdout.

    Args:
        name: Page title (may contain non-ASCII characters).

    Returns:
        ``pre_url`` followed by the percent-encoded title.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` is loaded.
    from urllib.parse import quote

    url = pre_url + quote(name)
    print("正在爬取", name, "的网页,网址为:", url)
    return url


def pic_url(url):
    """Extract the stage-1 and stage-2 image links from a wiki page.

    The page is fetched, then every ``<div id="charimg">`` is searched for
    ``<div id="img-stage1">`` and ``<div id="img-stage2">``; the ``src`` of the
    first ``<img>`` inside each is collected.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of absolute image URLs, stage 1 before stage 2 for each
        container. Empty when nothing was found.
    """
    from urllib.parse import urljoin

    html = get_res(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    images = []
    for container in soup.find_all('div', id='charimg'):
        for stage_id in ('img-stage1', 'img-stage2'):
            stage = container.find('div', id=stage_id)
            if stage is None:
                continue
            img = stage.find('img')
            # Either the <img> tag or its src attribute may be absent; skip
            # those instead of crashing or collecting None.
            src = img.get('src') if img is not None else None
            if src:
                # Resolve relative / protocol-relative links against the page
                # URL; absolute links pass through unchanged.
                images.append(urljoin(url, src))
    if images:
        print("成功从网页中找到图片链接!")
    else:
        print("ERROR!!!in", url)
    return images


def ins_png(urls, name):
    """Download each image in *urls* into the ``立绘`` folder under ``PATH``.

    Files are named ``<name>_<n>.png`` with ``n`` starting at 1. An image whose
    target file already exists is skipped without being downloaded again, and
    a failed download is skipped so the remaining images are still attempted.

    Args:
        urls: Sequence of image URLs.
        name: Prefix used for the saved file names.
    """
    # Trailing empty component keeps a trailing separator on the directory.
    path = os.path.join(PATH, '立绘', '')
    os.makedirs(path, exist_ok=True)
    for index, image_url in enumerate(urls, start=1):
        target = os.path.join(path, '{}_{}.png'.format(name, index))
        # Check before downloading so existing files cost no network traffic.
        if os.path.exists(target):
            continue
        # Request the single image URL, not the whole list.
        res = requests.get(image_url, timeout=30)
        if not res.ok:
            continue
        with open(target, 'wb') as img:
            img.write(res.content)
        print(name, index, "已经保存到", path)


def main_code(home_url):
    """Run the whole crawl starting from the listing page at *home_url*.

    The listing page is fetched, the names on it are extracted, and for each
    name the page URL is built, its image links are collected and the images
    are saved. A completion message is printed at the end.

    Args:
        home_url: Address of the listing page.
    """
    listing = get_res(home_url)
    for name in get_names(listing):
        # Arguments evaluate left to right: build URL -> find links -> save.
        ins_png(pic_url(get_url(name)), name)
    print("爬取皮肤函数运行完毕, 请于文件夹中查看\n")


if __name__ == "__main__":
    # Listing page used as the crawl entry point; the percent-encoded title
    # decodes to "干员一览".
    url0 = "http://prts.wiki/w/%E5%B9%B2%E5%91%98%E4%B8%80%E8%A7%88"
    main_code(home_url=url0)

良木 发表于 2021-3-12 13:09:52

瞅瞅
页: [1]
查看完整版本: 爬取明日方舟立绘