|
发表于 2023-2-15 20:07:24
|
显示全部楼层
修改了一下 Python爱好者. 的酷我爬虫(代码请求的是 www.kuwo.cn,即酷我音乐,而非酷狗),可以爬取酷我上《吞噬星空》的几乎全部音频
(不过只有 200 多集)
文件太大,上传太慢,给你代码,自己慢慢爬吧
import json
import os
import time
import urllib.error
import urllib.parse
import urllib.request
# Display lines for the matching tracks; print_music_information() appends to it.
x = []
# 1-based sequence number of the next track to download; advanced by main().
i = 1
def down_load_music(music_address, down_load_address, information_list):
    """Download one track and save it as ``<music name>.mp3``.

    Args:
        music_address: URL of the audio file to fetch.
        down_load_address: Directory the file is written into.
        information_list: Track record; it is handed to
            get_right_music_name() to build the file name.

    Raises:
        urllib.error.URLError: If the download fails.
        OSError: If the target file cannot be written (callers rely on
            PermissionError, a subclass, to ask for another directory).
    """
    music_name = get_right_music_name(information_list)
    # os.path.join picks the right separator instead of a hard-coded "\\".
    target = os.path.join(down_load_address, f"{music_name}.mp3")

    # Download before touching the disk so a failed request does not leave
    # an empty .mp3 behind, and close the HTTP response deterministically.
    with urllib.request.urlopen(music_address) as response:
        music = response.read()

    with open(target, "wb") as f:
        f.write(music)
def get_right_music_name(information_list):
    """Return the track name with characters that are illegal in file names removed.

    Args:
        information_list: Track record whose first element is the track name.

    Returns:
        The name with every slash, backslash, colon, asterisk, question
        mark, double quote, angle bracket and pipe character removed.
    """
    illegal_characters = '/\\:*?"<>|'
    # Mapping a code point to None deletes it; one pass replaces the
    # per-character replace() loop.
    deletions = dict.fromkeys(map(ord, illegal_characters))
    return information_list[0].translate(deletions)
def get_music_address(information_list, music_num, second_headrs_words):
    """Resolve the download URL of the track with sequence number *music_num*.

    Args:
        information_list: List of track records. The last two elements of
            the selected record are used as the ``mid`` and ``reqId`` query
            values.
        music_num: 1-based position of the wanted track in
            *information_list*.
        second_headrs_words: Raw request-header text, parsed by get_headers().

    Returns:
        A ``(music_address, track_record)`` tuple, where *track_record* is
        the selected entry of *information_list*.

    Raises:
        IndexError: If *music_num* is not a valid 1-based position.
    """
    # Without this guard music_num <= 0 would silently wrap around and pick
    # a track from the end of the list.
    if music_num < 1:
        raise IndexError(f"music_num must be >= 1, got {music_num}")
    track = information_list[music_num - 1]

    url = f"http://www.kuwo.cn/api/v1/www/music/playUrl?mid={track[-2]}&type=convert_url3&httpsStatus=1&reqId={track[-1]}"
    second_headers = get_headers(second_headrs_words)

    req = urllib.request.Request(url, headers=second_headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as resp:
        response = json.loads(resp.read().decode("utf8"))

    return response["data"]["url"], track
def print_music_information(music_information):
    """Collect the search results whose artist is '流逝&double程'.

    For every matching entry a display line is also appended to the
    module-level list ``x``.

    Args:
        music_information: Decoded search response. Reads ``reqId`` and, for
            each entry of ``data.list``, the keys ``name``, ``artist``,
            ``album``, ``songTimeMinutes`` and ``rid``.

    Returns:
        A list of ``[name, artist, album, songTimeMinutes, rid, reqId]``
        records, one per matching entry, in response order.
    """
    reqId = music_information["reqId"]
    information_list = []

    for each in music_information["data"]["list"]:
        # All fields are read before filtering, so an entry with a missing
        # key raises KeyError whether or not it matches the artist.
        name = each["name"]
        artist = each["artist"]
        album = each["album"]
        songTimeMinutes = each["songTimeMinutes"]
        rid = each["rid"]

        if artist != '流逝&double程':
            continue

        information_list.append([name, artist, album, songTimeMinutes, rid, reqId])
        # x is only mutated in place, so no `global` declaration is needed.
        x.append(f"{name}/,/歌手:{artist}/,/专辑:{album}/,/时长:{songTimeMinutes}/")

    return information_list
def get_music_information(music_name, music_page, first_headrs_words):
    """Query the www.kuwo.cn search endpoint and return the decoded JSON.

    Args:
        music_name: Search keyword; it is URL-quoted before being sent.
        music_page: Value of the ``pn`` (page number) query parameter.
        first_headrs_words: Raw request-header text, parsed by get_headers().

    Returns:
        The response body parsed from UTF-8 JSON.

    Raises:
        urllib.error.HTTPError: If the server answers with an error status.
    """
    url = f"http://www.kuwo.cn/api/www/search/searchMusicBykeyWord?key={urllib.parse.quote(music_name)}&pn={music_page}&rn=300&httpsStatus=1&reqId=1947b511-1ac6-11ed-abe0-e348db15d8b3"
    first_headers = get_headers(first_headrs_words)

    req = urllib.request.Request(url, headers=first_headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf8"))
def get_headers(words):
    """Parse raw ``Name: value`` header text into a dict.

    Args:
        words: Header text with one ``Name: value`` pair per line.

    Returns:
        Dict mapping each header name to its value.

    Raises:
        ValueError: If a non-blank line contains no ``": "`` separator.
    """
    headers = {}
    for line in words.split("\n"):
        line = line.strip()
        if not line:
            # Tolerate blank lines such as a trailing newline.
            continue
        # Split on the FIRST separator only, so values that themselves
        # contain ": " are kept whole instead of being truncated.
        name, separator, value = line.partition(": ")
        if not separator:
            raise ValueError(f"malformed header line: {line!r}")
        headers[name] = value
    return headers
def first_try(music_name, music_page, first_headers_words, max_retries=5, retry_delay=1.0):
    """Fetch one page of search results, retrying on 504 Gateway Timeout.

    Args:
        music_name: Search keyword.
        music_page: Result page number to request.
        first_headers_words: Raw request-header text for the search request.
        max_retries: How many times a 504 response is retried before giving up.
        retry_delay: Seconds to wait between retries.

    Returns:
        The decoded search response from get_music_information().

    Raises:
        urllib.error.HTTPError: For any HTTP error other than 504, or when
            the 504 persists after *max_retries* retries.
    """
    retries = 0
    while True:
        try:
            return get_music_information(music_name, music_page, first_headers_words)
        except urllib.error.HTTPError as reason:
            # Check the status code: an exception object never compares
            # equal to a message string. Anything that is not a transient
            # 504 is re-raised rather than turned into a None result.
            if reason.code != 504 or retries >= max_retries:
                raise
            retries += 1
            print("网络连接失败,正在重试中,请耐心等待!")
            time.sleep(retry_delay)
def second_try(music_address, down_load_address, information_list, music_name, first_headers_words):
    """Download a track, asking for another directory while writing is denied.

    Args:
        music_address: URL of the audio file.
        down_load_address: Directory to try first.
        information_list: Track record passed on to down_load_music().
        music_name: Unused; kept so existing callers keep working.
        first_headers_words: Unused; kept so existing callers keep working.
    """
    while True:
        try:
            down_load_music(music_address, down_load_address, information_list)
            return
        except PermissionError:
            # Catching the exception type is sufficient; the retry stays
            # inside the loop so a second unwritable directory prompts
            # again instead of propagating.
            down_load_address = input("抱歉,我们没有在此文件夹中创建文件的权力,请换个位置,如:'C:\\test':")
def third_try(music_name, start_page, first_headers_words):
    """Search for *music_name*, prompting for a new name until results exist.

    Args:
        music_name: Initial search keyword.
        start_page: Page number used for the page-count request.
        first_headers_words: Raw request-header text for the search requests.

    Returns:
        The ``(music_page, information_list)`` tuple from get_music_page()
        for the first keyword that yields at least one page.
    """
    while True:
        pages = get_music_pages(music_name, start_page, first_headers_words)
        music_page, information_list = get_music_page(pages, music_name, first_headers_words)
        if music_page != -1:
            return music_page, information_list
        # get_music_page() signals "nothing found" with a page of -1.
        music_name = input("抱歉,未查询到此歌,请重新输入歌曲名称:")
def fourth_try(i):
    """Return *i* if it is a valid sequence number, otherwise prompt for one.

    A sequence number is valid when it lies in the range 1..300 inclusive.

    Args:
        i: Candidate sequence number.

    Returns:
        *i* itself when valid, else the first valid number typed by the user.
    """
    music_num = i
    while not (0 < music_num < 301):
        try:
            music_num = int(input("序列号超出最大值,请重新输入歌曲序列号:"))
        except ValueError:
            # Non-numeric input: music_num is unchanged, so the loop asks
            # again. The handler sits inside the loop so no recursion is needed.
            continue
    return music_num
def get_music_page(pages, music_name, first_headers_words):
    """Fetch result pages 1..*pages* and collect the matching track records.

    Args:
        pages: Number of result pages to fetch.
        music_name: Search keyword.
        first_headers_words: Raw request-header text for the search requests.

    Returns:
        ``(-1, -1)`` when *pages* is 0 (nothing found); otherwise
        ``(page, information_list)`` where *page* is the last page number
        fetched and *information_list* holds the records of all pages.
    """
    if pages == 0:
        return -1, -1

    page = 0
    information_list = []
    for i in range(1, pages + 1):
        music_information = first_try(music_name, i, first_headers_words)
        # Accumulate across pages: rebinding the list on every iteration
        # would keep only the last page, while the module-level display
        # list filled by print_music_information() keeps every page.
        information_list.extend(print_music_information(music_information))
        page = i

    # NOTE(review): the pasted source lost its indentation; this return is
    # assumed to follow the loop rather than sit inside it - confirm.
    return page, information_list
def get_music_pages(music_name, start_page, first_headers_words):
    """Return how many result pages the search for *music_name* has.

    Args:
        music_name: Search keyword.
        start_page: Page number used for the probing request.
        first_headers_words: Raw request-header text for the search request.

    Returns:
        The total result count divided by the page size of 300, rounded up;
        0 when there are no results.
    """
    page_size = 300  # matches the rn=300 query value of the search request
    music_information = first_try(music_name, start_page, first_headers_words)
    total = int(music_information["data"]["total"])
    # Ceiling division: a partially filled last page still counts. Floor
    # division would drop it and report 0 pages for fewer than 300 results.
    return -(-total // page_size)
def main():
    """Download the track with sequence number ``i`` and advance to the next.

    Reads the module-level settings ``music_name``, ``start_page``,
    ``first_headers_words``, ``second_headers_words`` and
    ``down_load_address``. Increments the global ``i`` and sets the global
    ``running`` to False once every found track has been handled.
    """
    global i, running

    # Every search appends its display lines to x; reset it so x describes
    # the current result set instead of growing with duplicates on each call.
    x.clear()
    music_page, information_list = third_try(music_name, start_page, first_headers_words)

    # Stop on the list that is actually indexed below. Comparing i with
    # len(x) cannot work as a stop condition because x changes on each call.
    track_count = len(information_list)
    if i > track_count:
        running = False
        return

    music_num = fourth_try(i)
    print(i)

    music_address, information_list = get_music_address(information_list, music_num, second_headers_words)
    second_try(music_address, down_load_address, information_list, music_name, first_headers_words)

    i += 1
    if i > track_count:
        running = False
if __name__ == "__main__":
    # Loop flag; main() sets it to False when there is nothing left to do.
    running = True
    # Search keyword.
    music_name = '吞噬星空'

    # Raw header text for the search request. The continuation lines start
    # at column 0 on purpose: they are part of the string literal.
    first_headers_words = """Accept: application/json, text/plain, */*
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6
Connection: keep-alive
Cookie: _ga=GA1.2.38866944.1643424882; BusinessId={"std_plat":404,"std_dev":"d2b0d6c5-27ce-4b06-bf62-1e19b3ad6f84","std_imei":"d2b0d6c5-27ce-4b06-bf62-1e19b3ad6f84"}; _gid=GA1.2.432698325.1660354553; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1660254093,1660354553,1660356822,1660366722; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1660367192; kw_token=UX6Z894LRHB; _gat=1
Host: www.kuwo.cn
Referer: http://www.kuwo.cn/search/list?key=%E5%AD%A4%E5%8B%87%E8%80%85
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47
csrf: UX6Z894LRHB"""
    # Raw header text for the play-URL request.
    second_headers_words = """Accept: application/json, text/plain, */*
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6
Connection: keep-alive
Cookie: _ga=GA1.2.38866944.1643424882; BusinessId={"std_plat":404,"std_dev":"d2b0d6c5-27ce-4b06-bf62-1e19b3ad6f84","std_imei":"d2b0d6c5-27ce-4b06-bf62-1e19b3ad6f84"}; _gid=GA1.2.432698325.1660354553; Hm_lvt_cdb524f42f0ce19b169a8071123a4797=1660254093,1660354553,1660356822,1660366722; kw_token=DQ089P4PGZG; Hm_lpvt_cdb524f42f0ce19b169a8071123a4797=1660374530; _gat=1
Host: www.kuwo.cn
Referer: http://www.kuwo.cn/play_detail/198554068
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47"""
    # Page number used for the first search request.
    start_page = 1
    # Directory the tracks are saved into.
    down_load_address = 'E:\\WYH\\音乐\\wu'
    # Create the directory up front so the first download does not fail
    # just because it does not exist yet.
    os.makedirs(down_load_address, exist_ok=True)

    # Main loop: one track per main() call.
    while running:
        main()

    # Use a separate loop name so the global download counter i is not
    # overwritten by the summary printout.
    for line in x:
        print(line)
-
复制代码
|
评分
-
查看全部评分
|