HCF 发表于 2020-3-17 22:46:46

爬取QQ音乐

import urllib.request
import chardet
import re
import urllib.parse
from lxml import etree



def url_open(url):
    """Fetch *url* and return the raw response body.

    A desktop-browser ``User-Agent`` header is sent with the request.

    Args:
        url: URL string to request.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` is a
            subclass of it).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0')
    # The context manager closes the response (and its socket) even when
    # read() raises; the original left the response open.
    with urllib.request.urlopen(req) as response:
        return response.read()

def html_code(html):
    """Decode raw page bytes to text using the encoding chardet detects.

    Args:
        html: Undecoded page content (``bytes``).

    Returns:
        The decoded text. Bytes that are invalid in the chosen encoding are
        dropped (``errors='ignore'``).
    """
    encoding = chardet.detect(html)['encoding']  # detect the character encoding
    if not encoding:
        # chardet reports None when it cannot guess (for example on empty
        # input); decode(None, ...) would raise TypeError.
        encoding = 'utf-8'
    elif encoding.upper() == 'GB2312':
        # Decode with GBK, a superset of GB2312, so that characters outside
        # plain GB2312 are not lost.
        encoding = 'GBK'
    try:
        return html.decode(encoding, 'ignore')
    except LookupError:
        # chardet named an encoding for which Python ships no codec.
        return html.decode('utf-8', 'ignore')
# URL-encode a sample search term into a ``key=...`` query string.  Its only
# visible use is the disabled music.taihe.com search URL noted below.
keyword = urllib.parse.urlencode([('key', '跨时代')])
# url = 'http://music.taihe.com/search?%s' % keyword

def get_url(word):
    """Search QQ Music for *word* and return candidate song-page URLs.

    The search endpoint is queried with a fixed parameter set and the
    response text is scanned for song ids with two regular expressions.

    Args:
        word: Search keyword, sent as the ``w`` query parameter.

    Returns:
        A list of ``https://y.qq.com/n/yqq/song/<id>.html`` strings: first the
        ids found after ``"lyric_hilight":"","mid":"`` (these are also
        printed), then the ids found after ``"songMID":"``. The list is empty
        when neither pattern matches.
    """
    params = {
        'ct': '24',
        'qqmusic_ver': '1298',
        'new_json': '1',
        'remoteplace': 'txt.yqq.song',
        'searchid': '71746174584266098',
        't': '0',
        'aggr': '1',
        'cr': '1',
        'catZhida': '1',
        'lossless': '0',
        'flag_qc': '0',
        'p': '1',
        'n': '10',
        'w': word,
        'g_tk_new_20200303': '5381',
        'g_tk': '5381',
        'loginUin': '2350904752',
        'hostUin': '0',
        'format': 'json',
        'inCharset': 'utf8',
        'outCharset': 'utf-8',
        'notice': '0',
        'platform': 'yqq.json',
        'needNewCode': '0',
    }
    query = urllib.parse.urlencode(params)  # URL-encode the search parameters
    search_url = 'https://c.y.qq.com/soso/fcgi-bin/client_search_cp?%s' % query
    raw = url_open(search_url)
    # The request asks for outCharset=utf-8, so decode the body instead of
    # taking str() of the bytes object (which yields its "b'...'" repr).
    text = raw.decode('utf-8', 'ignore')
    print('获取file网页:', search_url)
    print('\n搜索链接:')

    song_page = 'https://y.qq.com/n/yqq/song/%s.html'
    # Both patterns capture only the id itself.
    # NOTE(review): ids are assumed to be exactly 14 characters long, as the
    # original fixed-width patterns implied -- confirm against a live response.
    mids = re.findall(r'"lyric_hilight":"","mid":"(.{14})', text)
    song_mids = re.findall(r'"songMID":"(.{14})"', text)

    urllist = []
    for mid in mids:
        url = song_page % mid
        print(url)
        urllist.append(url)
    # The original interpolated the whole '"songMID":"..."' match here, which
    # produced unusable URLs; only the captured id belongs in the URL.
    urllist.extend(song_page % mid for mid in song_mids)
    return urllist



def get_albummid(url):  # get the album id of the song
    """Fetch a song page and return the album id embedded in it.

    As a side effect the module globals ``mname`` and ``msinger`` are set to
    the XPath ``text()`` results (lists of strings) for the song title and
    the singer links on the page.

    Args:
        url: URL of the song page.

    Returns:
        The 14 characters that start 11 characters after the first occurrence
        of ``albummid`` in the page text.
        NOTE(review): this assumes the page embeds the id as
        ``albummid":"<14 characters>`` -- confirm against a live page.

    Raises:
        ValueError: If the page does not contain ``albummid``.
    """
    global mname
    global msinger
    page = html_code(url_open(url))
    marker = page.find('albummid')

    tree = etree.HTML(page)  # parse once, then query with XPath
    mname = tree.xpath('//h1[@class="data__name_txt"]/text()')
    msinger = tree.xpath('//div[@class="data__singer"]/a/text()')

    if marker == -1:
        # Without this guard the -1 from find() would silently select an
        # unrelated 14-character slice near the start of the page.
        raise ValueError('no "albummid" marker found in %s' % url)
    # The original computed these bounds but then returned the whole page.
    start = marker + 11
    return page[start:start + 14]

def pid_1(albummid):  # get the pid
    """Query the album-detail endpoint and return the ``pid`` value.

    Args:
        albummid: Album id, substituted into the ``albumMid`` request field.

    Returns:
        The 16 characters that end one character before the first ``}]}}}}``
        in the response text.
        NOTE(review): the offsets come from the original code, which computed
        them but returned the whole response; what the 16 characters represent
        could not be verified here -- confirm against a live response.

    Raises:
        ValueError: If the response does not contain ``}]}}}}`` with at least
            17 characters before it.
    """
    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=albuminfoCallback&g_tk=469196449&loginUin=2350904752&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data={"comm":{"ct":24,"cv":10000},"albumDetail":{"module":"music.musichallAlbum.AlbumInfoServer","method":"GetAlbumDetail","param":{"albumMid":"%s"}}}' % albummid
    text = html_code(url_open(url))
    marker = text.find('}]}}}}')
    stop = marker - 1
    start = stop - 16
    if marker == -1 or start < 0:
        # A missing (or too early) marker would otherwise turn into negative
        # indices and silently select the wrong part of the response.
        raise ValueError('unexpected album detail response for %s' % albummid)
    return text[start:stop]


def vkey(pid, songmid):
    """Request the play key for a song and build its stream URL.

    The request URL is printed before the response is parsed.

    Args:
        pid: Value appended to the ``getplaysongvkey`` request token.
        songmid: Song id placed in the ``songmid`` list of the request.

    Returns:
        ``'http://ws.stream.qqmusic.qq.com/'`` followed by the part of the
        response that runs from 8 characters after the first ``purl`` to the
        end of the first ``fromtag=66``. When either marker is missing, or
        the span is empty, only the bare host URL is returned; ``main``
        compares against that value to detect songs without a usable link.
        NOTE(review): the offsets are taken from the original code, which
        computed them but appended the whole response -- confirm them against
        a live response.
    """
    base = 'http://ws.stream.qqmusic.qq.com/'
    # NOTE(review): the original had '"songtype":,' (invalid JSON). ``[0]`` is
    # assumed as a one-element list parallel to ``songmid`` -- confirm.
    url = 'https://u.y.qq.com/cgi-bin/musicu.fcg?-=getplaysongvkey%s&g_tk=469196449&loginUin=2350904752&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data={"req":{"module":"CDN.SrfCdnDispatchServer","method":"GetCdnDispatch","param":{"guid":"5651047720","calltype":0,"userip":""}},"req_0":{"module":"vkey.GetVkeyServer","method":"CgiGetVkey","param":{"guid":"5651047720","songmid":["%s"],"songtype":[0],"uin":"2350904752","loginflag":1,"platform":"20"}},"comm":{"uin":2350904752,"format":"json","ct":24,"cv":0}}' % (pid, songmid)
    raw = url_open(url)
    print('音乐密匙链接:\n')
    print(url)
    print('\n')
    text = html_code(raw)

    purl_at = text.find('purl')
    tag_at = text.find('fromtag=66')
    if purl_at == -1 or tag_at == -1:
        # Nothing to extract: report "no link" rather than slicing with the
        # -1 that find() returns.
        return base
    # When start >= stop the slice is empty, which also yields the bare host.
    return base + text[purl_at + 8:tag_at + 10]

   
def _join_texts(value):
    """Collapse an XPath ``text()`` result (or a plain string) into one string."""
    if isinstance(value, str):
        return value.strip()
    return ','.join(part.strip() for part in value if part.strip())


def save_m(url):
    """Download *url* and save it as ``"<song name> <singer>.mp3"``.

    The file name is built from the module globals ``mname`` and ``msinger``,
    which ``get_albummid`` sets; that function must have run first. The file
    is written to the current working directory.

    Args:
        url: Direct URL of the audio stream.
    """
    # mname/msinger are XPath result lists, so the original
    # ``mname + ' ' + msinger`` raised TypeError (list + str).
    filename = '%s %s' % (_join_texts(mname), _join_texts(msinger))
    # The title comes from a web page: replace characters that are path
    # separators or otherwise invalid in file names on common platforms.
    filename = re.sub(r'[\\/:*?"<>|]', '_', filename)
    data = url_open(url)
    with open('%s.mp3' % filename, 'wb') as f:
        f.write(data)
    print('%s下载完毕……' % filename)
      
def main():
    """Prompt for a keyword, then download the first matching song.

    Steps: search for the keyword, take the first song page, look up its
    album id and pid, request the stream URL and save the audio. When no
    stream URL is returned the user is asked to paste the ``purl`` value by
    hand.
    """
    stream_host = 'http://ws.stream.qqmusic.qq.com/'

    musickey = input('请输入歌曲关键字:')
    urllist = get_url(musickey)
    if not urllist:
        print('未找到相关歌曲!')
        return

    # Only the first hit is used: the helpers below need one URL / one id,
    # not the whole result list (which is what the original passed on).
    # A song page looks like https://y.qq.com/n/yqq/song/004LBwRJ2ohY7s.html
    song_url = urllist[0]
    albummid = get_albummid(song_url)  # album id, e.g. "003iFCqj1rZ76E"

    # The song id is the page's file name without the ".html" extension.
    songmid = song_url.rsplit('/', 1)[-1]
    if songmid.endswith('.html'):
        songmid = songmid[:-len('.html')]

    pid = pid_1(albummid)
    url = vkey(pid, songmid)
    print('音乐链接', url)

    if url != stream_host:
        save_m(url)
        return

    # vkey() returned the bare host, i.e. no purl was available.
    print('\n该歌曲为vip专属!')
    purl = input('请手动进入链接切入purl密匙:').strip()
    vipurl = stream_host + purl
    if vipurl == stream_host:
        print('该歌曲已经下架!')
    else:
        save_m(vipurl)


if __name__ == "__main__":
    main()
   

widd2004 发表于 2020-3-17 22:57:09

哈哈,我今天也试了试,爬了许巍 Top 200,不过里面的下载地址好像没法下载。

HCF 发表于 2020-3-18 11:58:32

widd2004 发表于 2020-3-17 22:57
哈哈,我今天也试了试,爬了许巍top200,好像里面那个下载地址没法下载

那应该是里面那个 vkey 的问题。这个数据很难找,很多都用不了,要用 "purl": 后面的值才行。
页: [1]
查看完整版本: 爬取QQ音乐