[已解决]Python爬虫练习——爬取QQ音乐精彩评论（编码问题请高人指点！）

Omed · 发表于 2020-2-17 19:06:39

您需要登录才可以下载或查看，没有账号？立即注册

x

2020年2月17日，采用Requests模块以及正则表达式完成了一次爬虫练习…

爬取QQ音乐精彩评论代码如下（欢迎各位指正）：

import requests
import re
#获取网页源代码
def get_code(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page or API endpoint to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    # A desktop Firefox User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0)"
                             " Gecko/20100101 Firefox/73.0"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    return requests.get(url, headers=headers, timeout=timeout).text
# Read the song page and pull the song id and song name out of it.
song_url = input('请输入QQ音乐网页版网址（https://y.qq.com/n/yqq/song/x.html形式）：')
song_doc = get_code(song_url)
id = re.findall('"songid":(.*?),"', song_doc)[0]
name = re.findall('songname":"(.*?)"', song_doc)[0]
tgt = int(input('请输入需要爬取的页数：'))

# Pull the comments out of the JSONP responses, one page per request.
list = []  # comments already handled, so repeats are skipped
num = 0
for number in range(1, tgt):
    comment_url = (
        "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk"
        "=5381&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset"
        "=GB2312¬ice=0&platform=yqq&needNewCode=0&cid=205360772&reqtype"
        "=2&biztype=1&topid=%s&cmd=6&needmusiccrit=0&pagenum=%d&pagesize="
        "15&ct=24&cv=10101010" % (id, number)
    )
    comment_doc = get_code(comment_url)
    targets = re.findall('"rootcommentcontent":"(.*?)"', comment_doc)
    for raw in targets:
        comment = re.sub(r"([em](.*?)[/em])|(\\n)|(\\)|[|]", '', raw)
        if comment in list:
            continue
        # The output file is reopened in append mode for every new comment.
        out = open('热门评论\\' + name + '.txt', 'a')
        try:
            num += 1
            label = str(num)
            list.append(comment)
            # Square brackets are removed only from the text that is written.
            text = re.sub('\[|\]', '', comment)
            out.write('编号' + label + ' ' + text + '\n')
            print('第' + label + '个评论已爬取！')
            out.close()
        except Exception as err:
            print(err)
print('爬取成功！请查看当先文件夹下歌名文本!')

复制代码

有时候会出现这样的错误——

复制代码

看来应该是编码的问题，但未找到解决方案…

最佳答案

zltzlt

2020-2-17 21:02:39

Omed 发表于 2020-2-17 21:02

这样试试：

import requests
import re
#获取网页源代码
def get_code(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page or API endpoint to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response bytes decoded with the UTF-8 codec.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A desktop Firefox User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0)"
                             " Gecko/20100101 Firefox/73.0"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the bytes explicitly instead of relying on requests' guess.
    return response.content.decode("utf-8")
# Read the song page and pull the song id and song name out of it.
song_url = input('请输入QQ音乐网页版网址（https://y.qq.com/n/yqq/song/x.html形式）：')
song_doc = get_code(song_url)
song_id = re.findall('"songid":(.*?),"', song_doc)[0]
name = re.findall('songname":"(.*?)"', song_doc)[0]
tgt = int(input('请输入需要爬取的页数：'))

# Comment endpoint; %s is the song id and %d the page number.
# "&notice=0" is restored here: the pasted code showed "¬ice=0" because the
# page rendering turned "&not" into the "¬" character.
COMMENT_URL = (
    "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk"
    "=5381&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset"
    "=GB2312&notice=0&platform=yqq&needNewCode=0&cid=205360772&reqtype"
    "=2&biztype=1&topid=%s&cmd=6&needmusiccrit=0&pagenum=%d&pagesize="
    "15&ct=24&cv=10101010"
)
COMMENT_RE = re.compile('"rootcommentcontent":"(.*?)"')
# Strips literal [em]...[/em] tags, escaped newlines, stray backslashes and
# pipes. The brackets must be escaped: an unescaped "[em]" is a character
# class matching a single "e" or "m", which deleted ordinary text.
NOISE_RE = re.compile(r"\[em\].*?\[/em\]|\\n|\\|\|")
BRACKET_RE = re.compile(r"\[|\]")

# Pull the comments out of the JSONP responses, one page per request.
seen = set()  # comments already handled, so repeats are skipped
num = 0
# changed: explicit UTF-8 instead of the locale default (gbk in the reported
# traceback), which cannot encode every character found in comments.
# The file is opened once; "with" closes it even if a request or write fails.
with open('热门评论\\' + name + '.txt', 'a', encoding="utf-8") as out_file:
    # range(1, tgt) stopped one page short of the count the user typed.
    # NOTE(review): numbering still starts at 1 as before; confirm the API's
    # pagenum is not 0-based.
    for page in range(1, tgt + 1):
        comment_doc = get_code(COMMENT_URL % (song_id, page))
        for raw in COMMENT_RE.findall(comment_doc):
            comment = NOISE_RE.sub('', raw)
            if comment in seen:
                continue
            seen.add(comment)
            label = str(num + 1)
            try:
                out_file.write('编号' + label + ' ' + BRACKET_RE.sub('', comment) + '\n')
            except (OSError, UnicodeError) as err:
                # Best effort: report the failure and move on.
                print(err)
                continue
            # Only count comments that were actually written.
            num += 1
            print('第' + label + '个评论已爬取！')
print('爬取成功！请查看当先文件夹下歌名文本!')

复制代码

zltzlt · 发表于 2020-2-17 19:37:11

import requests
import re
#获取网页源代码
def get_code(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page or API endpoint to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response bytes decoded with the UTF-8 codec.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A desktop Firefox User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0)"
                             " Gecko/20100101 Firefox/73.0"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the bytes explicitly instead of relying on requests' guess.
    return response.content.decode("utf-8")
# Read the song page and pull the song id and song name out of it.
song_url = input('请输入QQ音乐网页版网址（https://y.qq.com/n/yqq/song/x.html形式）：')
song_doc = get_code(song_url)
id = re.findall('"songid":(.*?),"', song_doc)[0]
name = re.findall('songname":"(.*?)"', song_doc)[0]
tgt = int(input('请输入需要爬取的页数：'))

# Pull the comments out of the JSONP responses, one page per request.
list = []  # comments already handled, so repeats are skipped
num = 0
for number in range(1, tgt):
    comment_url = (
        "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk"
        "=5381&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset"
        "=GB2312¬ice=0&platform=yqq&needNewCode=0&cid=205360772&reqtype"
        "=2&biztype=1&topid=%s&cmd=6&needmusiccrit=0&pagenum=%d&pagesize="
        "15&ct=24&cv=10101010" % (id, number)
    )
    comment_doc = get_code(comment_url)
    targets = re.findall('"rootcommentcontent":"(.*?)"', comment_doc)
    for raw in targets:
        comment = re.sub(r"([em](.*?)[/em])|(\\n)|(\\)|[|]", '', raw)
        if comment in list:
            continue
        # The output file is reopened in append mode for every new comment.
        out = open('热门评论\\' + name + '.txt', 'a')
        try:
            num += 1
            label = str(num)
            list.append(comment)
            # Square brackets are removed only from the text that is written.
            text = re.sub(r'\[|\]', '', comment)
            out.write('编号' + label + ' ' + text + '\n')
            print('第' + label + '个评论已爬取！')
            out.close()
        except Exception as err:
            print(err)
print('爬取成功！请查看当先文件夹下歌名文本!')

复制代码

Omed · 发表于 2020-2-17 20:57:44

zltzlt 发表于 2020-2-17 19:37

复制代码

这一行代码修改过后依然会出现这样的情况...

'gbk' codec can't encode character '\u200b' in position 129: illegal multibyte sequence

复制代码

还是谢谢惹！

zltzlt · 发表于 2020-2-17 20:58:00

Omed 发表于 2020-2-17 20:57
这一行代码修改过后依然会出现这样的情况...

还是谢谢惹！

把完整报错信息发上来。

Omed · 发表于 2020-2-17 21:02:05

zltzlt 发表于 2020-2-17 20:58
把完整报错信息发上来。

C:\Users\asus\AppData\Local\Programs\Python\Python37\python.exe C:/Users/asus/Desktop/爬虫/爬取网易云音乐热门评论/comment.py
请输入QQ音乐网页版网址（https://y.qq.com/n/yqq/song/x.html形式）：https://y.qq.com/n/yqq/song/003BkIIU2kGTDn.html
请输入需要爬取的页数：20
Traceback (most recent call last):
File "C:/Users/asus/Desktop/爬虫/爬取网易云音乐热门评论/comment.py", line 37, in <module>
file.write('编号'+num_st+' '+each+'\n')
UnicodeEncodeError: 'gbk' codec can't encode character '\xb4' in position 61: illegal multibyte sequence
Process finished with exit code 1

复制代码

zltzlt · 发表于 2020-2-17 21:02:39

Omed 发表于 2020-2-17 21:02

这样试试：

import requests
import re
#获取网页源代码
def get_code(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page or API endpoint to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response bytes decoded with the UTF-8 codec.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # A desktop Firefox User-Agent is sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0)"
                             " Gecko/20100101 Firefox/73.0"}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Decode the bytes explicitly instead of relying on requests' guess.
    return response.content.decode("utf-8")
# Read the song page and pull the song id and song name out of it.
song_url = input('请输入QQ音乐网页版网址（https://y.qq.com/n/yqq/song/x.html形式）：')
song_doc = get_code(song_url)
song_id = re.findall('"songid":(.*?),"', song_doc)[0]
name = re.findall('songname":"(.*?)"', song_doc)[0]
tgt = int(input('请输入需要爬取的页数：'))

# Comment endpoint; %s is the song id and %d the page number.
# "&notice=0" is restored here: the pasted code showed "¬ice=0" because the
# page rendering turned "&not" into the "¬" character.
COMMENT_URL = (
    "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg?g_tk"
    "=5381&loginUin=0&hostUin=0&format=jsonp&inCharset=utf8&outCharset"
    "=GB2312&notice=0&platform=yqq&needNewCode=0&cid=205360772&reqtype"
    "=2&biztype=1&topid=%s&cmd=6&needmusiccrit=0&pagenum=%d&pagesize="
    "15&ct=24&cv=10101010"
)
COMMENT_RE = re.compile('"rootcommentcontent":"(.*?)"')
# Strips literal [em]...[/em] tags, escaped newlines, stray backslashes and
# pipes. The brackets must be escaped: an unescaped "[em]" is a character
# class matching a single "e" or "m", which deleted ordinary text.
NOISE_RE = re.compile(r"\[em\].*?\[/em\]|\\n|\\|\|")
BRACKET_RE = re.compile(r"\[|\]")

# Pull the comments out of the JSONP responses, one page per request.
seen = set()  # comments already handled, so repeats are skipped
num = 0
# changed: explicit UTF-8 instead of the locale default (gbk in the reported
# traceback), which cannot encode every character found in comments.
# The file is opened once; "with" closes it even if a request or write fails.
with open('热门评论\\' + name + '.txt', 'a', encoding="utf-8") as out_file:
    # range(1, tgt) stopped one page short of the count the user typed.
    # NOTE(review): numbering still starts at 1 as before; confirm the API's
    # pagenum is not 0-based.
    for page in range(1, tgt + 1):
        comment_doc = get_code(COMMENT_URL % (song_id, page))
        for raw in COMMENT_RE.findall(comment_doc):
            comment = NOISE_RE.sub('', raw)
            if comment in seen:
                continue
            seen.add(comment)
            label = str(num + 1)
            try:
                out_file.write('编号' + label + ' ' + BRACKET_RE.sub('', comment) + '\n')
            except (OSError, UnicodeError) as err:
                # Best effort: report the failure and move on.
                print(err)
                continue
            # Only count comments that were actually written.
            num += 1
            print('第' + label + '个评论已爬取！')
print('爬取成功！请查看当先文件夹下歌名文本!')

复制代码

Omed · 发表于 2020-2-17 21:28:38

zltzlt 发表于 2020-2-17 21:02
这样试试：

这样可以的！非常感谢！

账号		自动登录	找回密码
密码			立即注册