On the Douban music tag page for 流行 (pop), my scraper only picks up the title and link; it gets no artist and no release date.

import requests
from bs4 import BeautifulSoup
import xlwt
url = "https://music.douban.com/tag/%E6%B5%81%E8%A1%8C?start=260&type=T"
headers = {'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36','Cookie':
'll="118254"; bid=hUyLZc0IQ-0; __utmc=30149280; __utmz=30149280.1704177535.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_id.100001.afe6=e3aa424645194a40.1704178112.; __yadk_uid=vQ2OnDmCyRJ7MgONjIYqxroh3kmJOC0n; _vwo_uuid_v2=DC4889CAE9BF69AB947F1C739A359D3B1|c7a477b87f67bc643d65031d1e233036; dbcl2="276885079:IyaMMU84wwk"; ck=y-tF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.27688; ap_v=0,6.0; _pk_ref.100001.afe6=%5B%22%22%2C%22%22%2C1704181059%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.afe6=1; __utma=30149280.678212503.1704177535.1704177535.1704181060.2; __utmt=1;__utmb=30149280.3.9.1704181060'}
r=requests.get(url,headers=headers)
content=r.content.decode('utf-8')
with open('douban.html','wb') as file:
    file.write(r.content)
title_list=[]
href_list=[]
for i in range(0,400,20):
    url = 'https://music.douban.com/tag/%E6%B5%81%E8%A1%8C?start={i}&type=T'
    soup=BeautifulSoup(content,'lxml')
    divs=soup.find_all('div',class_="pl2")
    for div in divs:
        href=div.find('a')['href']
        href_list.append(href)
        title=div.find('a').text
        title_list.append(title)
print(href_list)
print(title_list)
wb=xlwt.Workbook()
sheet1=wb.add_sheet('豆瓣音乐')
sheet1.write(0,0,'标题')
sheet1.write(0,1,'链接')
for row in range(1,len(title_list)+1,1):
    sheet1.write(row,0,title_list[row-1])
    sheet1.write(row,1,href_list[row-1])
wb.save(r'C:\Users\admin1\Desktop\34qc.xls')
There are a few problems with the code. First, the page request and parsing need to go inside the loop; as written, only one URL is ever fetched, so the same page is parsed on every iteration. Second, the URL built inside the loop is a plain string, so the {i} placeholder is never substituted; it has to be an f-string for each value of start to request a different page. It is also worth stripping the whitespace around each title. Here is the revised code:
import requests
from bs4 import BeautifulSoup
import xlwt
url = "https://music.douban.com/tag/%E6%B5%81%E8%A1%8C?start=260&type=T"
headers = {'User-Agent':
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36','Cookie':
'll="118254"; bid=hUyLZc0IQ-0; __utmc=30149280; __utmz=30149280.1704177535.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_id.100001.afe6=e3aa424645194a40.1704178112.; __yadk_uid=vQ2OnDmCyRJ7MgONjIYqxroh3kmJOC0n; _vwo_uuid_v2=DC4889CAE9BF69AB947F1C739A359D3B1|c7a477b87f67bc643d65031d1e233036; dbcl2="276885079:IyaMMU84wwk"; ck=y-tF; push_noty_num=0; push_doumail_num=0; __utmv=30149280.27688; ap_v=0,6.0; _pk_ref.100001.afe6=%5B%22%22%2C%22%22%2C1704181059%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.afe6=1; __utma=30149280.678212503.1704177535.1704177535.1704181060.2; __utmt=1;__utmb=30149280.3.9.1704181060'}
title_list = []
href_list = []
for i in range(0, 400, 20):
    url = f'https://music.douban.com/tag/%E6%B5%81%E8%A1%8C?start={i}&type=T'
    r = requests.get(url, headers=headers)  # fetch every page, 20 albums at a time
    soup = BeautifulSoup(r.content, 'lxml')
    divs = soup.find_all('div', class_="pl2")
    for div in divs:
        href = div.find('a')['href']
        href_list.append(href)
        title = div.find('a').text.strip()  # strip the whitespace around the title
        title_list.append(title)
wb = xlwt.Workbook()
sheet1 = wb.add_sheet('豆瓣音乐')
sheet1.write(0, 0, '标题')
sheet1.write(0, 1, '链接')
for row in range(1, len(title_list) + 1, 1):
    sheet1.write(row, 0, title_list[row-1])
    sheet1.write(row, 1, href_list[row-1])
wb.save(r'C:\Users\admin1\Desktop\34qc.xls')
In the revised code, the request is sent and the HTML is parsed inside the loop, with the URL built as an f-string so every page from start=0 to start=380 is actually fetched; the title and link are appended to their respective lists, and each pair is then written to its own row of the Excel sheet.
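The question also asked for the artist and the release date, which neither version collects. Below is a minimal sketch for a single page, under the assumption (not verified here) that each entry on the tag page has a <p class="pl"> element near the title link whose text looks like "artist / release date / ...":

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0'}  # or reuse the full headers from above
url = 'https://music.douban.com/tag/%E6%B5%81%E8%A1%8C?start=0&type=T'
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'lxml')

rows = []
for div in soup.find_all('div', class_='pl2'):
    link = div.find('a')
    title = link.text.strip()
    href = link['href']
    # look for the detail line inside the div first, then fall back to the next <p class="pl">
    info = div.find('p', class_='pl') or div.find_next('p', class_='pl')
    parts = [p.strip() for p in info.get_text().split('/')] if info else []
    artist = parts[0] if parts else ''            # assumed: first field is the artist
    release = parts[1] if len(parts) > 1 else ''  # assumed: second field is the release date
    rows.append((title, href, artist, release))
print(rows)

If that guess about the markup is wrong, adjust the selector to whatever douban.html actually contains; once each row holds four fields, two extra sheet1.write calls per row (columns 2 and 3) will put the artist and release date into the spreadsheet.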
The reply above comes from the FishC AI assistant; if it did not fully resolve your problem, please continue to follow up.