|
2鱼币
报错:IndexError: list index out of range 为什么?
- import requests
- import bs4
- import time
- import sys
- import openpyxl
- def open_url(url):
- proxies = {"http": "119.179.172.243:8060", "https": "124.167.248.230:3128"}
- headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
- }
- req = requests.get(url, headers=headers, proxies=proxies)
- return req
- def find_books(req):
- bs = bs4.BeautifulSoup(req.text, "html.parser")
- # 书名
- booknames = []
- books = bs.find_all("div", class_="info")
- for each in books:
- booknames.append(each.h2.a.text.strip().replace(" ", "").replace("\n", ""))
- # print(booknames)
- # 评分
- scores = []
- score = bs.find_all("span", class_="rating_nums")
- for each in score:
- scores.append(each.text)
- # print(scores)
- # 出版信息
- pubs = []
- pub = bs.find_all("div", class_="pub")
- for each in pub:
- pubs.append(each.text.strip().replace("\n", ""))
- # print(pubs)
- # url
- urls = []
- url = bs.find_all("div", class_="info")
- for each in url:
- href = each.find_all('a')
- urls.append(href[0].get('href'))
- # print(urls)
- result = []
- length = len(booknames)
- for i in range(length):
- result.append([booknames[i], scores[i], pubs[i], urls[i]])
- # print(result)
- return result
- # 查页数
- def find_pages(req):
- bs = bs4.BeautifulSoup(req.text, 'html.parser')
- page = bs.find("span", class_="next").previous_sibling.previous_sibling.text
- return int(page)
- def save_to_excel(result):
- wb = openpyxl.Workbook()
- ws = wb.active
- ws.append(['书名', '评分', '出版信息', '相关链接'])
- for each in result:
- ws.append(each)
- wb.save("豆瓣图书.xlsx")
- def main():
- host = "https://book.douban.com/tag/%E7%BC%96%E7%A8%8B"
- res = open_url(host)
- depth = find_pages(res)
- result = []
- for i in range(depth):
- url = host + "?start=" + str(i * 20) + "&type=T"
- req = open_url(url)
- result.extend(find_books(req))
- now_jd = ((i + 1) / depth) * 100
- sys.stdout.write(
- "\r爬取中 %d%% [%s%s] 第%d页" % (now_jd, '>' * int((now_jd) / 2), '-' * int((100 - now_jd) / 2),(i + 1)))
- sys.stdout.flush()
- time.sleep(1.5)
- print("爬取完成!")
- print(result)
- '''with open("豆瓣图书.txt", "w", encoding="utf-8")as f:
- for each in result:
- f.write(each)'''
- save_to_excel(result)
- if __name__ == '__main__':
- main()
复制代码
那不用想了,你的列表是空的
你之前就没添加上元素,你用审查元素得到的渲染后代码和源代码不一致,你去源代码看看什么格式,有的可能还通过JS函数等等
|
最佳答案
查看完整内容
那不用想了,你的列表是空的
你之前就没添加上元素,你用审查元素得到的渲染后代码和源代码不一致,你去源代码看看什么格式,有的可能还通过JS函数等等
|