python top250豆瓣图书 indexerror
<div class="blockcode"><blockquote># coding=utf-8# @Time : 2021/12/26 15点41分
# @Author :
# @File :
# @Software:PyCharm
import re# 正则表达式,进行文字匹配
import urllib.error# 制定url,获取网页数据
import urllib.request
import xlwt# 进行excel操作
from bs4 import BeautifulSoup# 网页解析,获取数据
def main():
baseurl = "https://book.douban.com/top250?start="
#1爬取网页
datalist = getData(baseurl)
savepath = "豆瓣图书top250战毅.xls"
#3.保存数据
saveData(datalist, savepath)
#askURL('https://book.douban.com/top250?start=')
findlink = re.compile(r'<a href="(.*?)" onclick=""moreurl') #生成创建正则表达式的规则(字符串模式)
findImgSrc = re.compile(r'<img src="(.*?)"', re.S)
findCTitle = re.compile(r'"" title="(.*)">')
#findFTitle = re.compile(r'<span style="font-size:12px;">(.*)</span>', re.S)
findRating = re.compile(r'<span class="rating_nums">(.*)</span>')
findJudge = re.compile(r'<span class="pl">((\d)人评价)</span>')
findInq = re.compile(r'<span class="inq">(.*)</span>', re.S)
findBd = re.compile(r'<p class="pl">(.*)</p>', re.S)
#爬取网页
def getData(baseurl):
datalist = []
for i inrange(0, 10): #调用页面信息的函数,10次
url = baseurl + str(i*25)
html = askURL(url) #保存获取到的源码
# 2.逐一解析数据
soup = BeautifulSoup(html, "html.parser")
for item in soup.find_all('tr', class_="item"):
#print(item)#测试:查看全部信息
data = [] #保存一本书全部信息
item = str(item)
#影片详情链接
#link = re.findall(findlink, item) #re库来通过正则表达式找到指定的字符串
#print(link)
link = re.findall(findlink, item)#
data.append(link)
imgSrc = re.findall(findImgSrc, item)
data.append(imgSrc)
titles = re.findall(findCTitle, item)
if (len(titles) == 2):
ctitle = titles
data.append(ctitle)
Ftitle = titles.replace("/", "")
data.append(Ftitle)
else:
data.append(titles)
data.append('')
rating = re.findall(findRating, item)
data.append(rating)
judgeM = re.findall(findJudge, item)#
data.append(judgeM)
inq = re.findall(findInq, item)
if (len(inq) != 0):
inq = inq.replace("。", "")
data.append(inq)
else:
data.append("")
bd = re.findall(findBd, item)
bd = re.sub('<br(\s+)?/>(\s+)?', " ", bd)
bd = re.sub('/', " ", bd)
data.append(bd.strip())
datalist.append(data)
print(datalist)
return datalist
#得到指定的一个url的网页内容
def askURL(url): #模拟浏览器头部信息
head = {"User-Agent": 'Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
} #用户代理我们是什么类型的浏览器(本质上我们能根据啥水准的内容)
request = urllib.request.Request(url, headers=head)
html = ''
try:
response = urllib.request.urlopen(request)
html = response.read().decode("utf-8")
#print(html)
except urllib.error.URLError as e:
if hasattr(e, "code"):
print(e.code)
if hasattr(e, "reason"):
print(e.reason)
return html
def saveData(datalist, savepath):
print('save....')
book = xlwt.Workbook(encoding="utf-8", style_compression=0)
sheet = book.add_sheet('豆瓣图书top250', cell_overwrite_ok=True)
col = ("图书详情链接", "图片详情链接", "中文书名", "外国书名", "评分", "评价数", "概况", "相关信息")
for i in range(0, 8):
sheet.write(0, i, col) #列名
for i in range (0, 250):
print("第%d条" % (i+1))
data = datalist
for j in range(0, 8):
sheet.write(i+1, j, data) #数据
book.save(savepath) #保存
if __name__ == "__main__":
main()
print('爬取完毕')
请问大佬们这个是什么原因
C:\Users\虫宝\AppData\Local\Programs\Python\Python37\python.exe E:/Users/虫宝/untitled13/test6.py
[[['https://book.douban.com/subject/24531956/'], 'https://img9.doubanio.com/view/subject/s/public/s29101586.jpg', '哈', '', '9.7', [], '从9¾站台开始的旅程', 'J.K.罗琳 (J.K.Rowling) 苏农 人民文学出版社 2008-12-1 498.00元< p>\n<div class="star clearfix">\n<span class="allstar50">< span>\n<span class="rating_nums">9.7< span>\n<span class="pl">(\n 61057人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">从9¾站台开始的旅程< span>'], [['https://book.douban.com/subject/1027191/'], 'https://img9.doubanio.com/view/subject/s/public/s1768916.jpg', '历', '', '9.0', [], '窥见美国社会的一扇窗', '林达 生活·读书·新知三联书店 1997-5 19.00元< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">9.0< span>\n<span class="pl">(\n 40895人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">窥见美国社会的一扇窗< span>'], [['https://book.douban.com/subject/3162991/'], 'https://img2.doubanio.com/view/subject/s/public/s3219163.jpg', '艺', '', '9.6', [], '从最早的洞窟绘画到当今的实验艺术', '[英] 贡布里希 (Sir E.H.Gombrich) 范景中 广西美术出版社 2008-04 280.00< p>\n<div class="star clearfix">\n<span class="allstar50">< span>\n<span class="rating_nums">9.6< span>\n<span class="pl">(\n 20397人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">从最早的洞窟绘画到当今的实验艺术< span>'], [['https://book.douban.com/subject/4238362/'], 'https://img1.doubanio.com/view/subject/s/public/s4243447.jpg', '送', '', '8.6', [], '在这本书里,被“审视”的东西杂七杂八', '刘瑜 上海三联书店 2010-1 25.00元< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">8.6< span>\n<span class="pl">(\n 135348人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">在这本书里,被“审视”的东西杂七杂八< span>'], [['https://book.douban.com/subject/3026879/'], 'https://img9.doubanio.com/view/subject/s/public/s2990934.jpg', '爱', '', '8.8', [], '谦恭地、勇敢地、真诚地和有纪律地爱他人', '[美] 艾·弗洛姆 李健鸣 上海译文出版社 2008-4 15.00元< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">8.8< span>\n<span class="pl">(\n 37817人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">谦恭地、勇敢地、真诚地和有纪律地爱他人< span>'], [['https://book.douban.com/subject/1020961/'], 'https://img1.doubanio.com/view/subject/s/public/s26237958.jpg', '棋', '王', '8.8', [], '我从未真正见过火,也未见过毁灭,更不知新生', '阿城 作家出版社 1999-10 13.00< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">8.8< span>\n<span class="pl">(\n 29311人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">我从未真正见过火,也未见过毁灭,更不知新生< span>'], [['https://book.douban.com/subject/1013416/'], 'https://img1.doubanio.com/view/subject/s/public/s23579217.jpg', '雷', '雨', '8.6', [], '一幕人生大悲剧,在一个雷雨夜爆发', '曹禺 人民文学出版社 1999-05 9.20< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">8.6< span>\n<span class="pl">(\n 69660人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">一幕人生大悲剧,在一个雷雨夜爆发< span>'], [['https://book.douban.com/subject/1083762/'], 'https://img9.doubanio.com/view/subject/s/public/s1134166.jpg', '人', '', '8.7', [], '十四个影响人类文明的瞬间', '[奥] 斯蒂芬·茨威格 舒昌善 广西师范大学出版社 2004-8 18.00元< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">8.7< span>\n<span class="pl">(\n 27005人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">十四个影响人类文明的瞬间< span>'], [['https://book.douban.com/subject/3012517/'], 'https://img1.doubanio.com/view/subject/s/public/s4045138.jpg', '小', '', '8.6', [], '两个女人的史诗', '严歌苓 作家出版社 2008-4 28.00元< p>\n<div class="star clearfix">\n<span class="allstar45">< span>\n<span class="rating_nums">8.6< span>\n<span class="pl">(\n 30331人评价\n )< span>\n< div>\n<p class="quote" style="margin: 10px 0; color: #666">\n<span class="inq">两个女人的史诗< span>'], [['https://book.douban.com/subject/10555486/'], 'https://img2.doubanio.com/view/subject/s/public/s8972073.jpg', '分', '', '8.7', [], '', '[意] 伊塔洛·卡尔维诺 吴正仪 译林出版社 2012-4-1 20.00元']]
save....
Traceback (most recent call last):
File "E:/Users/虫宝/untitled13/test6.py", line 127, in <module>
main()
File "E:/Users/虫宝/untitled13/test6.py", line 21, in main
saveData(datalist, savepath)
File "E:/Users/虫宝/untitled13/test6.py", line 120, in saveData
data = datalist
IndexError: list index out of range
第1条
第2条
第3条
第4条
第5条
第6条
第7条
第8条
第9条
第10条
第11条
进程已结束,退出代码1
页:
[1]