爬虫写入xls时报错

Tihool · 发表于 2022-4-17 20:54:40

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

import re
from bs4 import BeautifulSoup
import urllib.request
import xlwt
url = 'https://movie.douban.com/top250?start='
findlink = re.compile(r'<a href="(.*?)">')#电影链接
findimg = re.compile(r'<img .* src="(.*?)"/>',re.S)#图片链接
findname = re.compile(r'span class="title">(.*?)')#电影名
findpeople = re.compile(r'(\d*)人评价')#评价人数
findpoint = re.compile(r'(.*)')#评分
findother = re.compile(r'(.*?)',re.S) #获取导演等信息
findothers = re.compile(r'(.*?)') #获取其他信息
def main():
#解析网页源码
datas = Gteurl(url)
#保存数据
Savedata(datas)
def Gteurl(url):
for i in range(0,10):
 html = url + str(i*25)
 response = askURL(html)
 bs = BeautifulSoup(response,'html.parser')
 for item in bs.find_all('div',class_ = 'item'):
 data = []
 item = str(item)
 #1
 name = re.findall(findname, item)
 if len(name) == 2:
 cname = name[0]
 data.append(cname)
 oname = name[1].replace("\xa0/\xa0"," ")
 data.append(oname)
 else:
 data.append(name[0])
 data.append(' ')

 #2
 link = re.findall(findlink,item)[0]
 data.append(link)
 #7
 Img = re.findall(findimg,item)[0]
 Img = re.sub(r'width="100','',Img)
 Img = re.sub(r'"', '', Img)

 data.append(Img)
 #3
 nums = re.findall(findpeople,item)[0]
 data.append(nums)
 #6
 point = re.findall(findpoint,item)[0]

 data.append(point)
 #4
 other = re.findall(findother,item)[0]
 new_other = re.sub(' (\s+)?',' ',other)
 new_other = re.sub("\n",'',new_other)
 new_other = re.sub("\xa0", ' ', new_other)

 data.append(new_other)
 #5
 others = re.findall(findothers,item)
 if len(others) !=0:
 data.append(others[0])
 else:
 data.append(' ')

return data

def askURL(url): #封装数据，得到网页源码
head = {
 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39'
}
req =urllib.request.Request(url = url,headers=head,)#封装的信息，构建请求对象
response = urllib.request.urlopen(req)#发出请求
return response
def Savedata(data):
col = ('电影中文名','电影外国名','电影链接','图片链接','评价人数','评分','导演等信息','其他信息')
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('sheet1')
for a in range(8):
 worksheet.write(0,a,col[a])
for i in range(0,250):
 Data = data[i]
 for j in (0,8):
 worksheet.write(i+1,j,Data[j])#此处报错，string index out of range
worksheet.save('豆瓣250.xls')
if __name__ == '__main__':
main()
print('爬取完毕！')

isdkz · 发表于 2022-4-18 18:43:26

import re

from bs4 import BeautifulSoup

import urllib.request

import xlwt

url = 'https://movie.douban.com/top250?start='

findlink = re.compile(r'<a href="(.*?)">')#电影链接

findimg = re.compile(r'<img .* src="(.*?)"/>',re.S)#图片链接

findname = re.compile(r'span class="title">(.*?)')#电影名

findpeople = re.compile(r'(\d*)人评价')#评价人数

findpoint = re.compile(r'(.*)')#评分

findother = re.compile(r'(.*?)',re.S) #获取导演等信息

findothers = re.compile(r'(.*?)') #获取其他信息

def main():

#解析网页源码

datas = Gteurl(url)

#保存数据

Savedata(datas)

def Gteurl(url):

datas = [] # 加了这行

for i in range(0,10):

 html = url + str(i*25)

 response = askURL(html)

 bs = BeautifulSoup(response,'html.parser')

 for item in bs.find_all('div',class_ = 'item'):

 data = []

 item = str(item)

 #1

 name = re.findall(findname, item)

 if len(name) == 2:

 cname = name[0]

 data.append(cname)

 oname = name[1].replace("\xa0/\xa0"," ")

 data.append(oname)

 else:

 data.append(name[0])

 data.append(' ')

 #2

 link = re.findall(findlink,item)[0]

 data.append(link)

 #7

 Img = re.findall(findimg,item)[0]

 Img = re.sub(r'width="100','',Img)

 Img = re.sub(r'"', '', Img)

 data.append(Img)

 #3

 nums = re.findall(findpeople,item)[0]

 data.append(nums)

 #6

 point = re.findall(findpoint,item)[0]

 data.append(point)

 #4

 other = re.findall(findother,item)[0]

 new_other = re.sub(' (\s+)?',' ',other)

 new_other = re.sub("\n",'',new_other)

 new_other = re.sub("\xa0", ' ', new_other)

 data.append(new_other)

 #5

 others = re.findall(findothers,item)

 if len(others) !=0:

 data.append(others[0])

 else:

 data.append(' ')

 datas.append(data) # 加了这行

return datas # 改了这行

def askURL(url): #封装数据，得到网页源码

head = {

 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39'

}

req =urllib.request.Request(url = url,headers=head,)#封装的信息，构建请求对象

response = urllib.request.urlopen(req)#发出请求

return response

def Savedata(data):

col = ('电影中文名','电影外国名','电影链接','图片链接','评价人数','评分','导演等信息','其他信息')

workbook = xlwt.Workbook(encoding='utf-8')

worksheet = workbook.add_sheet('sheet1')

for a in range(8):

 worksheet.write(0,a,col[a])

for i, _ in enumerate(data): # 改了这行

 Data = data[i]

 for j, v in enumerate(Data): # 改了这行

 worksheet.write(i+1,j,v) # 改了这行

workbook.save('豆瓣250.xls') # 改了这行

if __name__ == '__main__':

main()

print('爬取完毕！')
复制代码

账号		自动登录	找回密码
密码			立即注册