|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
显示错误如下:
Traceback (most recent call last):
File "C:/Users/mct檬椿123/Desktop/12爬虫.py", line 110, in <module>
main()
File "C:/Users/mct檬椿123/Desktop/12爬虫.py", line 26, in main
datalist = getdata(baseurl)
File "C:/Users/mct檬椿123/Desktop/12爬虫.py", line 35, in getdata
url = baseurl + str(i*25)
NameError: name 'baseurl' is not defined
- from bs4 import BeautifulSoup #网页解析
- import re #正则表达式,进行文字匹配
- import urllib.request, urllib.error
- import xlwt#进行excel操作
- import sqlite3#进行SQlite数据库操作
- #影片详情链接的规则
- findlink = re.compile(r'<a href="(.*?)">')#创建正则表达式对象,表示规则(字符串模式)
- #影片图片的链接
- findimgscr = re.compile(r'<img.*src="(.*?)"',re.S) #re.S让换行符包含在字符中
- #影片片名
- findtitle = re.compile(r'<span class="title">(.*)</span>')
- #影片的评分
- findrating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
- #找到评价人数
- findnum = re.compile(r'<span>(\d*)人评价</span>')
- #找到概况
- findgk = re.compile(r'<span class="inq">(.*)</span>')
- #找到影片的相关内容
- findabout = re.compile(r' <p class="">(.*?) </p>,re.S')
- def main():
- baseurl = "https://movie.douban.com/top250?start="
- #爬取网页
- datalist = getdata(baseurl)
- # 前面的为用excel表格存储
- savepath = "豆瓣电影top250.xls"
- savedata(datalist,savepath)
- #爬取网页
- def getdata(basurl):
- datalist = []
- for i in range(0,10):#调用获取页面信息的函数,10次
- url = baseurl + str(i*25)
- html = askURL(url)#保存获取到的网页源码
- #逐一解析数据
- soup = BeautifulSoup(html,"html.parser")
- for item in soup.find_all('div',class_="item"):#查找符合要求的字符串,形成列表
- #测试查看电影item全部信息
- data = []#保存一部电影的全部信息
- item = str(item)
- # 获取影片的超链接
- link = re.findall(findlink,item)[0] #re库用来通过正则表达式查找指定的字符串
- imgscr = re.findall(findimgscr,item)[0] #添加图片
- data.append(findimgscr)
- titles = re.findall(findtitle,item)[0]#片名可能只有一个中文名,没有外文
- if (len(title==2)):
- ctitle = title[0]
- data.append(ctitle)#添加中文名
- otitle = title[1].replace("/","")#去掉无关的符号
- data.append(otitle)#添加外国名
- else:
- data.append(title[0])
- data.append('')#外国名字留空
- rating = re.findall(findrating,item)[0]#添加评分
- data.append(rating[0])
- num =re.findall(findnum,item)[0]
- data.append(num[0])#添加评价人数
- gk = re.findall(findgk,item)
- if len(gk)!= 0:
- gk =gk[0].replace("。","")
- data.append(gk) # 添加概况
- else:
- data.append(" ")
- about = re.findall(findabout,item)[0]
- about = re.sub('<br(\s+)?/>>(\s+?)'," ",about)#去掉<br/>
- bout = re.sub('/'," ",about)#替换/
- data.append(about.strip())#去掉前面的空格
- datalist.append(data)#把处理好的一部电影信息放入datalist
- return datalist
- #得到指定一个url的网页内容
- def askURL(url):
- head = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Mobile Safari/537.36"}
- request = urllib.request.Request(url,headers= head)
- html = ""
- try:
- response = urllib.request.urlopen(request)
- html = response.read().decode("utf-8")
- except urllib.error.URLError as e:
- if hasattr(e,"code"):
- print(e.code)
- if hasattr(e,"reason"):
- print(e.reason)
- return html
- #保存数据
- #用excel
- def savedata(savepath):
- print("savepath……")
- book = xlwt.Workbook(encoding="utf-8",style_compression=0)#创建workbook对象
- sheet = workbook.add_sheet('豆瓣电影top250',cell_overwrite_ok=True)#创建工作表
- col = ("电影详情链接","图片链接","影片中文名","影片外国名","评分","评价数","概况","相关信息")
- for i in range(0,8):
- sheet.write(0,i,col[i])#列名
- for i in range(0,250):
- #print("第%d条"%(i+1))
- data = datalist[i]
- for j in range(0,8):
- sheet.write(i+1,j,data[j])#数据
- book.save(savepath)#保存
- if __name__ == "__main__": #调用函数,当程序执行时
- main()
- #用数据库时
- #if __name__ == "__main__":
- # init_db("movietest,db")
- print("爬取完成")
- 哪位好心人可以帮我看看baseurl那里为什么错了吗,谢谢了,刚刚接触爬虫,不大懂,搞了好久
复制代码
第32行错了,def getdata(basurl): 里面的形参 baseurl 写成了basurl,少了一个e
|
|