This is my first attempt. I'm still not very comfortable with PyCharm, and I couldn't work out how to use the database tooling in the Community Edition, so I just saved the results to an Excel file.
# -*- coding: utf-8 -*-
# @Time : 2021/10/1 15:57
# @Author : 有点冷丶
# @File : pachong.py
# @Software: PyCharm

# Import the modules
from bs4 import BeautifulSoup  # HTML parsing
import re                      # regular expressions, for text matching
import urllib.request          # build the URL request and fetch the page data
import urllib.error
import xlwt                    # Excel operations
import sqlite3                 # SQLite database operations

def main():
    baseurl = "http://movie.douban.com/top250?start="
    # 1. Crawl the pages
    datalist = getData(baseurl)
    # 3. Save the data (step 2, parsing, happens inside getData)
    savepath = r".\豆瓣电影top250.xls"
    saveData(datalist, savepath)     # save to an Excel sheet
    dbpath = "movie250.db"
    # savedata2db(datalist, dbpath)  # save to the database

# Regex for the movie detail-page link
findLink = re.compile(r'<a href="(.*?)">')  # compiled pattern object describing the matching rule
# Regex for the poster image link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
# Movie title
findname = re.compile(r'<span class="title">(.*?)</span>')
# Rating
finddafen = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of ratings
findpeople = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Related info (director, cast, year, ...)
finddy = re.compile(r'<p class="">(.*?)</p>', re.S)

# Crawl the pages
def getData(baseurl):
    datalist = []
    for i in range(0, 10):  # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askURL(url)  # source code of the fetched page
        # print(html)
        # 2. Parse the data item by item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):  # find all matching tags, returned as a list
            # print(item)  # inspect the full item of one movie
            data = []      # holds all information of one movie
            item = str(item)
            # Detail-page hyperlink
            link = re.findall(findLink, item)[0]
            data.append(link)    # add the link
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)  # add the image link
            names = re.findall(findname, item)
            if len(names) == 2:
                cname = names[0]  # Chinese title
                data.append(cname)
                oname = names[1].replace("/", "")  # strip the separator character
                data.append(oname)  # foreign title
            else:
                data.append(names[0])
                data.append('')  # leave the foreign title empty
            dafen = re.findall(finddafen, item)[0]
            data.append(dafen)   # add the rating
            people = re.findall(findpeople, item)[0]
            data.append(people)  # add the number of ratings
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
                data.append(inq)  # add the one-line summary
            else:
                data.append("")   # leave it empty
            dy = re.findall(finddy, item)[0]  # related info
            dy = re.sub(r'<br(\s+)?/>(\s+)?', " ", dy)
            dy = re.sub('/', " ", dy)
            data.append(dy.strip())  # strip leading/trailing whitespace
            datalist.append(data)    # store the processed record in datalist
    # print(datalist, "\n")
    return datalist

# Fetch the content of the page at one given URL
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
    }
    # user-agent header, disguise the request as a normal browser
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

# Save the data to Excel
def saveData(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)      # create a workbook object
    sheet = book.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)  # create a worksheet
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])  # column headers
    for i in range(len(datalist)):
        print("第%d条" % (i + 1))
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)  # save the workbook

# Save the data to the database
def savedata2db(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:
                continue  # score and rated are numeric, no quoting needed
            data[index] = '"' + data[index] + '"'
        sql = '''
            insert into movie250(
            info_link,pic_link,cname,ename,score,rated,introduction,info)
            values(%s)''' % ",".join(data)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()

# Create the database and the movie250 table
def init_db(dbpath):
    sql = '''
        create table movie250
        (
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        ename varchar,
        score numeric,
        rated numeric,
        introduction text,
        info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()

if __name__ == "__main__":
    main()
    # init_db("test.db")  # test the database setup
    print("爬取完毕")