# Focus on the database part
import re
from bs4 import BeautifulSoup
import urllib.request
import sqlite3
url = 'https://movie.douban.com/top250?start='
findlink = re.compile(r'<a href="(.*?)">')  # movie detail link
findimg = re.compile(r'<img .* src="(.*?)"/>', re.S)  # poster image link
findname = re.compile(r'<span class="title">(.*?)</span>')  # movie title
findpeople = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
findpoint = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # rating score
findother = re.compile(r'<p class="">(.*?)</p>', re.S)  # director / cast info
findothers = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote / other info
def main():
    # parse the page source
    datas = Gteurl(url)
    # save the data
    #Savedata(datas)
    #save_txt(datas)
    save_sql(datas)  # here
def save_sql(datas):  # look here
    connect = sqlite3.connect("movie250.db")
    cur = connect.cursor()
    sql = '''
    create table if not exists 电影排行榜
    (电影中文名 varchar(10) primary key,
    电影外国名 varchar(10),
    电影链接 varchar(100),
    图片链接 varchar(100),
    评价人数 int,
    评分 int,
    导演等信息 varchar(100),
    其他信息 varchar(100)
    );
    '''
    cur.execute(sql)
    for each in datas:
        sql = ("insert into 电影排行榜('电影中文名','电影外国名','电影链接','图片链接','评价人数','评分','导演等信息','其他信息')"
               f" values({','.join(map(repr, each))})")
        # f" values({','.join(iter(each))})" -- doesn't map return an iterator object?
        # The elements of each are already strings, so why does using iter raise an error?
        cur.execute(sql)
    connect.commit()
    cur.close()
    connect.close()
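
# A quick sketch of what the two join expressions in save_sql actually produce
# (hypothetical shortened values, just for illustration):
#   each = ['肖申克的救赎', 'The Shawshank Redemption']
#   ','.join(map(repr, each))  ->  "'肖申克的救赎','The Shawshank Redemption'"
#   ','.join(iter(each))       ->  "肖申克的救赎,The Shawshank Redemption"
# repr() wraps every element in quotes, so the generated statement contains valid
# SQL string literals. iter() only yields the raw strings, so join itself works
# fine in Python, but the values reach SQLite unquoted and cur.execute raises an
# OperationalError -- the error comes from the SQL syntax, not from iter.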
def Gteurl(url):
    datas = []
    for i in range(0, 10):
        html = url + str(i*25)
        response = askURL(html)
        bs = BeautifulSoup(response, 'html.parser')
        for item in bs.find_all('div', class_='item'):
            data = []
            item = str(item)
            #1
            name = re.findall(findname, item)
            if len(name) == 2:
                cname = name[0]
                data.append(cname)
                oname = name[1].replace("\xa0/\xa0", " ")
                data.append(oname)
            else:
                data.append(name[0])
                data.append(' ')
            #2
            link = re.findall(findlink, item)[0]
            data.append(link)
            #7
            Img = re.findall(findimg, item)[0]
            Img = re.sub(r'width="100', '', Img)
            Img = re.sub(r'"', '', Img)
            data.append(Img)
            #3
            nums = re.findall(findpeople, item)[0]
            data.append(nums)
            #6
            point = re.findall(findpoint, item)[0]
            data.append(point)
            #4
            other = re.findall(findother, item)[0]
            new_other = re.sub(r'<br/>(\s+)?', ' ', other)
            new_other = re.sub(r"\n", '', new_other)
            new_other = re.sub(r"\xa0", ' ', new_other)
            data.append(new_other)
            #5
            others = re.findall(findothers, item)
            if len(others) != 0:
                data.append(others[0])
            else:
                data.append(' ')
            print(data)
            datas.append(data)  # without this, save_sql would receive an empty list
    return datas
def askURL(url):  # build the request and fetch the page source
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39'
    }
    req = urllib.request.Request(url=url, headers=head)  # request object with the header attached
    response = urllib.request.urlopen(req)  # send the request
    return response.read().decode('utf-8')  # return the decoded page source
if __name__ == '__main__':
    main()
    print('爬取完毕!')
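
As a side note, hand-building the values string with repr relies on Python's quoting rules, which don't always match SQL's (for example a title that itself contains a quote). sqlite3 also accepts ? placeholders and does the quoting and escaping itself. A minimal sketch of save_sql rewritten that way, assuming datas is the same list of 8-element rows returned by Gteurl (only the insert changes):

def save_sql(datas):
    connect = sqlite3.connect("movie250.db")
    cur = connect.cursor()
    cur.execute('''
    create table if not exists 电影排行榜
    (电影中文名 varchar(10) primary key,
    电影外国名 varchar(10),
    电影链接 varchar(100),
    图片链接 varchar(100),
    评价人数 int,
    评分 int,
    导演等信息 varchar(100),
    其他信息 varchar(100)
    );
    ''')
    # one ? per column; the driver binds each value safely
    cur.executemany("insert into 电影排行榜 values(?,?,?,?,?,?,?,?)", datas)
    connect.commit()
    cur.close()
    connect.close()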