|
发表于 2022-4-18 18:43:26
|
显示全部楼层
import re
from bs4 import BeautifulSoup
import urllib.request
import xlwt

# Base URL of the Douban Top-250 list; the page offset (start=0, 25, ..., 225)
# is appended per page by Gteurl().
url = 'https://movie.douban.com/top250?start='

# Pre-compiled patterns applied to each movie's <div class="item"> HTML snippet.
findlink = re.compile(r'<a href="(.*?)">')  # movie detail-page link
findimg = re.compile(r'<img .* src="(.*?)"/>', re.S)  # poster image link (re.S: tag may span lines)
findname = re.compile(r'span class="title">(.*?)</span>')  # movie title(s)
findpeople = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
findpoint = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')  # average score
findother = re.compile(r'<p class="">(.*?)</p>', re.S)  # director / cast paragraph
findothers = re.compile(r'<span class="inq">(.*?)</span>')  # optional one-line quote
def main():
    """Scrape all ten pages of the Douban Top-250 list and save them to Excel."""
    # Crawl + parse every page into rows, then persist the rows in one pass.
    movie_rows = Gteurl(url)
    Savedata(movie_rows)
def Gteurl(url):
    """Crawl the 10 list pages (start=0..225) and extract one row per movie.

    Parameters
    ----------
    url : str
        Base list URL; the page offset ``i * 25`` is appended for each page.

    Returns
    -------
    list[list[str]]
        One row per movie: [chinese_name, foreign_name, detail_link,
        image_link, rating_count, score, credits_info, one_line_quote].
    """
    def _first(pattern, text, default=''):
        # Guard against page-layout changes: the original code indexed
        # re.findall(...)[0] unconditionally, which raises IndexError as soon
        # as one pattern finds nothing in an item's HTML.
        found = re.findall(pattern, text)
        return found[0] if found else default

    datas = []
    for page in range(10):  # 10 pages x 25 movies = 250 entries
        page_url = url + str(page * 25)
        response = askURL(page_url)
        soup = BeautifulSoup(response, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            data = []
            item = str(item)

            # Titles: two <span class="title"> entries mean Chinese + foreign
            # name; a single entry means there is no foreign name.
            names = re.findall(findname, item)
            if len(names) == 2:
                data.append(names[0])
                data.append(names[1].replace("\xa0/\xa0", " "))
            else:
                data.append(names[0] if names else ' ')
                data.append(' ')

            # Detail-page link.
            data.append(_first(findlink, item))

            # Poster link: strip the stray width-attribute fragment and quotes
            # that the loose findimg pattern captures along with the URL.
            img = _first(findimg, item)
            img = re.sub(r'width="100', '', img)
            img = re.sub(r'"', '', img)
            data.append(img)

            # Number of ratings, then average score.
            data.append(_first(findpeople, item))
            data.append(_first(findpoint, item))

            # Director/cast paragraph: flatten <br/> breaks and whitespace
            # artifacts into single spaces. Raw strings used so `\s`/`\n` are
            # regex escapes, not (deprecated) invalid string escapes.
            other = _first(findother, item)
            other = re.sub(r'<br/>(\s+)?', ' ', other)
            other = re.sub(r'\n', '', other)
            other = re.sub('\xa0', ' ', other)
            data.append(other)

            # Optional one-line quote (<span class="inq">); blank when absent.
            quotes = re.findall(findothers, item)
            data.append(quotes[0] if quotes else ' ')

            datas.append(data)
    return datas
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    A browser-like User-Agent is sent because douban.com rejects the default
    urllib agent. The connection is closed via a context manager (the
    original returned the open response object and leaked it); returning the
    decoded ``str`` is transparent to the caller, since BeautifulSoup accepts
    markup as a plain string just as it accepts a file-like response.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39'
    }
    req = urllib.request.Request(url=url, headers=head)  # request object carrying the spoofed headers
    with urllib.request.urlopen(req) as response:  # ensure the connection is released
        return response.read().decode('utf-8')
def Savedata(data):
    """Write the scraped movie rows to 豆瓣250.xls, with a header row on top."""
    headers = ('电影中文名', '电影外国名', '电影链接', '图片链接',
               '评价人数', '评分', '导演等信息', '其他信息')
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet1')
    # Row 0 holds the column titles.
    for col_idx, title in enumerate(headers):
        sheet.write(0, col_idx, title)
    # Data rows start at row 1, one spreadsheet row per scraped movie.
    for row_idx, row in enumerate(data, start=1):
        for col_idx, value in enumerate(row):
            sheet.write(row_idx, col_idx, value)
    workbook.save('豆瓣250.xls')
# Run the full scrape only when executed as a script (not on import).
if __name__ == '__main__':
    main()
    print('爬取完毕!')
复制代码 |
|