利用python爬取豆瓣数据【入门篇】
Python 前置环境:urllib 和 bs4。urllib 属于标准库,随解释器自带。
没有第三方库的朋友可以 手动安装: pip install bs4
需要用到 urllib.request 里的 urlopen 和 Request
还有 urllib 里的 parse,用于对请求参数进行 URL 编码
bs4里的BeautifulSoup
源码:
from urllib.request import urlopen, Request
from urllib import parse
from bs4 import BeautifulSoup
def read(url, page):
    """Fetch one listing page and return its HTML as text.

    Args:
        url: Address of the listing. It may have no query string, end in
            ``?`` or ``&``, or already carry other query parameters.
        page: Value sent as the ``start`` query parameter.

    Returns:
        The response body decoded as UTF-8.

    Errors raised by ``urlopen`` or by the UTF-8 decode are not caught here
    and propagate to the caller.
    """
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36 Edg/93.0.961.38",
    }
    query = parse.urlencode({'start': page, 'filter': ''})

    # Pick the joiner so the parameters attach correctly whatever form the
    # caller's URL takes.
    if '?' not in url:
        separator = '?'
    elif url.endswith(('?', '&')):
        separator = ''
    else:
        separator = '&'

    # Supplying a request body makes urllib send a POST, and the fourth
    # positional argument of Request is origin_req_host rather than the HTTP
    # method. The parameters therefore travel in the URL and the method is
    # given by keyword.
    request = Request(url + separator + query, headers=header, method='GET')

    # The context manager closes the connection even if read/decode fails.
    with urlopen(request) as response:
        return response.read().decode('utf-8')
def read_data(html):
    """Extract the per-movie fields from one listing page.

    Every ``<div class="item">`` in the markup yields one row. The collected
    rows are printed and also returned.

    Args:
        html: Markup of a listing page.

    Returns:
        A list of rows, each of the form
        ``[title, image URL, detail URL, score text, vote text]``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    lists = []
    for item in soup.find_all('div', class_="item"):
        # select() returns a list of matches, which has neither a .text
        # attribute nor string keys; select_one() gives the first matching
        # tag, which is what the field lookups need.
        title = item.select_one('.title').text
        imgurl = item.img['src']
        moiveurl = item.select_one('.hd a')['href']
        moiveScore = item.find('span', class_="rating_num").text

        # NOTE(review): assumes the vote count is the last <span> inside
        # the .star container -- confirm against the live page markup.
        star_spans = item.select('.star span')
        movienum = star_spans[-1].text if star_spans else ''

        lists.append([title, imgurl, moiveurl, moiveScore, movienum])
    print(lists)
    return lists
def main():
    """Fetch each listing page in turn and hand its HTML to read_data."""
    url = 'https://movie.douban.com/top250?'
    # Ten pages, with the start offset advancing by 25 per page: 0, 25, ..., 225.
    for page in range(0, 250, 25):
        htm = read(url=url, page=page)
        read_data(htm)


# Start the crawl only when the file is executed as a script.
if __name__ == '__main__':
    main()
页:
[1]