|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
import requests
import bs4
# Headers sent with every request: a desktop Firefox User-Agent plus a
# Douban Referer.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:61.0) Gecko/20100101 Firefox/61.0',
    'Referer': 'https://www.douban.com/',
}
# A single shared Session keeps the conversation (cookies) alive across
# the login request and the page requests that follow it.
session = requests.Session()
# Log in to obtain access.
def login(email='2257598360@qq.com', password='chenjinxin123'):
    """Post the Douban login form through the shared session.

    Args:
        email: Account e-mail, sent as the ``form_email`` field.
        password: Account password, sent as the ``form_password`` field.

    The response is not inspected, so a rejected login is not reported
    here; later requests are simply made through the same session.
    """
    # NOTE(review): the default credentials are hard-coded in source.
    # Callers should pass their own; consider moving the defaults out of
    # the file entirely.
    data = {
        'form_email': email,
        'form_password': password,
        'source': 'index_nav',
    }
    session.post(
        'https://www.douban.com/accounts/login',
        headers=headers,
        data=data,
    )
# Fetch a page.
def open_url(url):
    """Return the response of a GET for *url* made through the shared session.

    The module-level ``headers`` are sent with the request.
    """
    return session.get(url, headers=headers)
# Locate the short-review elements on a comments page.
def find_comments(res):
    """Return the short comments contained in *res*.

    Args:
        res: Response-like object whose ``text`` attribute holds the
            page HTML.

    Returns:
        A list with one string per ``<span class="short">`` element: the
        element's text with surrounding whitespace stripped and a single
        trailing newline appended. Empty when no such element exists.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    return [
        span.text.strip() + '\n'
        for span in soup.find_all('span', class_='short')
    ]
# Get the movie's id.
def get_name_id(host):
    """Return the movie id embedded in a Douban subject URL.

    The id is taken as the fifth ``/``-separated piece of *host*, which
    for ``https://movie.douban.com/subject/<id>/...`` is ``<id>``.

    Args:
        host: Full subject URL, including the scheme.

    Returns:
        The id as a string.

    Raises:
        IndexError: If *host* has fewer than five ``/``-separated pieces.
    """
    # str.split already yields str items, so no extra conversion is needed.
    return host.split('/')[4]
# Main crawl routine.
def main():
    """Log in, then save every page of short comments to a text file.

    Pages are requested 20 comments at a time. Crawling stops when a
    request yields a response that evaluates false, or when a page
    contains no comments at all. The number of pages saved is printed at
    the end.
    """
    login()
    movie_id = get_name_id(
        'https://movie.douban.com/subject/26322774/comments?sort=new_score&status=P'
    )
    source_host = 'https://movie.douban.com/subject/{}/comments?start='.format(movie_id)
    page = 0
    # Open the output once instead of rewriting the whole file per page.
    with open('逐梦演艺圈评论短评.txt', 'w', encoding='utf-8') as f:
        while True:
            url = source_host + str(page * 20) + '&limit=20&sort=new_score&status=P'
            res = open_url(url)
            # An empty page also ends the crawl; otherwise a successful
            # response without comments would keep the loop running forever.
            comments = find_comments(res) if res else []
            if not comments:
                print('已经爬取到最后一页结束!!')
                break
            f.writelines(comments)
            # Flush per page so progress is on disk if a later page fails.
            f.flush()
            page += 1
    print(page)
# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
在学习了小甲鱼的爬取豆瓣TOP250教程之后,我听他说豆瓣评分低的一定是烂片,
于是我看中了《逐梦演艺圈》。
果然,爬取到的结果是差评不断。
如果不登录,大概可以爬取10页;
登录之后能爬取多少页我就不清楚了,可以通过调整i的值来控制。
这里是使用session保持会话,当然这个不如妹子图来得舒服
|
-
|