aaron.yang
发表于 2020-8-2 18:59:24
朕想知道
反清复明君
发表于 2020-8-3 11:37:14
import requests
import bs4
import re
import openpyxl
def open_url(url, proxies=None, timeout=10):
    """Fetch *url* with a desktop-browser User-Agent and return the response.

    Args:
        url: Address to request.
        proxies: Optional mapping passed straight to ``requests.get``, e.g.
            ``{"http": "127.0.0.1:1080", "https": "127.0.0.1:1080"}``.
            ``None`` (the default) means no explicit proxy is configured.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked
        here; callers receive error pages as-is.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout (raised by ``requests.get``).
    """
    headers = {
        'user-agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/57.0.2987.98 Safari/537.36'
        ),
    }
    return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
def find_movies(res):
    """Extract one ``[title, rating, details]`` row per movie from a page.

    Args:
        res: Response object whose ``.text`` holds the listing page HTML.

    Returns:
        A list of three-element lists ``[title, rating, details]``, all
        strings. The three columns are collected independently and paired
        positionally; if the page yields columns of different lengths, rows
        are produced only up to the shortest column.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')

    # Movie titles: text of the first <span> inside the link of each div.hd.
    movies = [each.a.span.text for each in soup.find_all("div", class_="hd")]

    # Ratings: text of every span.rating_num.
    ranks = [each.text for each in soup.find_all("span", class_="rating_num")]

    # Details: two lines of the first <p> in each div.bd, stripped and joined.
    messages = []
    for each in soup.find_all("div", class_="bd"):
        try:
            lines = each.p.text.split('\n')
            # NOTE(review): the line indices were lost when this code was
            # pasted; 1 and 2 are a reconstruction (the text presumably starts
            # with a newline, making line 0 empty) - confirm against the page.
            messages.append(lines[1].strip() + lines[2].strip())
        except (AttributeError, IndexError):
            # div.bd without a <p> (each.p is None) or with too few lines:
            # not a movie entry, skip it.
            continue

    return [list(row) for row in zip(movies, ranks, messages)]
# Work out how many listing pages there are in total.
def find_depth(res):
    """Return the total page count read from the pagination bar.

    Args:
        res: Response object whose ``.text`` holds the listing page HTML.

    Returns:
        The page count as an ``int``.
    """
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    next_marker = soup.find('span', class_='next')
    # Step back two siblings from the "next" span; presumably the first hop
    # is a whitespace text node and the second is the last page-number link.
    last_page_node = next_marker.previous_sibling.previous_sibling
    return int(last_page_node.text)
def save_to_excel(result):
    """Write the scraped rows to ``豆瓣TOP250电影.xlsx`` under a header row.

    Args:
        result: Iterable of rows; each row is appended to the sheet as-is
            below the header.
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    # Header row: name, rating, details.
    header_cells = {
        'A1': "电影名称",
        'B1': "评分",
        'C1': "资料",
    }
    for cell_ref, title in header_cells.items():
        sheet[cell_ref] = title

    for row in result:
        sheet.append(row)

    workbook.save("豆瓣TOP250电影.xlsx")
def main():
    """Scrape every listing page and save the collected rows to Excel.

    Fetches the first page to learn the page count, then requests each page
    with a ``start`` offset that grows by 25 per page.
    """
    host = "https://movie.douban.com/top250"
    page_count = find_depth(open_url(host))

    result = []
    for page_index in range(page_count):
        page_url = "{}/?start={}".format(host, 25 * page_index)
        result.extend(find_movies(open_url(page_url)))

    # A disabled debugging snippet that dumped each row to test.txt
    # used to sit here; the Excel export below is the only output.
    save_to_excel(result)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
winnerfire
发表于 2020-8-5 04:49:12
朕想知道
不重要的用户1
发表于 2020-8-5 10:44:20
朕想知道
cwhsmile
发表于 2020-8-5 21:11:49
朕想知道
NowGofree
发表于 2020-8-6 13:08:58
朕想知道
zcrui9911
发表于 2020-8-7 16:17:24
朕想知道
阿傻
发表于 2020-8-8 10:56:30
朕想知道
hsy1
发表于 2020-8-9 18:15:54
朕想知道
crins
发表于 2020-8-10 13:28:24
朕想知道
python萌新宝宝
发表于 2020-8-10 15:24:45
朕想知道
lwhhlb
发表于 2020-8-10 20:40:29
朕想知道
hjg
发表于 2020-8-13 15:13:10
朕想知道
I.T.123
发表于 2020-8-13 15:38:25
朕想知道
lijianbo7909
发表于 2020-8-14 21:38:02
想学习!
lznanjing
发表于 2020-8-15 13:26:40
真想知道
冷不防
发表于 2020-8-16 16:08:46
朕想知道
mobius_liu
发表于 2020-8-20 13:33:43
朕想知道
uthgavin
发表于 2020-8-20 14:54:03
{:5_109:}
木子平方
发表于 2020-8-20 15:13:40
朕想知道