import requests
import bs4


def open_url(keyword, page, each):  # ------------------------- fetch the search result pages
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
               "referer": "https://search.bilibili.com"}
    order = ["totalrank", "click", "pubdate", "dm", "stow"]
    res = []
    for i in range(page):
        params = {"keyword": keyword,
                  "order": order[each],
                  "duration": "4",   # duration filter: over 60 minutes
                  "tids_1": "36",    # category: technology
                  "page": f"{i+1}"}
        url = "https://search.bilibili.com/all"
        res.append(requests.get(url, params=params, headers=headers).text)
    return res


def get_data(res, page):  # ----------------------------------- extract the data
    title = []
    link = []
    msg = []
    for i in range(page):
        soup = bs4.BeautifulSoup(res[i], "html.parser")
        target = soup.find_all("li", class_="video-item matrix")
        title.extend([each.a["title"] for each in target])
        link.extend([each.a["href"] for each in target])
        target = soup.find_all("span", class_="so-icon")
        msg.extend([each.text.strip() for each in target])
    # each page lists 20 videos with 4 stat spans apiece, so split msg into sublists of 4
    msg = [msg[i:i+4] for i in range(0, page*80, 4)]
    return title, link, msg


def save_file(data, page, filename):  # ----------------------- save to file
    result = []
    for i in range(page*20):
        # data[1][i][2:-12] strips the leading "//" and the trailing query string (12 chars) from the link
        result.append(f"【{i+1}】 " + data[0][i] + " " + data[1][i][2:-12] + " " + " ".join(data[2][i]) + "\n\n")
    with open(f"E:\\{filename}.txt", "w", encoding="utf-8") as f:
        f.writelines(result)


def main():
    keyword = input("请输入关键词:")                    # prompt: enter the search keyword
    page = int(input("请输入需要爬取的页数(1~50):"))     # prompt: number of pages to crawl (1-50)
    # output file names, one per sort order: overall / most views / newest / most danmaku / most favourites
    name = ["综合排序", "最多点击", "最新发布", "最多弹幕", "最多收藏"]
    for each in range(len(name)):
        res = open_url(keyword, page, each)
        data = get_data(res, page)
        save_file(data, page, name[each])


if __name__ == "__main__":
    main()
=================================================================
I have a feeling my version consumes a bit more resources than 小甲鱼's.
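If the resource usage bothers you, two small changes usually help: reuse one requests.Session for all of the requests (so the connection is kept alive instead of re-opened for every page), and parse and write each page as soon as it arrives instead of holding every page's raw HTML in res. Below is a rough sketch of that idea with the same search parameters as above; crawl_pages is just an illustrative name and the User-Agent is shortened for brevity:

import requests
import bs4

def crawl_pages(keyword, pages, order="totalrank"):
    # A single Session reuses the underlying connection across requests.
    with requests.Session() as s:
        s.headers.update({"User-Agent": "Mozilla/5.0",
                          "referer": "https://search.bilibili.com"})
        for i in range(pages):
            params = {"keyword": keyword, "order": order,
                      "duration": "4", "tids_1": "36", "page": str(i + 1)}
            html = s.get("https://search.bilibili.com/all", params=params).text
            # Parse immediately and yield, so earlier pages' HTML can be freed.
            soup = bs4.BeautifulSoup(html, "html.parser")
            yield soup.find_all("li", class_="video-item matrix")

The caller can then append each page's titles and links to the output file inside the loop, so memory use stays roughly constant no matter how many pages are crawled.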