爬取论坛 xx 版块的帖子的标题和链接

一个账号 · 发表于 2020-3-25 18:00:14

马上注册，结交更多好友，享用更多功能^_^

您需要登录才可以下载或查看，没有账号？立即注册

x

本帖最后由一个账号于 2020-3-30 18:37 编辑

import requests
import bs4
import re
import sys
import threading
import time
from easygui import exceptionbox
# 返回最后一页
def return_last(res, method=1):
    """Return the number of the last page found in a board listing response.

    Two lookups are supported:

    * method 1 reads the digits from the text of the ``<a class="last">``
      link; if that link is absent, the pager lookup below is used instead.
    * method 2 reads the digits that follow the first "/" in the text of
      the ``<div class="pg">`` element.

    Args:
        res: Response-like object exposing the page HTML as ``res.text``.
        method: 1 to try the "last" link first, anything else to go
            straight to the pager text.

    Returns:
        The page number as an int, or ``False`` when it cannot be determined
        (missing elements, no digits, unparsable response).
    """
    try:
        html = res.text
        soup = bs4.BeautifulSoup(html, "html.parser")
        if method == 1:
            last_link = soup.find("a", class_="last")
            if last_link is not None:
                return int("".join(re.findall(r"\d", last_link.text)))
            # No "last" link on this page: fall through to the pager text.
        pager_text = soup.find("div", class_="pg").text
        # Keep only what follows the "/" separator, then join its digits.
        after_slash = pager_text[pager_text.find("/"):]
        return int("".join(re.findall(r"\d", after_slash)))
    except Exception:
        # Deliberate best-effort: callers receive False on any parsing
        # failure. Only Exception is caught so that Ctrl-C and SystemExit
        # are no longer swallowed.
        return False
# 写入内容并保存
def write(content, num=1, end=False, url=""):
    """Append numbered "title ————> link" entries to ``帖子.txt``.

    Args:
        content: Iterable of items exposing ``.text`` (the title) and
            ``["href"]`` (the link).
        num: Number given to the first entry; it increases by one per entry.
        end: When false, a 110-character dashed separator line is written
            after the entries; when true, no separator is written.
        url: Accepted for call compatibility; not used by this function.

    Returns:
        The number that the next entry should receive.
    """
    # The context manager guarantees the file is closed even if an item
    # lacks ``text``/``href`` and the write raises part-way through.
    with open("帖子.txt", "a", encoding="utf-8") as file:
        for each in content:
            file.write(f"{num}. {each.text} ————> {each['href']}\n\n")
            num += 1
        if not end:
            file.write("-" * 110 + "\n\n")
    return num
# 寻找帖子
def find_data(res):
    """Parse a fetched page and return its elements with class ``"s xst"``.

    Args:
        res: Response-like object exposing the page HTML as ``res.text``.

    Returns:
        The result of BeautifulSoup's ``find_all(class_="s xst")`` on the
        parsed page (presumably the thread-title links; confirm against the
        target forum's markup).
    """
    parsed_page = bs4.BeautifulSoup(res.text, "html.parser")
    return parsed_page.find_all(class_="s xst")
# 打开链接
def open_url(url, cookie=None):
    """Request ``url`` with a browser-like User-Agent and return the response.

    Args:
        url: Address to request.
        cookie: Optional raw cookie header value. When ``None`` (the
            default) no cookie header is sent, so callers that have no
            cookie may omit the argument.

    Returns:
        The response object returned by ``requests.post``.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    if cookie is not None:
        headers["cookie"] = cookie
    # NOTE(review): the page is fetched with POST and without a timeout, as
    # in the original code; a stalled server will block this call forever.
    return requests.post(url, headers=headers)
# 主函数
def _split_page_url(url):
    """Split a board URL around its page number.

    Returns ``(prefix, suffix, start_page)`` such that
    ``prefix + str(n) + suffix`` is the address of page ``n``.
    """
    if "&" not in url:
        # Static style, e.g. ".../forum-173-1.html": the page number is the
        # last "-"-separated segment. Splitting from the right keeps URLs
        # with hyphens earlier in them (e.g. in the host name) working.
        head, dash, tail = url.rpartition("-")
        if not dash:
            raise ValueError(f"no page segment found in URL: {url!r}")
        start_page = int("".join(re.findall(r"\d", tail)))
        return head + "-", ".html", start_page
    if "&page" not in url:
        # Query style without an explicit page: start from page 1.
        return url + "&page=", "", 1
    # Query style with "&page=N": N is read from after the last "=", and
    # everything after "&page=" is dropped from the prefix.
    start_page = int(url.split("=")[-1])
    cut = re.search(r"&page=", url).end()
    return url[:cut], "", start_page


def main(url, page, interval, cookie):
    """Crawl consecutive board pages and save thread titles and links.

    The output file ``帖子.txt`` is truncated first; each crawled page is then
    appended through ``write`` and a progress line is printed.

    Args:
        url: Board address; the starting page number is taken from it.
        page: Number of pages to crawl starting at the URL's page, or 0 to
            crawl up to the last page reported by ``return_last``.
        interval: Seconds to sleep after each page and between retries.
        cookie: Raw cookie header value, or ``None`` for anonymous requests.
    """
    # Truncate the output file so every run starts from an empty file.
    with open("帖子.txt", "w", encoding="utf-8"):
        pass

    prefix, suffix, num = _split_page_url(url)

    if page == 0:
        last_page = return_last(open_url(url, cookie))
        if not last_page:
            # return_last gives False when no page count can be found; as
            # before, nothing is crawled in that case.
            return
    else:
        last_page = num + page - 1

    subject_num = 1
    while num <= last_page:
        # Build each address from the fixed prefix so page numbers are never
        # appended onto the previous page's address.
        page_url = prefix + str(num) + suffix
        content = find_data(open_url(page_url, cookie))
        while not content:
            # Keep retrying an empty page, but wait between attempts rather
            # than re-requesting in a tight loop.
            time.sleep(interval)
            content = find_data(open_url(page_url, cookie))
        # The separator line is omitted after the final page.
        subject_num = write(content, subject_num, num == last_page, url)
        print(f"以爬取第 {num} 页...")
        num += 1
        time.sleep(interval)
if __name__ == "__main__":
    # Interactive entry point: prompt for the crawl settings, run the crawl,
    # and show any failure in an easygui exception dialog.
    try:
        url = input("请输入板块地址：")
        page = int(input('请输入爬取的页数("0"表示全部)：'))
        # SECURITY: this used to be eval(input(...)), which executes whatever
        # is typed. float() accepts the same numeric inputs (e.g. "2", "0.5")
        # without running code; arithmetic expressions are no longer accepted.
        interval = float(input("请输入爬取间隔："))
        cookie = None
        if input("是否使用授权(y/n)：").lower() == "y":
            cookie = input("请输入 cookie：")
        main(url, page, interval, cookie)
        print("爬取完毕！")
        # Wait for Enter so a double-clicked console window stays open.
        input()
    except SystemExit:
        pass
    except Exception:
        # Only real errors are reported; Ctrl-C (KeyboardInterrupt) now ends
        # the program instead of opening the error dialog.
        exceptionbox("爬取失败，请将以下错误报告和您输入的内容反馈给管理员：")

复制代码

账号		自动登录	找回密码
密码			立即注册

爬取论坛 xx 版块的帖子的标题和链接

马上注册，结交更多好友，享用更多功能^_^

本帖被以下淘专辑推荐: