爬取论坛 xx 版块的帖子的标题和链接,Python交流,编程语言专区,鱼C论坛

一个账号 发表于 2020-3-25 18:00:14

爬取论坛 xx 版块的帖子的标题和链接

本帖最后由一个账号于 2020-3-30 18:37 编辑

import requests
import bs4
import re
import sys
import threading
import time

from easygui import exceptionbox

def return_last(res, method=1):
    """Return the number of the last page of a board listing.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.
        method: ``1`` reads the digits of the pager's ``<a class="last">``
            link and falls back to method 2 when that link is missing.
            Any other value reads the page total from ``<div class="pg">``.

    Returns:
        The last page number as an ``int``, or ``False`` when no page
        number could be extracted.
    """
    try:
        soup = bs4.BeautifulSoup(res.text, "html.parser")

        if method == 1:
            last_link = soup.find("a", class_="last")
            if last_link is not None:
                return int("".join(re.findall(r"\d", last_link.text)))
            # No "last" link on this page: fall through to the pager text.

        pager = soup.find("div", class_="pg")
        pager_text = pager.text  # AttributeError when there is no pager

        # NOTE(review): presumably the pager text reads "... / <total> ...".
        # The previous code located the "/" but then joined *every* digit in
        # the pager, so the page links leaked into the result. Confirm the
        # exact pager markup against a live page.
        _, slash, after_slash = pager_text.partition("/")
        if slash:
            total = re.search(r"\d+", after_slash)
            if total is not None:
                return int(total.group())
            return False

        # No "/" at all: keep the old behaviour of joining all digits.
        return int("".join(re.findall(r"\d", pager_text)))

    except (AttributeError, ValueError):
        # Missing element (None has no .text) or no digits to convert.
        return False

def write(content, num=1, end=False, url=""):
    """Append numbered thread titles and links to ``帖子.txt``.

    Args:
        content: Iterable of elements exposing ``.text`` (the title) and
            ``["href"]`` (the link).
        num: Number given to the first entry; later entries count up.
        end: When true, no separator line is written after the entries.
        url: Unused; kept so existing callers keep working.

    Returns:
        The number the next entry should receive.
    """
    # The context manager closes the file even if an entry raises
    # (for example an element without an "href" attribute).
    with open("帖子.txt", "a", encoding="utf-8") as file:
        for each in content:
            file.write(f"{num}. {each.text}————>{each['href']}\n\n")
            num += 1

        if not end:
            file.write("-" * 110 + "\n\n")

    return num

def find_data(res):
    """Collect the thread entries from a board page.

    Args:
        res: Response object whose ``text`` attribute holds the page HTML.

    Returns:
        The result of ``find_all(class_="s xst")`` on the parsed page;
        empty when nothing matches.
    """
    return bs4.BeautifulSoup(res.text, "html.parser").find_all(class_="s xst")

def open_url(url, cookie=None):
    """Request ``url`` and return the response.

    Args:
        url: Address to request.
        cookie: Optional raw cookie string sent in the ``cookie`` header.
            Defaults to ``None`` (no cookie) so that one-argument calls work.

    Returns:
        The ``requests`` response object.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    if cookie is not None:
        headers["cookie"] = cookie

    # NOTE(review): this fetches a page with POST rather than GET, and sets no
    # timeout. Both are kept as-is to avoid changing what the server sees;
    # confirm whether GET plus a timeout would be more appropriate.
    return requests.post(url, headers=headers)

def _split_page_url(url):
    """Split a board URL into ``(prefix, suffix, start_page)``.

    ``prefix + str(n) + suffix`` is the URL of page ``n``.
    """
    if "&" not in url:
        # Static form "<prefix>-<page>.html": the page number leads the last
        # "-"-separated segment.
        head, dash, tail = url.rpartition("-")
        number = re.match(r"\d+", tail)
        if not dash or number is None:
            raise ValueError(f"cannot find a page number in URL: {url!r}")
        return head + dash, ".html", int(number.group())

    # Query form: the page number is the value of the "page" parameter.
    found = re.search(r"&page=(\d*)", url)
    if found is None:
        return url + "&page=", "", 1

    start_page = int(found.group(1)) if found.group(1) else 1
    return url[:found.start(1)], url[found.end(1):], start_page


def _fetch_threads(page_url, cookie, interval, max_retries):
    """Fetch one page, retrying a bounded number of times until it has threads."""
    for attempt in range(max_retries):
        if attempt:
            # Pause between retries instead of hammering the server.
            time.sleep(interval)

        content = find_data(open_url(page_url, cookie))
        if content:
            return content

    raise RuntimeError(
        f"no threads found at {page_url!r} after {max_retries} attempts"
    )


def main(url, page, interval, cookie, max_retries=5):
    """Crawl a board and save thread titles and links to ``帖子.txt``.

    Args:
        url: Board address; crawling starts at the page this URL points to.
        page: Number of pages to crawl; ``0`` means up to the last page.
        interval: Seconds to wait between page requests and between retries.
        cookie: Raw cookie string for authorised access, or ``None``.
        max_retries: Attempts per page before giving up.

    Raises:
        ValueError: If no page number can be located in ``url``.
        RuntimeError: If a page yields no threads after ``max_retries`` tries.
    """
    # Truncate the output file so results of earlier runs are discarded.
    with open("帖子.txt", "w", encoding="utf-8"):
        pass

    prefix, suffix, num = _split_page_url(url)

    if page == 0:
        # return_last() yields an absolute page number, or False when no
        # pager was found; in that case only the starting page is crawled.
        last_page = return_last(open_url(url, cookie)) or num
    else:
        last_page = num + page - 1

    subject_num = 1

    while num <= last_page:
        # No separator line is written after the final page.
        is_last = num == last_page

        # Rebuild the URL from its parts on every pass; appending to the
        # previous URL would accumulate digits ("...&page=23").
        page_url = f"{prefix}{num}{suffix}"
        content = _fetch_threads(page_url, cookie, interval, max_retries)

        subject_num = write(content, subject_num, is_last, url)

        print(f"以爬取第 {num} 页...")

        num += 1

        if not is_last:
            time.sleep(interval)

# Script entry point: read the crawl settings interactively, run the crawl,
# and show any failure in a dialog so the user can report it.
if __name__ == "__main__":
    try:
        url = input("请输入板块地址：")
        page = int(input('请输入爬取的页数("0"表示全部)：'))
        # float() replaces the former eval(): the interval only needs to be a
        # number, and eval() would execute whatever the user typed.
        interval = float(input("请输入爬取间隔："))

        cookie = None

        if input("是否使用授权(y/n)：").lower() == "y":
            cookie = input("请输入 cookie：")

        main(url, page, interval, cookie)

        print("爬取完毕！")
        input()  # keep the console window open until Enter is pressed

    except SystemExit:
        pass

    except Exception:
        # Catch Exception rather than everything, so Ctrl+C still interrupts.
        exceptionbox("爬取失败，请将以下错误报告和您输入的内容反馈给管理员：")

页: [1]

鱼C论坛's Archiver

爬取论坛 xx 版块的帖子的标题和链接