import requests
import bs4
import re
import sys
import threading
import time
from easygui import exceptionbox


# Return the number of the last page of the board
def return_last(res, method=1):
    try:
        soup = bs4.BeautifulSoup(res.text, "html.parser")

        if method == 1:
            try:
                # The "last page" link carries the page number in its text
                content = soup.find("a", class_="last")
                p = re.compile(r"\d")
                result = p.findall(content.text)
                num = int("".join(result))
                return num

            except AttributeError:
                # No "last" link on this board; fall back to the pager text
                return return_last(res, 2)

        else:
            content = soup.find("div", class_="pg")
            num = content.text.find("/")
            p = re.compile(r"\d")
            result = p.findall(content.text[num:])
            num = int("".join(result))

            return num
    except Exception:
        return False


# Write the collected threads to the output file
def write(content, num=1, end=False, url=""):
    file = open("posts.txt", "a", encoding="utf-8")
    prefix = ""  # prepended to each link; left empty, so links are written as found
    # Write each thread title and its link
    for each in content:
        file.write(str(num) + ". " + each.text + " ————> " + prefix + each["href"] + "\n\n")
        num += 1
    if not end:
        file.write("-" * 110 + "\n\n")
    file.close()
    return num


# Find the thread title links on one page
def find_data(res):
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    content = soup.find_all(class_="s xst")
    return content


# Request a page
def open_url(url, cookie):
    headers = {"User-Agent": "Mozilla/5.0"}
    if cookie is not None:
        headers["cookie"] = cookie

    res = requests.post(url, headers=headers)

    return res


# Main routine
def main(url, page, interval, cookie):
    # Truncate any previous output
    with open("posts.txt", "w", encoding="utf-8"):
        pass
    if page == 0:
        res = open_url(url, cookie)
        page = return_last(res)
    subject_num = 1
    judge = False   # True on the last page, so the final separator is skipped
    method = 1      # which URL scheme is used for paging
    if "&" not in url:
        # Static-style URL such as forum-xx-1.html
        new_url = url.split("-")

        p = re.compile(r"\d")
        num = p.findall(new_url[2])  # current page number
        num = int("".join(num))

        new_url[2] = str(num) + ".html"
        new_url = "-".join(new_url)
        method = 1
    else:
        # Dynamic-style URL; pages are selected with a "&page=" parameter
        method = 2
        if "&page" not in url:
            base_url = url + "&page="
            num = 1
        else:
            num = int(url.split("=")[-1])
            # Cut the URL right after "&page=" so the page number can be replaced
            p = re.compile(r"&page=")
            index = p.search(url).end()
            base_url = url[:index]
        new_url = base_url + str(num)
    page += num - 1

    while num <= page:
        if num == page:
            judge = True
        content = ""
        while not content:
            # Retry until the page returns thread links
            res = open_url(new_url, cookie)
            content = find_data(res)

        subject_num = write(content, subject_num, judge, url)
        print(f"Page {num} crawled...")
        num += 1
        if method == 1:
            new_url = url.split("-")
            new_url[2] = str(num) + ".html"
            new_url = "-".join(new_url)
        else:
            # Rebuild from the base URL so page numbers are not appended onto each other
            new_url = base_url + str(num)
        time.sleep(interval)


if __name__ == "__main__":
    try:
        url = input("Please enter the board URL: ")
        page = int(input('Please enter the number of pages to crawl ("0" means all): '))
        interval = float(input("Please enter the crawl interval (seconds): "))

        cookie = None
        if input("Use authorization (y/n): ").lower() == "y":
            cookie = input("Please enter the cookie: ")
        main(url, page, interval, cookie)
        print("Done!")
        input()
    except SystemExit:
        pass

    except Exception:
        exceptionbox("Crawling failed. Please send the following error report "
                     "and your input to the admin:")
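For reference, a run looks roughly like this (the board address below is only a placeholder; the results are appended to posts.txt in the working directory, and the cookie, if you answer y, is whatever Cookie header your browser sends for the forum after logging in):

Please enter the board URL: https://example.com/forum-95-1.html
Please enter the number of pages to crawl ("0" means all): 3
Please enter the crawl interval (seconds): 2
Use authorization (y/n): n
Page 1 crawled...
Page 2 crawled...
Page 3 crawled...
Done!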
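One small note: the href collected by find_data() on a Discuz board is usually a relative path such as thread-12345-1-1.html, and write() stores it as found because prefix is left empty. If you would rather have absolute links in posts.txt, one possible tweak (not part of the script above; make_absolute is a hypothetical helper) is to resolve each href against the board URL with urllib.parse.urljoin:

from urllib.parse import urljoin

def make_absolute(board_url, href):
    # Hypothetical helper: resolve a (possibly relative) thread link
    # against the board URL before writing it out.
    return urljoin(board_url, href)

# e.g. make_absolute("https://example.com/forum-95-1.html", "thread-12345-1-1.html")
# -> "https://example.com/thread-12345-1-1.html"

Inside write(), prefix + each["href"] could then be replaced with make_absolute(url, each["href"]), since the board URL is already passed in as the url parameter.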