Of course! Splitting the code into functional layers is a good practice: it improves readability and maintainability and makes later extension and optimization much easier. Below is a simple example of a four-layer crawler / parser / store / scheduler architecture, implemented in Python.
import requests
from bs4 import BeautifulSoup
import sqlite3
import os
from urllib.parse import urljoin, urlparse
import time
from queue import Queue
from threading import Thread


# ==================== Crawler layer ====================
class Crawler:
    def fetch(self, url):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Fetch failed: {url}, error: {e}")
            return None


# ==================== Parser layer ====================
class Parser:
    def parse(self, html, base_url):
        if not html:
            return None
        soup = BeautifulSoup(html, 'html.parser')
        # Example: extract the page title and all links
        title = soup.title.string if soup.title else "Untitled"
        links = [urljoin(base_url, a.get('href')) for a in soup.find_all('a', href=True)]
        return {
            'title': title,
            'links': links
        }


# ==================== Store layer ====================
class Store:
    def __init__(self, db_path='data.db', storage_dir='pages'):
        self.db_path = db_path
        self.storage_dir = storage_dir
        os.makedirs(storage_dir, exist_ok=True)
        self.init_db()

    def init_db(self):
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS pages (
                url TEXT PRIMARY KEY,
                title TEXT,
                file_path TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')
        conn.commit()
        conn.close()

    def save_page(self, url, title, content):
        # Derive a file name from the URL
        parsed_url = urlparse(url)
        file_name = f"{parsed_url.netloc}{parsed_url.path}".replace('/', '_')
        if not file_name:
            file_name = 'index'
        file_path = os.path.join(self.storage_dir, file_name + '.html')

        # Write the raw HTML to disk
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

        # Record the metadata in SQLite
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute('''
            INSERT OR REPLACE INTO pages (url, title, file_path)
            VALUES (?, ?, ?)
        ''', (url, title, file_path))
        conn.commit()
        conn.close()


# ==================== Scheduler layer ====================
class Scheduler:
    def __init__(self, max_workers=3):
        self.queue = Queue()
        self.visited = set()
        self.crawler = Crawler()
        self.parser = Parser()
        self.store = Store()
        self.max_workers = max_workers

    def add_url(self, url):
        if url not in self.visited:
            self.queue.put(url)
            self.visited.add(url)

    def worker(self):
        while True:
            url = self.queue.get()
            if url is None:  # sentinel: shut this worker down
                break

            # 1. Fetch
            html = self.crawler.fetch(url)
            if not html:
                self.queue.task_done()
                continue

            # 2. Parse
            parsed_data = self.parser.parse(html, url)
            if parsed_data:
                # 3. Store
                self.store.save_page(url, parsed_data['title'], html)

                # 4. Enqueue newly discovered links
                for link in parsed_data['links']:
                    self.add_url(link)

            self.queue.task_done()
            time.sleep(1)  # politeness delay

    def start(self, start_url):
        self.add_url(start_url)
        threads = []
        for _ in range(self.max_workers):
            t = Thread(target=self.worker)
            t.start()
            threads.append(t)

        self.queue.join()

        # Stop the worker threads
        for _ in range(self.max_workers):
            self.queue.put(None)
        for t in threads:
            t.join()


# ==================== Usage example ====================
if __name__ == "__main__":
    scheduler = Scheduler()
    scheduler.start('https://example.com')
Notes:
- Crawler layer: only responsible for fetching the raw HTML over HTTP.
- Parser layer: parses the HTML and extracts the structured data you need (e.g. title and links).
- Store layer: writes pages to the file system and maintains an index of them in SQLite.
- Scheduler layer: handles task scheduling, URL deduplication, multi-threaded crawling, and coordination of the other components.
You can extend each layer as your needs grow, for example with proxy support, a retry mechanism, or more sophisticated parsing. Hopefully this example helps you organize your code better!
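As one concrete example of such an extension, retries with optional proxy support can live entirely inside the Crawler layer, leaving the other three layers untouched. The sketch below is only one possible way to do it, building on the Crawler class above; the retry count, back-off delay, and proxies dict are illustrative values rather than part of the original example.

import time
import requests

class RetryingCrawler(Crawler):
    """Crawler variant with simple retries and optional proxy support (illustrative sketch)."""

    def __init__(self, max_retries=3, backoff=2, proxies=None):
        # `proxies` uses the requests format, e.g. {'https': 'http://127.0.0.1:8080'}
        self.max_retries = max_retries
        self.backoff = backoff
        self.proxies = proxies

    def fetch(self, url):
        for attempt in range(1, self.max_retries + 1):
            try:
                response = requests.get(url, timeout=10, proxies=self.proxies)
                response.raise_for_status()
                return response.text
            except requests.RequestException as e:
                print(f"Fetch failed (attempt {attempt}/{self.max_retries}): {url}, error: {e}")
                time.sleep(self.backoff * attempt)  # simple linear back-off between attempts
        return None

To use it, the Scheduler would construct RetryingCrawler(...) instead of Crawler() in its __init__; since the other layers only ever call fetch(), nothing else needs to change, which is exactly the benefit of keeping the layers separate.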
The reply above comes from the FishC AI assistant; if it did not fully answer your question, please ask a follow-up.