As the title says, I wrote a Python program that uses an LLM to proofread text. The code is below:
- """
- 文本自动校对
- 版本: 1.1
- 作者:shadowmage
- 创建日期: 2025年11月
- 配置文件示例 (config.ini):
- ==============
- [API]
- base_url = https://api.siliconflow.cn
- api_key = your_api_key_here
- model_id = deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
- chat_endpoint = /v1/chat/completions
- [Processing]
- max_chunk_size = 1500
- max_retries = 3
- backoff_factor = 1
- timeout = 600
- long_text_threshold = 1000
- [Paths]
- [Spacy]
- language_model = zh_core_web_sm
- fallback_model = en_core_web_sm
- 许可证: MIT License
- """
- import os
- import json
- import requests
- import difflib
- import re
- import spacy
- import configparser
- from pathlib import Path
- import time
- import random
- import datetime
- from concurrent.futures import ThreadPoolExecutor, as_completed
- import threading
- from requests.adapters import HTTPAdapter
- from urllib3.util.retry import Retry
- import csv
- from collections import defaultdict
- class OptimizedConfigManager:
- def __init__(self, config_path='config.ini'):
- self.config_path = Path(config_path)
- self.config = configparser.ConfigParser()
- self.load_config()
-
- def load_config(self):
- if not self.config_path.exists():
- raise FileNotFoundError(f"配置文件 {self.config_path} 不存在")
-
- self.config.read(self.config_path, encoding='utf-8')
-
- required_sections = ['API', 'Processing']
- for section in required_sections:
- if not self.config.has_section(section):
- raise ValueError(f"配置文件中缺少必要的 [{section}] 部分")
-
- def get_api_config(self):
- return {
- 'base_url': self.config.get('API', 'base_url', fallback='https://api.siliconflow.cn'),
- 'api_key': self.config.get('API', 'api_key'),
- 'model_id': self.config.get('API', 'model_id', fallback='deepseek-ai/DeepSeek-R1-0528-Qwen3-8B'),
- 'chat_endpoint': self.config.get('API', 'chat_endpoint', fallback='/v1/chat/completions'),
- 'batch_size': self.config.getint('API', 'batch_size', fallback=3),
- 'max_workers': self.config.getint('API', 'max_workers', fallback=5)
- }
-
- def get_processing_config(self):
- return {
- 'max_chunk_size': self.config.getint('Processing', 'max_chunk_size', fallback=1500),
- 'max_retries': self.config.getint('Processing', 'max_retries', fallback=3),
- 'backoff_factor': self.config.getfloat('Processing', 'backoff_factor', fallback=1),
- 'timeout': self.config.getint('Processing', 'timeout', fallback=600),
- 'long_text_threshold': self.config.getint('Processing', 'long_text_threshold', fallback=1000),
- 'enable_cache': self.config.getboolean('Processing', 'enable_cache', fallback=True),
- 'noun_dict_path': self.config.get('Processing', 'noun_dict_path', fallback='noun_correction_dict.csv')
- }
-
- def get_paths_config(self):
- return {
- 'source_suffix': self.config.get('Paths', 'source_suffix', fallback='_校对结果'),
- 'report_prefix': self.config.get('Paths', 'report_prefix', fallback='校对报告_')
- }
-
- def get_spacy_config(self):
- return {
- 'language_model': self.config.get('Spacy', 'language_model', fallback='zh_core_web_sm'),
- 'fallback_model': self.config.get('Spacy', 'fallback_model', fallback='en_core_web_sm')
- }
- class NounCorrectionManager:
- def __init__(self, dict_path='noun_correction_dict.csv'):
- self.dict_path = Path(dict_path)
- self.correction_dict = {}
- self.load_dictionary()
-
- def load_dictionary(self):
- if self.dict_path.exists():
- try:
- with open(self.dict_path, 'r', encoding='utf-8', newline='') as f:
- reader = csv.reader(f)
- for row in reader:
- if len(row) >= 2:
- original, corrected = row[0], row[1]
- self.correction_dict[original] = corrected
- print(f"已加载名词修正字典: {len(self.correction_dict)} 条记录")
- except Exception as e:
- print(f"加载名词修正字典失败: {e}")
-
- def save_dictionary(self):
- try:
- with open(self.dict_path, 'w', encoding='utf-8', newline='') as f:
- writer = csv.writer(f)
- for original, corrected in self.correction_dict.items():
- writer.writerow([original, corrected])
- print(f"名词修正字典已保存: {len(self.correction_dict)} 条记录")
- except Exception as e:
- print(f"保存名词修正字典失败: {e}")
-
- def add_correction(self, original, corrected):
- if original and corrected and original != corrected:
- self.correction_dict[original] = corrected
-
- def apply_corrections(self, text):
- if not self.correction_dict:
- return text
-
- for original, corrected in self.correction_dict.items():
- # \b does not match between CJK characters, so the old r'\b...\b' pattern silently
- # skipped Chinese terms; a plain substring replacement is what we want here
- text = text.replace(original, corrected)
- return text
- class OptimizedTextProofreader:
- def __init__(self, config_path='config.ini'):
- self._log_lock = threading.Lock()
- self.config_manager = OptimizedConfigManager(config_path)
- self.api_config = self.config_manager.get_api_config()
- self.processing_config = self.config_manager.get_processing_config()
- self.paths_config = self.config_manager.get_paths_config()
- self.spacy_config = self.config_manager.get_spacy_config()
-
- self.noun_manager = NounCorrectionManager(self.processing_config['noun_dict_path'])
- self.chat_endpoint = f"{self.api_config['base_url']}{self.api_config['chat_endpoint']}"
- self.text_cache = {} if self.processing_config['enable_cache'] else None
- self.session = self._create_session()
- self.nlp = self._initialize_spacy()
- self.noun_changes = defaultdict(list)
-
- self._log("优化版文本校对器初始化完成")
- self._log(f"API模型: {self.api_config['model_id']}")
- self._log(f"最大工作线程: {self.api_config['max_workers']}")
- self._log(f"批量大小: {self.api_config['batch_size']}")
- self._log(f"缓存启用: {self.processing_config['enable_cache']}")
- self._log(f"名词修正字典: {self.processing_config['noun_dict_path']}")
-
- def _create_session(self):
- session = requests.Session()
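- # Note: urllib3's Retry does not re-send POST requests by default, so this mainly
- # covers connection-level errors; the manual loop in call_siliconflow_api is what
- # actually retries failed chat requests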
- retry_strategy = Retry(
- total=self.processing_config['max_retries'],
- backoff_factor=self.processing_config['backoff_factor'],
- status_forcelist=[429, 500, 502, 503, 504],
- )
- adapter = HTTPAdapter(max_retries=retry_strategy, pool_connections=10, pool_maxsize=100)
- session.mount("http://", adapter)
- session.mount("https://", adapter)
-
- return session
-
- def _get_timestamp(self):
- return datetime.datetime.now().strftime("%Y/%m/%d-%H:%M:%S")
-
- def _log(self, message):
- timestamp = self._get_timestamp()
- with self._log_lock:
- print(f"{timestamp} {message}")
-
- def _initialize_spacy(self):
- try:
- nlp = spacy.load(self.spacy_config['language_model'])
- self._log(f"spaCy模型加载成功: {self.spacy_config['language_model']}")
- return nlp
- except OSError:
- self._log(f"警告: 未找到spaCy模型 {self.spacy_config['language_model']},尝试备用模型...")
- try:
- nlp = spacy.load(self.spacy_config['fallback_model'])
- self._log(f"使用spaCy备用模型: {self.spacy_config['fallback_model']}")
- return nlp
- except OSError:
- self._log("错误: 未找到任何spaCy模型,将使用基于标点的分句方案")
- return None
-
- def _get_cache_key(self, text):
- # Hash the full text: hashing only the first/last 100 characters can collide for
- # different sentences and would then return the wrong cached correction
- return hash(text)
-
- def sentence_segmentation(self, text):
- sentences = []
-
- if self.nlp:
- doc = self.nlp(text)
- for sent in doc.sents:
- sentence_text = sent.text.strip()
- if sentence_text:
- sentences.append(sentence_text)
- else:
- sentence_endings = r'([。!?!?]+)'
- parts = re.split(sentence_endings, text)
-
- current_sentence = ""
- for i, part in enumerate(parts):
- if i % 2 == 0: # 句子内容
- current_sentence = part.strip()
- else: # 句子结束标点
- if current_sentence:
- sentences.append(current_sentence + part)
- current_sentence = ""
-
- if current_sentence:
- sentences.append(current_sentence)
-
- return sentences
-
- def remove_duplicate_paragraphs(self, text):
- if not text.strip():
- return text
-
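- # Caution: only the first occurrence of any repeated paragraph is kept, so
- # repetition that is deliberate in the source text will also be removed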
- paragraphs = re.split(r'\n\s*\n', text)
- unique_paragraphs = []
- seen_paragraphs = set()
-
- for para in paragraphs:
- clean_para = re.sub(r'\s+', ' ', para.strip())
- if clean_para and clean_para not in seen_paragraphs:
- seen_paragraphs.add(clean_para)
- unique_paragraphs.append(para)
-
- if len(unique_paragraphs) < len(paragraphs):
- self._log(f"删除了 {len(paragraphs) - len(unique_paragraphs)} 个重复段落")
-
- return '\n\n'.join(unique_paragraphs)
-
- def batch_call_api(self, text_sentences):
- if not text_sentences:
- return []
-
- if self.text_cache is not None:
- cached_results = []
- uncached_sentences = []
- uncached_indices = []
-
- for i, sentence in enumerate(text_sentences):
- cache_key = self._get_cache_key(sentence)
- if cache_key in self.text_cache:
- cached_results.append((i, self.text_cache[cache_key]))
- else:
- uncached_sentences.append(sentence)
- uncached_indices.append(i)
-
- if cached_results:
- self._log(f"缓存命中: {len(cached_results)}/{len(text_sentences)} 个句子")
- else:
- uncached_sentences = text_sentences
- uncached_indices = list(range(len(text_sentences)))
- cached_results = []
-
- if uncached_sentences:
- results = self._parallel_process_sentences(uncached_sentences, uncached_indices)
- all_results = cached_results + results
- all_results.sort(key=lambda x: x[0])
- return [result[1] for result in all_results]
- else:
- cached_results.sort(key=lambda x: x[0])
- return [result[1] for result in cached_results]
-
- def _parallel_process_sentences(self, sentences, indices):
- results = []
-
- batch_size = self.api_config['batch_size']
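- # Note: batch_size is read here but never used below; every sentence is
- # submitted to the thread pool individually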
- max_workers = min(self.api_config['max_workers'], len(sentences))
-
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
- future_to_index = {
- executor.submit(self._safe_call_api, sentence): idx
- for sentence, idx in zip(sentences, indices)
- }
-
- completed = 0
- for future in as_completed(future_to_index):
- idx = future_to_index[future]
- try:
- result = future.result()
- results.append((idx, result))
- completed += 1
-
- if self.text_cache is not None and result is not None:
- cache_key = self._get_cache_key(sentences[indices.index(idx)])
- self.text_cache[cache_key] = result
-
- if completed % 10 == 0: # 每10个句子报告一次进度
- self._log(f"进度: {completed}/{len(sentences)} 个句子处理完成")
- except Exception as e:
- self._log(f"句子处理失败: {str(e)}")
- results.append((idx, None)) # 失败时返回None
-
- return results
-
- def _safe_call_api(self, text):
- try:
- return self.call_siliconflow_api(text)
- except Exception as e:
- self._log(f"API调用异常: {str(e)}")
- return None
-
- def _clean_markdown_annotations(self, text):
- text = re.sub(r'\{[^}]*?(?:句式修正|补全|修正|优化)[^}]*?\}', '', text)
- # Strip the Markdown markers but keep the wrapped words; deleting the whole match
- # (markers plus text) silently loses content from the corrected sentence
- text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)   # bold
- text = re.sub(r'\*(.*?)\*', r'\1', text)       # italic
- text = re.sub(r'`(.*?)`', r'\1', text)         # inline code
- text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)  # heading markers
-
- # Collapse runs of spaces/tabs only; keeping newlines preserves paragraph breaks
- text = re.sub(r'[ \t]+', ' ', text).strip()
-
- return text
-
- def _extract_noun_changes(self, original, corrected):
- if not original or not corrected or original == corrected:
- return
-
- if self.nlp:
- try:
- doc_orig = self.nlp(original)
- doc_corr = self.nlp(corrected)
-
- orig_nouns = [token.text for token in doc_orig if token.pos_ in ['NOUN', 'PROPN']]
- corr_nouns = [token.text for token in doc_corr if token.pos_ in ['NOUN', 'PROPN']]
-
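- # Caution: this pairs every noun that disappeared with every noun that appeared,
- # so unrelated words can be linked; those pairs go straight into the correction
- # dictionary and are auto-applied to later files, which can amplify bad edits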
- for orig_noun in orig_nouns:
- if orig_noun not in corr_nouns:
- # 在修正文本中查找可能对应的名词
- for corr_noun in corr_nouns:
- if corr_noun not in orig_nouns and len(corr_noun) > 1:
- self.noun_changes[orig_noun].append(corr_noun)
- self.noun_manager.add_correction(orig_noun, corr_noun)
- self._log(f"名词修改记录: '{orig_noun}' -> '{corr_noun}'")
- except Exception as e:
- self._log(f"名词分析失败: {e}")
-
- def call_siliconflow_api(self, text):
- headers = {
- "Authorization": f"Bearer {self.api_config['api_key']}",
- "Content-Type": "application/json"
- }
- system_prompt = """你是一位专业的小说文本校对专家。请严格按照以下要求进行校对:
- ## 校对要求
- 1. 只进行错别字修正、标点符号校正、语法错误修复
- 2. 保持原文意思和风格不变
- 3. 不要添加任何说明、注释、标记或解释
- 4. 绝对禁止使用 {句式修正}、{补全} 等任何格式的标记
- 5. 不要添加任何MD格式(如**加粗**、*斜体*、`代码`等)
- 6. 输出必须是纯净的校对后文本
- ## 输出格式
- 直接输出校对后的纯净文本,不要添加任何额外内容。"""
- messages = [
- {"role": "system", "content": system_prompt},
- {"role": "user", "content": f"请对以下文本进行专业校对,只修正语言错误,不要改变意思和风格,不要添加任何标记:\n\n{text}"}
- ]
- payload = {
- "model": self.api_config['model_id'],
- "messages": messages,
- "temperature": 0.1,
- "max_tokens": 4000,
- "top_p": 0.9,
- "stream": False
- }
- max_retries = self.processing_config['max_retries']
-
- for attempt in range(max_retries + 1):
- try:
- self._log(f"正在调用校对API,文本长度: {len(text)} 字符 (第{attempt + 1}次尝试)")
-
- response = self.session.post(
- self.chat_endpoint,
- headers=headers,
- json=payload,
- timeout=self.processing_config['timeout']
- )
-
- if response.status_code != 200:
- if attempt < max_retries:
- wait_time = random.randint(5, 30)
- time.sleep(wait_time)
- continue
- else:
- return None
-
- response.raise_for_status()
- result = response.json()
-
- corrected_text = result['choices'][0]['message']['content'].strip()
-
- corrected_text = self._clean_markdown_annotations(corrected_text)
-
- self._extract_noun_changes(text, corrected_text)
-
- return corrected_text
-
- except requests.exceptions.RequestException as e:
- if attempt < max_retries:
- wait_time = random.randint(5, 30)
- time.sleep(wait_time)
- continue
- else:
- return None
- except (KeyError, IndexError, json.JSONDecodeError) as e:
- return None
- def process_text_sentences(self, text):
- preprocessed_text = self.noun_manager.apply_corrections(text)
- if preprocessed_text != text:
- self._log("已应用名词修正字典进行预处理")
-
- sentences = self.sentence_segmentation(preprocessed_text)
- self._log(f"检测到句子数: {len(sentences)}")
-
- if len(sentences) == 0:
- return text
-
- start_time = time.time()
- corrected_sentences = self.batch_call_api(sentences)
- processing_time = time.time() - start_time
-
- self._log(f"句子处理完成,耗时: {processing_time:.2f}秒")
-
- final_sentences = []
- for original, corrected in zip(sentences, corrected_sentences):
- if corrected is None:
- final_sentences.append(original)
- else:
- final_sentences.append(corrected)
-
- # Join without spaces: Chinese text does not put spaces between sentences,
- # and ' '.join would sprinkle them throughout the output
- result_text = ''.join(final_sentences)
-
- result_text = self.remove_duplicate_paragraphs(result_text)
-
- return result_text
-
- def process_chapter(self, input_path, output_dir):
- try:
- with open(input_path, 'r', encoding='utf-8') as f:
- original_text = f.read()
- self._log(f"读取文件成功,长度: {len(original_text)} 字符")
- except Exception as e:
- self._log(f"文件读取失败: {str(e)}")
- return False
-
- start_time = time.time()
- corrected_text = self.process_text_sentences(original_text)
- processing_time = time.time() - start_time
-
- if corrected_text is None:
- self._log("处理失败,跳过此文件")
- return False
-
- self._log(f"文本处理完成,耗时: {processing_time:.2f}秒")
-
- if self.noun_changes:
- self.noun_manager.save_dictionary()
-
- output_path = output_dir / input_path.name
- try:
- with open(output_path, 'w', encoding='utf-8') as f:
- f.write(corrected_text)
- self._log(f"已保存: {output_path.name}")
- return True
- except Exception as e:
- self._log(f"文件保存失败: {str(e)}")
- return False
-
- def main(self):
- if self.nlp is None:
- self._log("警告: spaCy模型未正确加载,使用备用方案")
-
- book_name = input("请输入需要校对的书名: ").strip()
- if not book_name:
- print("错误: 书名不能为空")
- return
-
- source_dir = Path(book_name)
- if not source_dir.exists() or not source_dir.is_dir():
- print(f"错误: 找不到 '{book_name}' 文件夹")
- return
-
- output_dir = Path(f"{book_name}{self.paths_config['source_suffix']}")
-
- try:
- output_dir.mkdir(exist_ok=True)
- except Exception as e:
- print(f"创建输出目录失败: {str(e)}")
- return
-
- txt_files = list(source_dir.glob("*.txt"))
- if not txt_files:
- print(f"警告: 没有找到txt文件")
- return
-
- total_start = time.time()
- success_count = 0
- print(f"开始逐句处理 {len(txt_files)} 个文件...")
-
- for i, file_path in enumerate(txt_files, 1):
- print(f"\n[{i}/{len(txt_files)}] 处理: {file_path.name}")
- if self.process_chapter(file_path, output_dir):
- success_count += 1
-
- if self.noun_changes:
- self.noun_manager.save_dictionary()
- print(f"\n名词修正字典已更新: {len(self.noun_manager.correction_dict)} 条记录")
-
- total_time = time.time() - total_start
- print("\n" + "="*50)
- print(f"全部处理完成,总耗时: {total_time:.2f}秒")
- print(f"平均每个文件: {total_time/len(txt_files):.2f}秒")
- print(f"结果目录: {output_dir.resolve()}")
- print(f"成功处理: {success_count}/{len(txt_files)} 个文件")
- def main():
- try:
- proofreader = OptimizedTextProofreader('config.ini')
- proofreader.main()
- except FileNotFoundError as e:
- print(f"错误: {e}")
- print("请确保 config.ini 配置文件存在")
- except ValueError as e:
- print(f"配置错误: {e}")
- except Exception as e:
- print(f"程序初始化失败: {e}")
- if __name__ == "__main__":
- main()
So far the throughput still leaves something to be desired, and some passages get rewritten out of hand. Is that a limitation of the LLM itself, or is there a way to improve it?
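A couple of directions I'm considering, sketched below. Both are illustrative only: the helper names (accept_correction, pack_sentences) and the 0.7 threshold are placeholders, not part of the script above.

On the arbitrary rewrites: part of the risk is in the post-processing, since the auto-learned noun dictionary can pick up wrong pairs and apply them to every later file, and beyond that the model will occasionally rewrite freely no matter how strict the prompt is. A cheap guard is to compare each corrected sentence with the original and fall back to the original when the edit is too large; difflib is already imported by the script.

import difflib

def accept_correction(original: str, corrected: str, min_ratio: float = 0.7) -> str:
    # Keep the model's output only if it stays close to the original sentence;
    # proofreading should change a few characters, not rewrite the passage.
    if not corrected:
        return original
    ratio = difflib.SequenceMatcher(None, original, corrected).ratio()
    return corrected if ratio >= min_ratio else original

# usage inside process_text_sentences, when assembling final_sentences:
#     final_sentences.append(accept_correction(original, corrected))

On speed: the dominant cost is one API round-trip per sentence. The config already defines max_chunk_size, which the posted code reads but never uses; packing consecutive sentences into chunks of up to that many characters cuts the number of requests substantially. A rough sketch:

def pack_sentences(sentences, max_chunk_size=1500):
    # Group consecutive sentences into chunks no longer than max_chunk_size characters.
    chunks, current = [], ""
    for sent in sentences:
        if current and len(current) + len(sent) > max_chunk_size:
            chunks.append(current)
            current = sent
        else:
            current += sent
    if current:
        chunks.append(current)
    return chunks

# e.g. feed pack_sentences(sentences, self.processing_config['max_chunk_size'])
# to batch_call_api instead of the raw sentence list; keep an eye on max_tokens
# (currently 4000) so long chunks are not truncated in the response.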