|
|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
本帖最后由 Raddy 于 2026-1-15 08:32 编辑
- from pptx import Presentation
- from docx import Document
- from docx.shared import Pt, Inches, RGBColor
- from docx.oxml.shared import OxmlElement, qn
- import os
- import shutil
- import tempfile
- import re
- from PIL import Image # 需要安装Pillow库来处理TIFF格式
def clean_text(text):
    """Strip characters that cannot be stored in a Word (XML) document.

    Unlike a plain ``str.isprintable()`` filter, this keeps tabs, line breaks
    and non-ASCII spaces such as U+3000 (ideographic space), so multi-line
    text keeps its layout.

    :param text: input text; ``None`` or an empty string yields ``''``
    :return: the text with control characters, surrogates and the
        non-characters U+FFFE/U+FFFF removed
    """
    if not text:
        return ''

    # python-pptx reports a soft line break inside a paragraph as a vertical
    # tab; turn it into a real newline instead of dropping it.  Carriage
    # returns are folded into "\n" so a "\r\n" pair yields a single break.
    text = text.replace('\r\n', '\n').replace('\r', '\n').replace('\v', '\n')

    def is_xml_compatible(ch):
        code = ord(ch)
        if ch in '\t\n':
            return True
        if code < 0x20 or 0x7F <= code <= 0x9F:
            # C0/C1 control characters (including NUL and DEL).
            return False
        if 0xD800 <= code <= 0xDFFF or code in (0xFFFE, 0xFFFF):
            # Lone surrogates and non-characters are rejected by XML writers.
            return False
        return True

    return ''.join(ch for ch in text if is_xml_compatible(ch))
# Integer values of pptx.enum.shapes.MSO_SHAPE_TYPE, written as plain ints the
# way the original code did.  Equation-editor objects show up as embedded (7)
# or linked (10) OLE objects; 12 is OLE_CONTROL_OBJECT (ActiveX), which is what
# the original tested for and is kept only so nothing it matched is lost.
_OLE_SHAPE_TYPES = (7, 10, 12)
_PICTURE_SHAPE_TYPE = 13


def _shape_type(shape):
    """Return ``shape.shape_type``, or None for shapes that do not implement it."""
    try:
        return shape.shape_type
    except NotImplementedError:
        return None


def _shape_image(shape):
    """Return ``shape.image``, or None when the shape has no embedded image.

    ``hasattr`` is not enough here: the ``image`` property can exist and still
    raise ``ValueError`` (presumably for linked pictures without an embedded
    image - confirm against the python-pptx version in use).
    """
    try:
        return shape.image
    except (AttributeError, ValueError):
        return None


def _image_extension(image):
    """Return the lower-case extension of the image's filename, defaulting to 'png'."""
    filename = getattr(image, 'filename', None)
    ext = os.path.splitext(filename)[1][1:].lower() if filename else ''
    return ext or 'png'


def _insert_image(image, temp_dir, stem, doc, label):
    """Write *image* to *temp_dir*, convert TIFF to PNG and append it to *doc*.

    :param image: object exposing ``blob`` (bytes) and optionally ``filename``
    :param temp_dir: directory for the intermediate image file
    :param stem: file name without extension for the intermediate file
    :param doc: python-docx document that receives the picture
    :param label: prefix used in the TIFF failure message
    :return: True when a picture was appended, False when it was skipped
    """
    ext = _image_extension(image)
    img_path = os.path.join(temp_dir, f"{stem}.{ext}")
    with open(img_path, "wb") as f:
        f.write(image.blob)

    if ext in ('tiff', 'tif'):
        img_path = convert_tiff_to_png(img_path)
        if img_path is None:
            print(f" ❌ {label}TIFF转换失败,跳过")
            return False

    if not (os.path.exists(img_path) and os.path.getsize(img_path) > 0):
        return False

    doc.add_paragraph().add_run().add_picture(img_path, width=Inches(5))
    doc.add_paragraph()
    return True


def extract_ppt_to_docx(pptx_path, output_path="output.docx"):
    """Extract titles, text, formula objects and pictures from a PPTX into a DOCX.

    Progress and errors are reported with ``print``; the function returns
    ``None`` in every case.  An exception raised while building or saving the
    document propagates to the caller, but the temporary image directory is
    removed either way.

    :param pptx_path: path of the PPTX file to read
    :param output_path: path of the Word document to write
    """
    if not os.path.exists(pptx_path):
        print(f"❌ 错误:文件 {pptx_path} 不存在")
        return
    if not pptx_path.lower().endswith('.pptx'):
        print(f"❌ 错误:文件 {pptx_path} 不是PPTX格式")
        return
    try:
        prs = Presentation(pptx_path)
    except Exception as e:
        print(f"❌ 错误:文件 {pptx_path} 无法打开(可能是PPT格式不对或损坏)")
        print("提示:请确保是PPTX格式(Office 2007+),且文件路径正确")
        print(f"原因: {e}")
        return

    # Scratch directory for image files that python-docx reads back from disk.
    temp_dir = tempfile.mkdtemp()
    print(f"📁 临时目录:{temp_dir}")

    try:
        doc = Document()
        count_images = 0
        count_formulas = 0
        total_slides = len(prs.slides)

        for slide_num, slide in enumerate(prs.slides, 1):
            print(f"🔄 处理幻灯片 {slide_num}")

            # The title becomes a heading; remember its id so the shape loop
            # below does not emit the same text a second time as body text.
            title_shape = slide.shapes.title
            title_id = None
            if title_shape is not None:
                title_id = title_shape.shape_id
                title = clean_text(title_shape.text.strip())
                if title:
                    heading = doc.add_heading(title, level=1)
                    for run in heading.runs:
                        run.font.bold = True
                        run.font.color.rgb = RGBColor(0, 0, 128)

            # Keyword scan of the slide, evaluated at most once per slide.
            is_math_slide = None

            for shape_idx, shape in enumerate(slide.shapes):
                shape_type = _shape_type(shape)
                image = _shape_image(shape)
                print(f" 🔍 检查形状 {shape_idx}: 类型={shape_type}, 是否有图片={image is not None}")

                if title_id is not None and shape.shape_id == title_id:
                    continue

                # 1. Formula (OLE) objects are handled regardless of keywords.
                if shape_type in _OLE_SHAPE_TYPES:
                    print(f" 🧮 处理公式对象(不依赖关键词)")
                    count_formulas = process_formula_object(
                        shape, slide_num, temp_dir, doc, count_formulas)
                    continue

                # 2. Pictures are copied only from slides that look like math content.
                if shape_type == _PICTURE_SHAPE_TYPE and image is not None:
                    print(f" 🖼️ 检查图片形状")
                    if is_math_slide is None:
                        is_math_slide = is_math_problem_image(slide)
                    if is_math_slide:
                        count_images = process_image(
                            shape, slide_num, shape_idx, temp_dir, doc, count_images)
                    continue

                # 3. Plain text, with fraction spacing normalised.
                if shape.has_text_frame:
                    text = shape.text_frame.text.strip()
                    if not text:
                        continue
                    # Clean first so the XML writer never sees control characters.
                    final_text = process_mixed_fraction(clean_text(text))
                    if not final_text:
                        continue
                    p = doc.add_paragraph()
                    p.paragraph_format.space_after = Pt(0)
                    run = p.add_run(final_text)
                    run.font.size = Pt(11)

            # Blank line between slides, but not after the last one.
            if slide_num < total_slides:
                doc.add_paragraph()

        doc.save(output_path)
        print(f"✅ 提取完成!已保存到 {output_path}(共 {total_slides} 张幻灯片,包含 {count_images} 张图片,{count_formulas} 个公式)")
    finally:
        # Runs on success and on failure so the scratch directory never leaks.
        try:
            shutil.rmtree(temp_dir)
            print(f"🗑️ 临时目录已清理: {temp_dir}")
        except OSError as e:
            print(f"⚠️ 清理临时目录失败: {e}")


def process_formula_object(shape, slide_num, temp_dir, doc, count_formulas):
    """Insert a formula (OLE) object into *doc* as a picture or, failing that, as text.

    The preview image is tried first; if there is none, or it cannot be used,
    the shape's ``text`` / ``alternative_text`` attribute is inserted in
    italics.  Errors are printed, never raised.

    :param shape: the OLE shape from the slide
    :param slide_num: 1-based slide number, used in the temp file name
    :param temp_dir: directory for the intermediate image file
    :param doc: python-docx document being built
    :param count_formulas: number of formulas inserted so far
    :return: the updated formula count (an int argument cannot be updated in
        place, so the caller must use this return value)
    """
    try:
        image = _shape_image(shape)
        if image is not None and getattr(image, 'blob', None):
            stem = f"formula_{slide_num}_{shape.shape_id}"
            if _insert_image(image, temp_dir, stem, doc, "公式图片"):
                count_formulas += 1
                print(f" ✅ 公式图片已插入(计数: {count_formulas})")
                return count_formulas

        # No usable preview picture: fall back to whatever text the shape exposes.
        alt_text = getattr(shape, 'text', None) or getattr(shape, 'alternative_text', '')
        if alt_text:
            final_alt_text = clean_text(process_mixed_fraction(clean_text(alt_text)))
            print(f" 📝 处理公式替代文本: {final_alt_text}")

            p = doc.add_paragraph()
            p.paragraph_format.space_after = Pt(0)
            run = p.add_run(final_alt_text)
            run.font.size = Pt(11)
            run.font.italic = True
            count_formulas += 1
            print(f" ✅ 公式文本已插入(计数: {count_formulas})")
            return count_formulas

        print(" ⚠️ 公式对象没有可用的预览图片或替代文本,已跳过")
    except Exception as e:
        print(f" ❌ 处理公式对象时出错: {e}")
    return count_formulas


def process_image(shape, slide_num, shape_idx, temp_dir, doc, count_images):
    """Insert the picture held by *shape* into *doc*.

    Errors are printed, never raised.

    :param shape: picture shape exposing ``image``
    :param slide_num: 1-based slide number, used in the temp file name
    :param shape_idx: index of the shape on its slide, used in the temp file name
    :param temp_dir: directory for the intermediate image file
    :param doc: python-docx document being built
    :param count_images: number of pictures inserted so far
    :return: the updated picture count (the caller must use this return value)
    """
    try:
        stem = f"image_{slide_num}_{shape_idx}"
        if _insert_image(shape.image, temp_dir, stem, doc, "图片"):
            count_images += 1
            print(f" ✅ 图片已插入(计数: {count_images})")
    except Exception as e:
        print(f" ❌ 保存或插入图片时出错: {e}")
    return count_images
def process_mixed_fraction(text):
    """Normalise the spacing of mixed fractions, leaving plain fractions alone.

    A mixed fraction is an integer, horizontal whitespace, then ``n/d``; the
    whitespace is collapsed to a single space.  Plain fractions such as
    ``12/5`` or ``-3/4`` and slash-separated dates are returned unchanged.

    Examples: ``2   1/2`` -> ``2 1/2``, ``12/5`` -> ``12/5``, ``-3/4`` -> ``-3/4``

    :param text: input text
    :return: text with mixed-fraction spacing normalised
    """
    # Whitespace between the integer and the fraction is mandatory.  Making it
    # optional lets the regex engine split a multi-digit numerator ("12/5"
    # would become "1 2/5").  "[^\S\r\n]" is "whitespace except line breaks",
    # so a number ending one line is never joined to a fraction starting the
    # next.  The look-behind stops a match from starting inside a longer
    # number or right after another fraction's slash.
    pattern = r'(?<![\d/])(-?\d+)[^\S\r\n]+(\d+)/(\d+)'
    return re.sub(pattern, r'\1 \2/\3', text)
def convert_tiff_to_png(tiff_path):
    """Convert a TIFF image to PNG next to the original file.

    :param tiff_path: path of the TIFF file
    :return: path of the written PNG, or ``None`` if the conversion failed
        (the reason is printed)
    """
    # Modes the PNG writer can store directly.  Images with transparency
    # (RGBA, LA, palette) are kept as they are; flattening them to RGB would
    # discard the alpha channel.
    png_modes = ('1', 'L', 'LA', 'I', 'I;16', 'P', 'RGB', 'RGBA')
    try:
        # Swap only the real extension; a plain str.replace would also rewrite
        # a ".tif" occurring elsewhere in the path and miss upper-case ".TIF".
        png_path = os.path.splitext(tiff_path)[0] + '.png'

        with Image.open(tiff_path) as img:
            if img.mode not in png_modes:
                # e.g. CMYK, which PNG cannot represent.
                img = img.convert('RGB')
            img.save(png_path, 'PNG')
        return png_path
    except Exception as e:
        # Best-effort boundary: any decoding or I/O failure means "skip this image".
        print(f" ❌ TIFF转换失败: {e}")
        return None
def is_math_problem_image(slide):
    """Decide from a slide's text whether its pictures are likely math content.

    The slide counts as math content when the combined text of its shapes
    contains one of the keywords below, a fraction written as ``n/d``, or a
    Unicode vulgar-fraction character.

    :param slide: object whose ``shapes`` expose ``has_text_frame`` and
        ``text_frame.text``
    :return: True if any indicator is found, otherwise False
    """
    pieces = []
    for shape in slide.shapes:
        if not shape.has_text_frame:
            continue
        piece = shape.text_frame.text.strip()
        if piece:
            pieces.append(piece)
    slide_text = " ".join(pieces)

    math_keywords = [
        '例题', '例', '题', '公式', '计算', '解', '证明', '几何', '代数', '方程',
        '如图', '分数', '分子', '分母', '分式', '通分', '约分', '百分比', '小数', '根号'
    ]

    for keyword in math_keywords:
        if keyword in slide_text:
            print(f" 📌 检测到关键词 '{keyword}'")
            return True

    # "2 1/2" also contains "1/2", so one pattern covers mixed fractions too.
    if re.search(r'\d+/\d+', slide_text):
        print(f" 📌 检测到分数格式: {slide_text[:50]}...")
        return True

    # Vulgar fractions: U+00BC-U+00BE (¼ ½ ¾) and U+2150-U+215E (⅐ ... ⅞),
    # which includes ⅓ and ⅔.
    if re.search('[\u00bc-\u00be\u2150-\u215e]', slide_text):
        print(f" 📌 检测到分数符号: {slide_text[:50]}...")
        return True

    return False
def batch_convert_pptx_to_docx(input_dir, output_dir=None):
    """Convert every PPTX file in *input_dir* to a DOCX file.

    Office lock files (``~$name.pptx``) are ignored.  Files are processed in
    sorted order, and a failure on one file does not stop the others.
    Progress is reported with ``print``; the function returns ``None``.

    :param input_dir: directory containing the PPTX files
    :param output_dir: directory for the DOCX files; defaults to *input_dir*
        and is created if missing
    """
    # isdir rather than exists: a path to a regular file would otherwise
    # crash in os.listdir below.
    if not os.path.isdir(input_dir):
        print(f"❌ 错误:输入目录 {input_dir} 不存在或不是目录")
        return

    if output_dir is None:
        output_dir = input_dir
    os.makedirs(output_dir, exist_ok=True)

    pptx_files = sorted(
        name for name in os.listdir(input_dir)
        if name.lower().endswith('.pptx') and not name.startswith('~$')
    )

    if not pptx_files:
        print(f"❌ 错误:在目录 {input_dir} 中没有找到PPTX文件")
        return

    print(f"🔍 正在处理 {len(pptx_files)} 个PPTX文件...")

    converted = 0
    failed = []
    for pptx_file in pptx_files:
        input_path = os.path.join(input_dir, pptx_file)
        output_path = os.path.join(output_dir, os.path.splitext(pptx_file)[0] + ".docx")

        print(f"\n📄 处理: {pptx_file}")
        try:
            extract_ppt_to_docx(input_path, output_path)
        except Exception as e:
            # Keep going: one unreadable deck or locked output file must not
            # abort the rest of the batch.
            print(f"❌ 转换失败: {pptx_file}({e})")
            failed.append(pptx_file)
            continue

        # The converter reports problems by printing, so the presence of the
        # output file is the only success signal available here.
        if os.path.isfile(output_path):
            converted += 1
        else:
            failed.append(pptx_file)

    print(f"\n🎉 批量转换完成!共转换 {converted} 个文件")
    if failed:
        print(f"⚠️ 以下 {len(failed)} 个文件未能转换: {', '.join(failed)}")
# ========== Usage example ==========
# python script.py [input_dir [output_dir]]
if __name__ == "__main__":
    import sys

    # Fallback locations used when no command-line arguments are given.
    default_input_dir = r"C:/Users/Administrator/Desktop/(高分突破·课件)25秋数学人教9全通用版/ppt/1.课堂导学案/第二十七章 相似"
    default_output_dir = r"C:/Users/Administrator/Desktop/(高分突破·课件)25秋数学人教9全通用版/word/27章"

    args = sys.argv[1:]
    if not args:
        input_dir, output_dir = default_input_dir, default_output_dir
    elif len(args) == 1:
        # Only an input directory: DOCX files are written next to the PPTX files.
        input_dir, output_dir = args[0], None
    else:
        input_dir, output_dir = args[0], args[1]

    # Run the batch conversion.
    batch_convert_pptx_to_docx(input_dir, output_dir)
复制代码
|
|