|

楼主 |
发表于 2023-7-18 21:38:11
|
显示全部楼层
import pytesseract
from pdf2image import convert_from_path
def pdf_to_txt(pdf_path, txt_path, *, lang='eng', poppler_path=None):
    """OCR every page of a PDF and write the recognized text to a file.

    The PDF is rendered to one image per page with pdf2image, each image
    is passed through Tesseract via pytesseract, and the text of every
    page is written to ``txt_path`` under a ``Page N:`` header.

    Args:
        pdf_path: Path of the PDF file to convert.
        txt_path: Path of the text file to create; it is opened in ``'w'``
            mode, so an existing file is overwritten.
        lang: Tesseract language code passed to ``image_to_string``.
            Defaults to ``'eng'``.
        poppler_path: Directory containing the poppler executables.
            ``None`` (the default) lets pdf2image look them up on PATH.

    Raises:
        pdf2image.exceptions.PDFInfoNotInstalledError: If pdf2image cannot
            launch poppler. Install poppler and add it to PATH, or pass
            its ``bin`` directory as ``poppler_path``.
    """
    # Render first: if poppler is missing this raises before the output
    # file is created, so a failed run does not leave an empty file behind.
    images = convert_from_path(pdf_path, poppler_path=poppler_path)

    # Explicit UTF-8: the locale default encoding (e.g. a legacy Windows
    # code page) may be unable to represent characters Tesseract returns.
    with open(txt_path, 'w', encoding='utf-8') as f:
        for page_number, image in enumerate(images, start=1):
            text = pytesseract.image_to_string(image, lang=lang)
            f.write(f'Page {page_number}:\n\n{text}\n\n')

    print(f'转换完成!文本文件保存在:{txt_path}')
# Example usage: ask for the input and output base names (without
# extension), resolve both against the root of drive F:, then convert.
pdf_path = 'f:\\{}.pdf'.format(input('请输入要转换的文件名'))
txt_path = 'f:\\{}.txt'.format(input('请输入要保存的文件名'))
pdf_to_txt(pdf_path, txt_path)
错误信息(运行上述脚本时的输出):
请输入要转换的文件名456
请输入要保存的文件名789
Traceback (most recent call last):
File "C:\Users\ssq\AppData\Roaming\Python\Python39\site-packages\pdf2image\pdf2image.py", line 568, in pdfinfo_from_path
proc = Popen(command, env=env, stdout=PIPE, stderr=PIPE)
File "C:\Program Files (x86)\Python39-32\lib\subprocess.py", line 951, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\Program Files (x86)\Python39-32\lib\subprocess.py", line 1420, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
FileNotFoundError: [WinError 2] 系统找不到指定的文件。
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Python\pdf2ocr.py", line 20, in <module>
pdf_to_txt(pdf_path, txt_path)
File "D:\Python\pdf2ocr.py", line 6, in pdf_to_txt
images = convert_from_path(pdf_path)
File "C:\Users\ssq\AppData\Roaming\Python\Python39\site-packages\pdf2image\pdf2image.py", line 127, in convert_from_path
page_count = pdfinfo_from_path(
File "C:\Users\ssq\AppData\Roaming\Python\Python39\site-packages\pdf2image\pdf2image.py", line 594, in pdfinfo_from_path
raise PDFInfoNotInstalledError(
pdf2image.exceptions.PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?
|
|