|
马上注册,结交更多好友,享用更多功能^_^
您需要 登录 才可以下载或查看,没有账号?立即注册
x
#pip install PyPDF2
import os
import re
from PyPDF2 import PdfReader
def _keyword_contexts(text, pattern):
    """Return one highlighted context per match of *pattern* in *text*.

    Each context is a list holding the line(s) that contain the match plus
    the line directly above and below, with every occurrence of the pattern
    wrapped in ANSI red escape codes.
    """
    def paint_red(match):
        return '\033[91m{}\033[0m'.format(match.group(0))

    lines = text.split('\n')
    contexts = []
    for match in pattern.finditer(text):
        # match.start()/match.end() are character offsets into the page
        # text; counting the newlines that precede them converts the
        # offsets into indices of the `lines` list.
        first_line = text.count('\n', 0, match.start())
        last_line = text.count('\n', 0, match.end())
        # Slicing past the end of a list is safe, so only the lower bound
        # needs clamping.
        window = lines[max(first_line - 1, 0):last_line + 2]
        contexts.append([pattern.sub(paint_red, line) for line in window])
    return contexts


def search_keywords_in_pdf_directory(directory, keyword, *, whole_word=True):
    """Search every PDF directly inside *directory* for *keyword*.

    The keyword is matched literally (regex metacharacters are escaped) and
    case-insensitively.

    Args:
        directory: Path of the directory to scan (not recursive).
        keyword: Literal text to look for. An empty keyword yields no
            results.
        whole_word: When true (the default), a hit must not be directly
            preceded or followed by a word character. Pass ``False`` for
            plain substring search, e.g. for CJK text, where neighbouring
            ideographs count as word characters.

    Returns:
        A list with one dict per match, each having the keys ``'file'``
        (path of the PDF), ``'page'`` (1-based page number) and
        ``'context'`` (list of lines around the match with the keyword
        highlighted in red). Files are processed in sorted path order.

    Raises:
        OSError: If *directory* cannot be listed or a PDF cannot be opened.
    """
    pdf_files = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        # Accept '.PDF' as well as '.pdf', and skip directories that merely
        # carry a PDF-looking name.
        if filename.lower().endswith('.pdf') and os.path.isfile(path):
            pdf_files.append(path)
    pdf_files.sort()

    results = []
    if not keyword:
        # An empty pattern would match at every position and produce noise.
        return results

    # Escape the keyword so user input is never interpreted as a regex, and
    # compile once instead of once per page. Lookarounds are used instead
    # of \b so that keywords starting or ending with a non-word character
    # (such as "C++") can still match as whole words.
    regex = re.escape(keyword)
    if whole_word:
        regex = r'(?<!\w)' + regex + r'(?!\w)'
    pattern = re.compile(regex, flags=re.IGNORECASE)

    for pdf_file in pdf_files:
        with open(pdf_file, 'rb') as file:
            pdf = PdfReader(file)
            # start=1 already yields human-friendly page numbers.
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text() or ''
                for context in _keyword_contexts(text, pattern):
                    results.append({
                        'file': pdf_file,
                        'page': page_num,
                        'context': context,
                    })
    return results
# Example usage: prompt for a keyword, search every PDF in the directory and
# print each hit with its file name, page number and surrounding lines.
directory_path = 'e:\\'  # change this to the directory that holds the PDFs
search_keyword = input('请输入要查询的内容')
search_results = search_keywords_in_pdf_directory(directory_path, search_keyword)
for result in search_results:
    print('文件名:', result['file'])
    print('页数:', result['page'])
    print('上下文:')
    for line in result['context']:
        print(line)
    # Blank line between consecutive results.
    print()