这个要如何修改???
原批量PDF文件搜索关键字代码:
import os
from PyPDF2 import PdfReader
def search_keywords_in_pdf(directory, keyword):
    """Print every page of every PDF in *directory* whose text contains *keyword*.

    Only the entries returned by ``os.listdir(directory)`` are considered
    (no recursion into sub-folders). Files are selected by a ``.pdf``
    extension, compared case-insensitively so ``REPORT.PDF`` is not skipped.

    Args:
        directory: Folder holding the PDF files.
        keyword: Substring looked for in each page's extracted text.
    """
    for file in os.listdir(directory):
        if not file.lower().endswith('.pdf'):
            continue
        file_path = os.path.join(directory, file)
        with open(file_path, 'rb') as f:
            pdf = PdfReader(f)
            for page_num, page in enumerate(pdf.pages, start=1):
                # A page may yield no text at all; `or ''` keeps the
                # membership test safe if nothing (None/empty) comes back.
                # NOTE(review): exact return value depends on the PyPDF2
                # version in use -- confirm.
                text = page.extract_text() or ''
                if keyword in text:
                    print(f"found '{keyword}' in {file} (Page {page_num})")
# Directory to search and the keyword to look for (read interactively)
directory = r'e:\年报'
keyword = input('请输入要查询的内容')
search_keywords_in_pdf(directory, keyword)
因为PDF文件较多,现在要加入多进程多线程,该如何修改
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
# Upper bound on the number of threads (or processes) in the pool
max_workers = multiprocessing.cpu_count()
# Create the thread pool or process pool
with ThreadPoolExecutor(max_workers=max_workers) as executor:
你可以将原始代码修改为使用多线程来处理PDF文件的搜索任务。下面是修改后的代码示例:
import os
from PyPDF2 import PdfReader
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
def search_keywords_in_pdf(file_path):
    """Print each page of the PDF at *file_path* whose text contains the keyword.

    The search term is not a parameter: it is read from the module-level
    ``keyword`` variable, which must be assigned before this function runs.

    Args:
        file_path: Path of the PDF file to scan.

    Raises:
        OSError: If *file_path* cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(file_path, 'rb') as f:
        pdf = PdfReader(f)
        for page_num, page in enumerate(pdf.pages, start=1):
            # A page may yield no text; `or ''` keeps `in` from failing if
            # nothing (None/empty) comes back.
            # NOTE(review): depends on the PyPDF2 version -- confirm.
            text = page.extract_text() or ''
            if keyword in text:
                print(f"found '{keyword}' in {file_path} (Page {page_num})")
def search_files_in_directory(directory, keyword):
    """Scan every PDF in *directory* concurrently on a thread pool.

    Each matching file is handed to ``search_keywords_in_pdf`` as a separate
    task. Failures in individual tasks are reported after all tasks finish
    instead of being silently dropped with the discarded futures.

    Args:
        directory: Folder whose ``.pdf`` entries (case-insensitive) are scanned.
        keyword: Kept for interface compatibility; it is not forwarded, since
            ``search_keywords_in_pdf`` is submitted with the file path only.
    """
    pdf_paths = [
        os.path.join(directory, file)
        for file in os.listdir(directory)
        if file.lower().endswith('.pdf')
    ]
    if not pdf_paths:
        # Nothing to do; avoid spinning up a pool for an empty job list.
        return

    with ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = {
            executor.submit(search_keywords_in_pdf, path): path
            for path in pdf_paths
        }

    # Leaving the `with` block waits for every task, so all futures are done.
    for future, path in futures.items():
        error = future.exception()
        if error is not None:
            print(f"failed to search {path}: {error!r}")
# Directory to search and the keyword to look for (read interactively).
# `keyword` must stay a module-level name: search_keywords_in_pdf reads it
# as a global rather than receiving it as an argument.
directory = r'e:\年报'
keyword = input('请输入要查询的内容:')
search_files_in_directory(directory, keyword)
这里使用了`concurrent.futures.ThreadPoolExecutor`来创建一个线程池,并设置最大线程数为计算机的 CPU 核心数,以便同时处理多个文件的搜索任务。请根据需要调整`max_workers`的值。
每个线程会执行`search_keywords_in_pdf`函数来搜索指定文件中是否包含关键字。通过`executor.submit`方法提交任务给线程池执行。
注意:在多线程环境中,需要确保相关资源的线程安全性,例如`print`语句可能在打印时产生混乱的输出。如果需要保持输出的顺序,可以将结果保存在列表中,然后在任务执行完成后按顺序打印结果。
要将原代码添加多进程和多线程功能,可以按照以下方式修改:
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from PyPDF2 import PdfReader
def search_keywords_in_pdf(file_path, keyword):
    """Print each page of the PDF at *file_path* whose text contains *keyword*.

    Args:
        file_path: Path of the PDF file to scan.
        keyword: Substring looked for in each page's extracted text.

    Raises:
        OSError: If *file_path* cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(file_path, 'rb') as f:
        pdf = PdfReader(f)
        for page_num, page in enumerate(pdf.pages, start=1):
            # A page may yield no text; `or ''` keeps `in` from failing if
            # nothing (None/empty) comes back.
            # NOTE(review): depends on the PyPDF2 version -- confirm.
            text = page.extract_text() or ''
            if keyword in text:
                print(f"found '{keyword}' in {file_path} (Page {page_num})")
def _run_searches(pool, pdf_paths, keyword):
    """Submit one search task per path to *pool* and wait for all of them."""
    # Local import: the file only imports the two executor classes.
    from concurrent.futures import as_completed

    futures = [
        pool.submit(search_keywords_in_pdf, path, keyword)
        for path in pdf_paths
    ]
    for future in as_completed(futures):
        # result() re-raises anything the worker raised, so failures surface.
        future.result()


def search_files(directory, keyword, executor=None):
    """Search every PDF in *directory* for *keyword* using a worker pool.

    Args:
        directory: Folder whose ``.pdf`` entries (case-insensitive) are scanned.
        keyword: Substring passed to ``search_keywords_in_pdf`` for each file.
        executor: Optional ``concurrent.futures`` executor to submit tasks to.
            When omitted, a ``ProcessPoolExecutor`` is created for this call
            and shut down before returning.

    Raises:
        Exception: Whatever a worker raised, re-raised via ``Future.result()``.
    """
    pdf_paths = [
        os.path.join(directory, file)
        for file in os.listdir(directory)
        if file.lower().endswith('.pdf')
    ]
    if not pdf_paths:
        # Nothing to search; do not start any workers.
        return

    if executor is None:
        with ProcessPoolExecutor() as own_pool:
            _run_searches(own_pool, pdf_paths, keyword)
    else:
        _run_searches(executor, pdf_paths, keyword)


# The guard is required when a process pool is used: on platforms that start
# workers by re-importing the main module (e.g. Windows), unguarded top-level
# code -- including the input() prompt -- would run again in every worker.
if __name__ == "__main__":
    # Directory to search and the keyword to look for (read interactively).
    directory = r'e:\年报'
    keyword = input('请输入要查询的内容')
    # One process pool only. Nesting a second `with ... as executor` would
    # rebind the name and leave the first pool unused. The default worker
    # count is used, so `multiprocessing` does not need to be imported.
    with ProcessPoolExecutor() as executor:
        search_files(directory, keyword, executor)
多进程(ProcessPoolExecutor)可以利用多核处理器的并行能力,适合PDF文本提取这类CPU密集型任务;多线程(ThreadPoolExecutor)受全局解释器锁(GIL)限制,对这类任务的提速有限。注意不要把两个池绑定到同一个变量名`executor`上,否则后创建的池会覆盖先创建的池,任务只会提交给后者。在搜索多个PDF文件时,把任务提交给进程池才能真正提高效率。
请注意,多进程和多线程需要合理管理资源,如果PDF文件过多,可能会导致系统负载过重。可以根据实际情况调整max_workers参数来控制并发数。

陶远航 发表于 2023-7-11 21:21
要将原代码添加多进程和多线程功能,可以按照以下方式修改:
Traceback (most recent call last):
File "D:\Python\多进程多线程搜索.py", line 24, in <module>
with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
NameError: name 'multiprocessing' is not defined

陶远航 发表于 2023-7-11 21:21
要将原代码添加多进程和多线程功能,可以按照以下方式修改:
加了import multiprocessing后
Traceback (most recent call last):
File "D:\Python\多进程多线程搜索.py", line 28, in <module>
search_files(directory, keyword)
File "D:\Python\多进程多线程搜索.py", line 19, in search_files
for result in concurrent.futures.as_completed(results):
NameError: name 'concurrent' is not defined

sfqxx 发表于 2023-7-11 21:21
你可以将原始代码修改为使用多线程来处理PDF文件的搜索任务。下面是修改后的代码示例:
可能是现在的Python已经支持自动调度CPU,又或者是因为我是在PyCharm里运行的、PyCharm本身支持CPU调度:
测试了几次,搜索耗时都是修改之前的版本更快!
页:
[1]