import os
import concurrent.futures
import re

def search_files(keyword, directory, ignored_extensions=None):
    """Search the text files under *directory* for *keyword*, ignoring case.

    Args:
        keyword: Text to look for. It is interpolated into a regular
            expression unchanged, so regex metacharacters keep their meaning.
        directory: Root folder; it is walked recursively with ``os.walk``.
        ignored_extensions: Optional collection of lowercase extensions,
            including the leading dot (e.g. ``'.jpg'``), whose files are
            skipped. ``None`` means nothing is skipped.

    Returns:
        A list of ``(file_path, snippet)`` tuples, one per match. *snippet*
        is the matched text plus up to 10 characters of context on each side.
        Files are scanned by worker threads, so the order across files is
        not guaranteed.
    """
    results = []

    if ignored_extensions is None:
        ignored_extensions = set()

    pattern = re.compile(f'({keyword})', re.IGNORECASE)

    def search(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (IOError, OSError, UnicodeDecodeError):
            # Best effort: unreadable, locked, or non-UTF-8 (binary) files
            # are skipped instead of aborting the whole search.
            return
        for match in pattern.finditer(content):
            start = max(0, match.start() - 10)
            end = min(len(content), match.end() + 10)
            results.append((file_path, content[start:end]))

    # Leaving the ``with`` block waits for every submitted task to finish,
    # so ``results`` is complete by the time it is returned.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for root, _, files in os.walk(directory):
            for file_name in files:
                _, extension = os.path.splitext(file_name)
                if extension.lower() in ignored_extensions:
                    continue
                file_path = os.path.join(root, file_name)
                executor.submit(search, file_path)

    return results

# Example usage
keyword = "秘密"
# Folder to search; the root of drive C is used as an example. The backslash
# must be doubled: "C:\" would escape the closing quote and never terminate.
directory = "C:\\"
ignored_extensions = {'.jpg', '.png', '.gif'}  # Image file types to skip

results = search_files(keyword, directory, ignored_extensions)

if results:
    for result in results:
        file_path, matched_text = result
        print("文件:", file_path)
        print("匹配文本:", matched_text)


import os
import concurrent.futures
import re

def search_files(keyword, directory, ignored_extensions=None):
    """Find case-insensitive occurrences of *keyword* in files below *directory*.

    Args:
        keyword: Text to look for. It is placed into a regular expression
            as-is, so regex metacharacters are interpreted.
        directory: Root folder, traversed recursively with ``os.walk``.
        ignored_extensions: Optional collection of lowercase extensions with
            the leading dot (e.g. ``'.png'``); matching files are skipped.

    Returns:
        A list of ``(file_path, snippet)`` tuples, one per match, where
        *snippet* is the match with up to 10 characters of context on each
        side. Files are read by worker threads, so ordering across files is
        not guaranteed.
    """
    results = []

    if ignored_extensions is None:
        ignored_extensions = set()

    pattern = re.compile(f'({keyword})', re.IGNORECASE)

    def search(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (IOError, OSError, UnicodeDecodeError):
            # Best effort: skip files that cannot be opened or are not
            # valid UTF-8 text (e.g. binaries).
            return
        for match in pattern.finditer(content):
            start = max(0, match.start() - 10)
            end = min(len(content), match.end() + 10)
            results.append((file_path, content[start:end]))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for root, _, files in os.walk(directory):
            for file_name in files:
                _, extension = os.path.splitext(file_name)
                if extension.lower() in ignored_extensions:
                    continue
                file_path = os.path.join(root, file_name)
                futures.append(executor.submit(search, file_path))
        # Block until every file has been scanned before returning results.
        concurrent.futures.wait(futures)

    return results

# Example usage
keyword = "秘密"
# Folder to search (drive C root as an example). The backslash is doubled
# because "C:\" escapes the closing quote and leaves the string unterminated.
directory = "C:\\"
ignored_extensions = {'.jpg', '.png', '.gif'}  # Image file types to skip

results = search_files(keyword, directory, ignored_extensions)

if results:
    for result in results:
        file_path, matched_text = result
        print("文件:", file_path)
        print("匹配文本:", matched_text)


 楼主| 发表于 2024-3-13 16:00:15
>>> import os
>>> import concurrent.futures
>>> import re
>>> def search_files(keyword, directory, ignored_extensions=None):
...     results = []
>>>     if ignored_extensions is None:
  File "<stdin>", line 1
    if ignored_extensions is None:
IndentationError: unexpected indent
>>>         ignored_extensions = set()
  File "<stdin>", line 1
    ignored_extensions = set()
IndentationError: unexpected indent
>>>     pattern = re.compile(f'({keyword})', re.IGNORECASE)
  File "<stdin>", line 1
    pattern = re.compile(f'({keyword})', re.IGNORECASE)
IndentationError: unexpected indent
>>>     def search(file_path):
  File "<stdin>", line 1
    def search(file_path):
IndentationError: unexpected indent
>>>         try:
  File "<stdin>", line 1
IndentationError: unexpected indent
>>>             with open(file_path, 'r', encoding='utf-8') as file:
  File "<stdin>", line 1
    with open(file_path, 'r', encoding='utf-8') as file:
IndentationError: unexpected indent
>>>                 content = file.read()
  File "<stdin>", line 1
    content = file.read()
IndentationError: unexpected indent
>>>                 matches = re.finditer(pattern, content)
  File "<stdin>", line 1
    matches = re.finditer(pattern, content)
IndentationError: unexpected indent
>>>                 for match in matches:
  File "<stdin>", line 1
    for match in matches:
IndentationError: unexpected indent
>>>                     start = max(0, match.start() - 10)
  File "<stdin>", line 1
    start = max(0, match.start() - 10)
IndentationError: unexpected indent
>>>                     end = min(len(content), match.end() + 10)
  File "<stdin>", line 1
    end = min(len(content), match.end() + 10)
IndentationError: unexpected indent
>>>                     matched_text = content[start:end]
  File "<stdin>", line 1
    matched_text = content[start:end]
IndentationError: unexpected indent
>>>                     results.append((file_path, matched_text))
  File "<stdin>", line 1
    results.append((file_path, matched_text))
IndentationError: unexpected indent
>>>         except (IOError, OSError):
  File "<stdin>", line 1
    except (IOError, OSError):
IndentationError: unexpected indent
>>>             pass
  File "<stdin>", line 1
IndentationError: unexpected indent
>>>     with concurrent.futures.ThreadPoolExecutor() as executor:
  File "<stdin>", line 1
    with concurrent.futures.ThreadPoolExecutor() as executor:
IndentationError: unexpected indent
>>>         futures = []
  File "<stdin>", line 1
    futures = []
IndentationError: unexpected indent
>>>         for root, _, files in os.walk(directory):
  File "<stdin>", line 1
    for root, _, files in os.walk(directory):
IndentationError: unexpected indent
>>>             for file_name in files:
  File "<stdin>", line 1
    for file_name in files:
IndentationError: unexpected indent
>>>                 _, extension = os.path.splitext(file_name)
  File "<stdin>", line 1
    _, extension = os.path.splitext(file_name)
IndentationError: unexpected indent
>>>                 if extension.lower() in ignored_extensions:
  File "<stdin>", line 1
    if extension.lower() in ignored_extensions:
IndentationError: unexpected indent
>>>                     continue
  File "<stdin>", line 1
IndentationError: unexpected indent
>>>                 file_path = os.path.join(root, file_name)
  File "<stdin>", line 1
    file_path = os.path.join(root, file_name)
IndentationError: unexpected indent
>>>                 futures.append(executor.submit(search, file_path))
  File "<stdin>", line 1
    futures.append(executor.submit(search, file_path))
IndentationError: unexpected indent
>>>         for future in concurrent.futures.as_completed(futures):
  File "<stdin>", line 1
    for future in concurrent.futures.as_completed(futures):
IndentationError: unexpected indent
>>>             pass
  File "<stdin>", line 1
IndentationError: unexpected indent
>>>     return results
  File "<stdin>", line 1
    return results
IndentationError: unexpected indent
lxg1434334642 发表于 2024-03-13 16:00
还是无法正常运行>>> import os
>>> import concurre ...

import os
import concurrent.futures
import re

def search_files(keyword, directory, ignored_extensions=None):
    """Scan files under *directory* and collect snippets containing *keyword*.

    Args:
        keyword: Text to look for, matched case-insensitively. It is inserted
            into a regular expression unchanged, so regex syntax applies.
        directory: Root folder, walked recursively with ``os.walk``.
        ignored_extensions: Optional collection of lowercase, dot-prefixed
            extensions (e.g. ``'.gif'``) to skip.

    Returns:
        A list of ``(file_path, snippet)`` tuples, one per match; *snippet*
        holds the match and up to 10 surrounding characters on each side.
        Ordering across files is not guaranteed because files are scanned
        by worker threads.
    """
    results = []
    if ignored_extensions is None:
        ignored_extensions = set()
    pattern = re.compile(f'({keyword})', re.IGNORECASE)

    def search(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (IOError, OSError, UnicodeDecodeError):
            # Best effort: ignore files that cannot be read as UTF-8 text.
            return
        for match in pattern.finditer(content):
            start = max(0, match.start() - 10)
            end = min(len(content), match.end() + 10)
            results.append((file_path, content[start:end]))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for root, _, files in os.walk(directory):
            for file_name in files:
                _, extension = os.path.splitext(file_name)
                if extension.lower() in ignored_extensions:
                    continue
                file_path = os.path.join(root, file_name)
                futures.append(executor.submit(search, file_path))
        for future in concurrent.futures.as_completed(futures):
            # Nothing to collect: workers append to ``results`` directly.
            # The loop only drains the futures so all scans have finished.
            pass
    return results

# Example usage
keyword = "秘密"
# Folder to search (drive C root as an example); the trailing backslash must
# be escaped, otherwise it swallows the closing quote of the literal.
directory = "C:\\"
ignored_extensions = {'.jpg', '.png', '.gif'}  # Image file types to skip
results = search_files(keyword, directory, ignored_extensions)

if results:
    for result in results:
        file_path, matched_text = result
        print("文件:", file_path)
        print("匹配文本:", matched_text)


 楼主| 发表于 2024-3-13 16:13:25
>>> import os
>>> import concurrent.futures
>>> import re
>>> def search_files(keyword, directory, ignored_extensions=None):
...     results = []
...     if ignored_extensions is None:
...         ignored_extensions = set()
...     pattern = re.compile(f'({keyword})', re.IGNORECASE)
...     def search(file_path):
...         try:
...             with open(file_path, 'r', encoding='utf-8') as file:
...                 content = file.read()
...                 matches = re.finditer(pattern, content)
...                 for match in matches:
...                     start = max(0, match.start() - 10)
...                     end = min(len(content), match.end() + 10)
...                     matched_text = content[start:end]
...                     results.append((file_path, matched_text))
...         except (IOError, OSError):
...             pass
...     with concurrent.futures.ThreadPoolExecutor() as executor:
...         futures = []
...         for root, _, files in os.walk(directory):
...             for file_name in files:
...                 _, extension = os.path.splitext(file_name)
...                 if extension.lower() in ignored_extensions:
...                     continue
...                 file_path = os.path.join(root, file_name)
...                 futures.append(executor.submit(search, file_path))
...         for future in concurrent.futures.as_completed(futures):
...             pass
...     return results
>>> # 示例用法
>>> keyword = "秘密"
>>> directory = "C:"  # 指定要搜索的文件夹,这里以C盘为例
  File "<stdin>", line 1
    directory = "C:"  # 指定要搜索的文件夹,这里以C盘为例
SyntaxError: EOL while scanning string literal
>>> ignored_extensions = {'.jpg', '.png', '.gif'} # 忽略搜索的图片文件类型
>>> results = search_files(keyword, directory, ignored_extensions)
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
NameError: name 'directory' is not defined
>>> if results:
...     print("包含关键字的文件:")
...     for result in results:
...         file_path, matched_text = result
...         print("文件:", file_path)
...         print("匹配文本:", matched_text)
...         print("-----------")
... else:
...     print("未找到包含关键字的文件。")
发表于 2024-3-13 16:14:02
lxg1434334642 发表于 2024-03-13 16:13
程序还是无法正常运行>>> import os
>>> import conc ...

# Example usage
keyword = "秘密"
# Escaped backslash: "C:\" is an unterminated string literal in Python.
directory = "C:\\"
ignored_extensions = {'.jpg', '.png', '.gif'}  # Image file types to skip

results = search_files(keyword, directory, ignored_extensions)
if results:
    for result in results:
        file_path, matched_text = result
        print("文件:", file_path)
        print("匹配文本:", matched_text)


 楼主| 发表于 2024-3-13 20:08:01
import os
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document

# 创建或加载索引
def create_or_load_index(index_folder):
    """Open the Whoosh index in *index_folder*, creating it when absent.

    Args:
        index_folder: Directory that holds (or will hold) the index files.
            It is created if it does not exist.

    Returns:
        The opened or newly created index object.
    """
    os.makedirs(index_folder, exist_ok=True)
    # Test for an actual index, not just the folder: calling open_dir on an
    # existing folder that holds no index raises EmptyIndexError.
    if index.exists_in(index_folder):
        return index.open_dir(index_folder)
    return index.create_in(index_folder, create_schema())

# 创建索引模式
def create_schema():
    """Build the index schema: a stored ``file_path`` ID field and a stored
    ``file_content`` TEXT field."""
    path_field = ID(stored=True)
    content_field = TEXT(stored=True)
    return Schema(file_path=path_field, file_content=content_field)

# 遍历文件夹建立索引
def build_index(ix, folder_path):
    """Index the text content of every readable file under *folder_path*.

    Args:
        ix: Index object providing ``writer()``; the writer must offer
            ``add_document``, ``commit`` and ``cancel``.
        folder_path: Root folder, walked recursively with ``os.walk``.

    Files that cannot be opened or are not valid UTF-8 are skipped. The
    writer is committed once at the end, and cancelled if indexing fails
    part-way, before the error is re-raised.
    """
    writer = ix.writer()
    try:
        for foldername, _, filenames in os.walk(folder_path):
            for filename in filenames:
                file_path = os.path.join(foldername, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                except (OSError, UnicodeDecodeError):
                    # Best effort: one locked or binary file must not abort
                    # the whole indexing run.
                    continue
                writer.add_document(file_path=file_path, file_content=content)
    except BaseException:
        writer.cancel()
        raise
    # Without a commit none of the added documents become searchable.
    writer.commit()

# 搜索文件
def search_files(ix, keywords):
    """Search the index for each keyword and write the hits to a Word file.

    Args:
        ix: Index to query; its ``file_content`` field is searched.
        keywords: Iterable of query strings, each parsed by ``QueryParser``.

    One paragraph per hit (file name and full path) is added to a new
    document, which is saved as ``搜索结果.docx`` in the current working
    directory.
    """
    # The parser does not depend on the keyword, so build it once.
    parser = QueryParser("file_content", ix.schema)
    document = Document()
    with ix.searcher() as searcher:
        for keyword in keywords:
            query = parser.parse(keyword)
            # limit=None returns every hit; the default limit keeps only
            # the top 10, which would silently drop matching files.
            for hit in searcher.search(query, limit=None):
                path = hit['file_path']
                document.add_paragraph(f"文件名: {os.path.basename(path)}, 位置: {path}")
    document.save(os.path.join(os.getcwd(), '搜索结果.docx'))

# 主程序
def main():
    """Build the index for drive C and export keyword hits to a Word file."""
    # "C:\\" is the drive root. A bare "C:" means the current directory on
    # drive C, and r"C:\" is not a valid literal (a raw string cannot end in
    # a single backslash).
    folder_path = "C:\\"
    index_folder = r"C:\Users\Lenovo\Desktop\新建文件夹"
    keywords = ['秘密', '机 密', '绝  密']
    ix = create_or_load_index(index_folder)
    build_index(ix, folder_path)
    search_files(ix, keywords)

# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
发表于 2024-3-13 20:08:15
lxg1434334642 发表于 2024-03-13 20:08
修改后的代码还是无法运行import os
from whoosh imp ...

from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document


 楼主| 发表于 2024-3-13 20:09:39
Traceback (most recent call last):
  File "C:/Users/Lenovo/Desktop/搜索.py", line 5, in <module>
    from docx import Document
ModuleNotFoundError: No module named 'docx'
发表于 2024-3-13 20:09:54
lxg1434334642 发表于 2024-03-13 20:09
Traceback (most recent call last):
  File "C:/Use ...

pip install python-docx
conda install -c conda-forge python-docx

 楼主| 发表于 2024-3-13 20:14:14
ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
    data = self.__fp.read(amt)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\http\client.py", line 472, in read
    s = self.fp.read(amt)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\socket.py", line 707, in readinto
    return self._sock.recv_into(b)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\ssl.py", line 1249, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\ssl.py", line 1105, in read
    return self._sslobj.read(len, buffer)
TimeoutError: The read operation timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\cli\base_command.py", line 180, in exc_logging_wrapper
    status = run_func(*args)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\cli\req_command.py", line 248, in wrapper
    return func(self, options, args)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\commands\install.py", line 377, in run
    requirement_set = resolver.resolve(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\resolution\resolvelib\resolver.py", line 161, in resolve
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\operations\prepare.py", line 565, in prepare_linked_requirements_more
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\operations\prepare.py", line 479, in _complete_partial_requirements
    for link, (filepath, _) in batch_download:
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\network\download.py", line 183, in __call__
    for chunk in chunks:
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\cli\progress_bars.py", line 53, in _rich_progress_bar
    for chunk in iterable:
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\network\utils.py", line 63, in response_chunks
    for chunk in response.raw.stream(
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 622, in stream
    data = self.read(amt=amt, decode_content=decode_content)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 560, in read
    with self._error_catcher():
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\contextlib.py", line 155, in __exit__
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 443, in _error_catcher
    raise ReadTimeoutError(self._pool, None, "Read timed out.")
pip._vendor.urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Read timed out.
发表于 2024-3-13 20:14:56
lxg1434334642 发表于 2024-03-13 20:14
ERROR: Exception:
Traceback (most recent call las ...

这个错误是由于在安装`python-docx`模块时,连接到Python Package Index(PyPI)的过程中发生了超时错误。由于网络环境不稳定或网络连接问题,导致无法从PyPI下载所需的模块文件。


1. 首先,确保您的网络连接正常并且没有任何限制,可以使用浏览器访问https://pypi.org/,以确保能够正常访问。

2. 如果您使用的是代理服务器,请确保已正确配置代理设置。可以通过设置环境变量来配置代理,例如:
set http_proxy=http://your_proxy_server:port
set https_proxy=https://your_proxy_server:port

将 "your_proxy_server" 替换为实际的代理服务器地址,将 "port" 替换为代理服务器端口号。

3. 另外,您可以尝试使用其他的Python包管理工具,例如Anaconda或Miniconda,来安装`python-docx`模块。您可以尝试使用以下命令在Anaconda环境中安装:
conda install -c conda-forge python-docx

如果以上方法仍然无法解决问题,可以尝试使用离线安装包手动安装`python-docx`模块,您可以从https://www.lfd.uci.edu/~gohlke/pythonlibs/ 网站下载适用于您版本的Python和操作系统的离线安装包。


 楼主| 发表于 2024-3-13 20:24:29
Traceback (most recent call last):
  File "C:/Users/Lenovo/Desktop/搜索.py", line 52, in <module>
  File "C:/Users/Lenovo/Desktop/搜索.py", line 47, in main
    ix = create_or_load_index(index_folder)
  File "C:/Users/Lenovo/Desktop/搜索.py", line 13, in create_or_load_index
    ix = index.open_dir(index_folder)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\whoosh\index.py", line 123, in open_dir
    return FileIndex(storage, schema=schema, indexname=indexname)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\whoosh\index.py", line 421, in __init__
    TOC.read(self.storage, self.indexname, schema=self._schema)
  File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\whoosh\index.py", line 618, in read
    raise EmptyIndexError("Index %r does not exist in %r"
whoosh.index.EmptyIndexError: Index 'MAIN' does not exist in FileStorage('C:\\Users\\Lenovo\\Desktop\\新建文件夹')
发表于 2024-3-13 20:24:52
lxg1434334642 发表于 2024-03-13 20:24
raceback (most recent call last):
  File "C:/User ...


1. 第一次运行程序时,尚未创建索引文件夹或索引文件夹被删除。
2. 您提供的索引文件夹路径不正确。


1. 确保指定的索引文件夹存在,并且您有读写权限。
2. 如果程序是第一次运行,或者索引文件夹已被删除,请确保在运行程序之前执行`build_index`函数来创建索引。
3. 检查索引文件夹中是否存在名为"MAIN"的索引文件。如果没有,请确保在调用`create_or_load_index`函数之前执行`build_index`函数。


 楼主| 发表于 2024-3-13 20:29:38
import os
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document

# 创建或加载索引
def create_or_load_index(index_folder):
    """Return the Whoosh index stored in *index_folder*, creating it if needed.

    Args:
        index_folder: Directory for the index files; created when missing.

    Returns:
        The existing index if one is present in the folder, otherwise a new
        index created with the schema from ``create_schema()``.
    """
    os.makedirs(index_folder, exist_ok=True)
    # An existing but empty folder has no index yet; open_dir would raise
    # EmptyIndexError there, so check for the index itself.
    if index.exists_in(index_folder):
        return index.open_dir(index_folder)
    return index.create_in(index_folder, create_schema())

# 创建索引模式
def create_schema():
    """Return the schema used by the index: ``file_path`` (stored ID) and
    ``file_content`` (stored TEXT)."""
    fields = {
        "file_path": ID(stored=True),
        "file_content": TEXT(stored=True),
    }
    return Schema(**fields)

# 遍历文件夹建立索引
def build_index(ix, folder_path):
    """Add every readable UTF-8 file below *folder_path* to the index.

    Args:
        ix: Index object providing ``writer()``; the writer must offer
            ``add_document``, ``commit`` and ``cancel``.
        folder_path: Root folder, traversed recursively with ``os.walk``.

    Files that cannot be opened or decoded as UTF-8 are skipped. All
    documents are committed in one batch at the end; if indexing fails
    part-way the writer is cancelled and the error is re-raised.
    """
    writer = ix.writer()
    try:
        for foldername, _, filenames in os.walk(folder_path):
            for filename in filenames:
                file_path = os.path.join(foldername, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as file:
                        content = file.read()
                except (OSError, UnicodeDecodeError):
                    # Best effort: skip locked, unreadable, or binary files.
                    continue
                writer.add_document(file_path=file_path, file_content=content)
    except BaseException:
        writer.cancel()
        raise
    # Added documents only become searchable after the commit.
    writer.commit()

# 搜索文件
def search_files(ix, keywords):
    """Query the index for every keyword and save the hits as a Word document.

    Args:
        ix: Index to query; the ``file_content`` field is searched.
        keywords: Iterable of query strings, each parsed by ``QueryParser``.

    Each hit adds one paragraph (file name and full path) to a new document
    saved as ``搜索结果.docx`` in the current working directory.
    """
    # Keyword-independent, so the parser is created once outside the loop.
    parser = QueryParser("file_content", ix.schema)
    document = Document()
    with ix.searcher() as searcher:
        for keyword in keywords:
            query = parser.parse(keyword)
            # limit=None: the default search limit would return only the
            # first 10 hits per keyword.
            for hit in searcher.search(query, limit=None):
                path = hit['file_path']
                document.add_paragraph(f"文件名: {os.path.basename(path)}, 位置: {path}")
    document.save(os.path.join(os.getcwd(), '搜索结果.docx'))

# 主程序
def main():
    """Index drive C, search it for the keywords, and export the hits."""
    # r"C:\" is a syntax error: a raw string cannot end in a single
    # backslash. Use an ordinary string with an escaped backslash instead.
    folder_path = "C:\\"
    index_folder = r"C:\Users\Lenovo\Desktop\新建文件夹"
    keywords = ['秘密', '机 密', '绝  密']
    ix = create_or_load_index(index_folder)
    build_index(ix, folder_path)
    search_files(ix, keywords)

# Run main() only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
发表于 2024-3-13 20:30:05
lxg1434334642 发表于 2024-03-13 20:29
import os
from wh ...

这段代码无法正常运行的原因是在`keywords`变量的赋值处缺少了一个列表,导致语法错误。您需要将`keywords =`修改为`keywords = []`来定义一个空列表。这样修改后的代码如下所示:

import os
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document

# 创建或加载索引
def create_or_load_index(index_folder):
    if not os.path.exists(index_folder):
        os.makedirs(index_folder)
        ix = index.create_in(index_folder, create_schema())
    else:
        ix = index.open_dir(index_folder)
    return ix

# 创建索引模式
def create_schema():
    return Schema(file_path=ID(stored=True), file_content=TEXT(stored=True))

# 遍历文件夹建立索引
def build_index(ix, folder_path):
    writer = ix.writer()
    for foldername, _, filenames in os.walk(folder_path):
        for filename in filenames:
            file_path = os.path.join(foldername, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                writer.add_document(file_path=file_path, file_content=content)
    writer.commit()

# 搜索文件
def search_files(ix, keywords):
    with ix.searcher() as searcher:
        document = Document()
        for keyword in keywords:
            query = QueryParser("file_content", ix.schema).parse(keyword)
            results = searcher.search(query)
            for hit in results:
                document.add_paragraph(f"文件名: {os.path.basename(hit)}, 位置: {hit}")
        document.save(os.path.join(os.getcwd(), '搜索结果.docx'))

# 主程序
def main():
    folder_path = r"C:\"
    index_folder = r"C:\Users\Lenovo\Desktop\新建文件夹"
    keywords = []
    ix = create_or_load_index(index_folder)
    build_index(ix, folder_path)
    search_files(ix, keywords)

if __name__ == "__main__":
    main()

 楼主| 发表于 2024-3-13 20:34:40
folder_path = r"C:\Your\Folder\Path"
发表于 2024-3-13 20:35:18
lxg1434334642 发表于 2024-03-13 20:34
请问这个代码里面我需要填写什么?folder_path = r"C: ...


`folder_path = r"C:\MyFolder"`



 楼主| 发表于 2024-3-13 20:36:56
发表于 2024-3-13 20:37:16
lxg1434334642 发表于 2024-03-13 20:36

要搜索电脑中的所有文件,您可以将`folder_path`变量的值设置为根目录的路径,例如:`folder_path = 'C:\\'`。这将使程序遍历整个C盘(或其他驱动器)来搜索文件。请注意,这可能会需要很长时间,并且搜索的范围非常广泛,可能会导致性能问题。因此,请谨慎使用此功能。

