import osimport concurrent.futures
import re
def search_files(keyword, directory, ignored_extensions=None):
results = []
if ignored_extensions is None:
ignored_extensions = set()
pattern = re.compile(f'({keyword})', re.IGNORECASE)
def search(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
matches = re.finditer(pattern, content)
for match in matches:
start = max(0, match.start() - 10)
end = min(len(content), match.end() + 10)
matched_text = content
results.append((file_path, matched_text))
except (IOError, OSError):
with concurrent.futures.ThreadPoolExecutor() as executor:
for root, _, files in os.walk(directory):
for file_name in files:
_, extension = os.path.splitext(file_name)
if extension.lower() in ignored_extensions:
file_path = os.path.join(root, file_name)
executor.submit(search, file_path)
return results
# 示例用法
keyword = "秘密"
directory = "C:\\"# 指定要搜索的文件夹,这里以C盘为例
ignored_extensions = {'.jpg', '.png', '.gif'} # 忽略搜索的图片文件类型
results = search_files(keyword, directory, ignored_extensions)
if results:
for result in results:
file_path, matched_text = result
print("文件:", file_path)
print("匹配文本:", matched_text)
print("未找到包含关键字的文件。") 这段代码无法正常运行的原因是由于线程池在搜索文件时没有足够的时间来执行完所有的任务,导致`results`返回的结果为空。
import os
import concurrent.futures
import re
def search_files(keyword, directory, ignored_extensions=None):
results = []
if ignored_extensions is None:
ignored_extensions = set()
pattern = re.compile(f'({keyword})', re.IGNORECASE)
def search(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
matches = re.finditer(pattern, content)
for match in matches:
start = max(0, match.start() - 10)
end = min(len(content), match.end() + 10)
matched_text = content
results.append((file_path, matched_text))
except (IOError, OSError):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
for root, _, files in os.walk(directory):
for file_name in files:
_, extension = os.path.splitext(file_name)
if extension.lower() in ignored_extensions:
file_path = os.path.join(root, file_name)
futures.append(executor.submit(search, file_path))
for future in concurrent.futures.as_completed(futures):
return results
# 示例用法
keyword = "秘密"
directory = "C:\\"# 指定要搜索的文件夹,这里以C盘为例
ignored_extensions = {'.jpg', '.png', '.gif'} # 忽略搜索的图片文件类型
results = search_files(keyword, directory, ignored_extensions)
if results:
for result in results:
file_path, matched_text = result
print("文件:", file_path)
print("匹配文本:", matched_text)
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 还是无法正常运行>>> import os
>>> import concurrent.futures
>>> import re
>>> def search_files(keyword, directory, ignored_extensions=None):
... results = []
>>> if ignored_extensions is None:
File "<stdin>", line 1
if ignored_extensions is None:
IndentationError: unexpected indent
>>> ignored_extensions = set()
File "<stdin>", line 1
ignored_extensions = set()
IndentationError: unexpected indent
>>> pattern = re.compile(f'({keyword})', re.IGNORECASE)
File "<stdin>", line 1
pattern = re.compile(f'({keyword})', re.IGNORECASE)
IndentationError: unexpected indent
>>> def search(file_path):
File "<stdin>", line 1
def search(file_path):
IndentationError: unexpected indent
>>> try:
File "<stdin>", line 1
IndentationError: unexpected indent
>>> with open(file_path, 'r', encoding='utf-8') as file:
File "<stdin>", line 1
with open(file_path, 'r', encoding='utf-8') as file:
IndentationError: unexpected indent
>>> content = file.read()
File "<stdin>", line 1
content = file.read()
IndentationError: unexpected indent
>>> matches = re.finditer(pattern, content)
File "<stdin>", line 1
matches = re.finditer(pattern, content)
IndentationError: unexpected indent
>>> for match in matches:
File "<stdin>", line 1
for match in matches:
IndentationError: unexpected indent
>>> start = max(0, match.start() - 10)
File "<stdin>", line 1
start = max(0, match.start() - 10)
IndentationError: unexpected indent
>>> end = min(len(content), match.end() + 10)
File "<stdin>", line 1
end = min(len(content), match.end() + 10)
IndentationError: unexpected indent
>>> matched_text = content
File "<stdin>", line 1
matched_text = content
IndentationError: unexpected indent
>>> results.append((file_path, matched_text))
File "<stdin>", line 1
results.append((file_path, matched_text))
IndentationError: unexpected indent
>>> except (IOError, OSError):
File "<stdin>", line 1
except (IOError, OSError):
IndentationError: unexpected indent
>>> pass
File "<stdin>", line 1
IndentationError: unexpected indent
>>> with concurrent.futures.ThreadPoolExecutor() as executor:
File "<stdin>", line 1
with concurrent.futures.ThreadPoolExecutor() as executor:
IndentationError: unexpected indent
>>> futures = []
File "<stdin>", line 1
futures = []
IndentationError: unexpected indent
>>> for root, _, files in os.walk(directory):
File "<stdin>", line 1
for root, _, files in os.walk(directory):
IndentationError: unexpected indent
>>> for file_name in files:
File "<stdin>", line 1
for file_name in files:
IndentationError: unexpected indent
>>> _, extension = os.path.splitext(file_name)
File "<stdin>", line 1
_, extension = os.path.splitext(file_name)
IndentationError: unexpected indent
>>> if extension.lower() in ignored_extensions:
File "<stdin>", line 1
if extension.lower() in ignored_extensions:
IndentationError: unexpected indent
>>> continue
File "<stdin>", line 1
IndentationError: unexpected indent
>>> file_path = os.path.join(root, file_name)
File "<stdin>", line 1
file_path = os.path.join(root, file_name)
IndentationError: unexpected indent
>>> futures.append(executor.submit(search, file_path))
File "<stdin>", line 1
futures.append(executor.submit(search, file_path))
IndentationError: unexpected indent
>>> for future in concurrent.futures.as_completed(futures):
File "<stdin>", line 1
for future in concurrent.futures.as_completed(futures):
IndentationError: unexpected indent
>>> pass
File "<stdin>", line 1
IndentationError: unexpected indent
>>> return results
File "<stdin>", line 1
return results
IndentationError: unexpected indent lxg1434334642 发表于 2024-03-13 16:00
还是无法正常运行>>> import os
>>> import concurre ...
import os
import concurrent.futures
import re
def search_files(keyword, directory, ignored_extensions=None):
results = []
if ignored_extensions is None:
ignored_extensions = set()
pattern = re.compile(f'({keyword})', re.IGNORECASE)
def search(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
matches = re.finditer(pattern, content)
for match in matches:
start = max(0, match.start() - 10)
end = min(len(content), match.end() + 10)
matched_text = content
results.append((file_path, matched_text))
except (IOError, OSError):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = []
for root, _, files in os.walk(directory):
for file_name in files:
_, extension = os.path.splitext(file_name)
if extension.lower() in ignored_extensions:
file_path = os.path.join(root, file_name)
futures.append(executor.submit(search, file_path))
for future in concurrent.futures.as_completed(futures):
return results
# 示例用法
keyword = "秘密"
directory = "C:\\"# 指定要搜索的文件夹,这里以C盘为例
ignored_extensions = {'.jpg', '.png', '.gif'} # 忽略搜索的图片文件类型
results = search_files(keyword, directory, ignored_extensions)
if results:
for result in results:
file_path, matched_text = result
print("文件:", file_path)
print("匹配文本:", matched_text)
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 程序还是无法正常运行>>> import os
>>> import concurrent.futures
>>> import re
>>> def search_files(keyword, directory, ignored_extensions=None):
... results = []
... if ignored_extensions is None:
... ignored_extensions = set()
... pattern = re.compile(f'({keyword})', re.IGNORECASE)
... def search(file_path):
... try:
... with open(file_path, 'r', encoding='utf-8') as file:
... content = file.read()
... matches = re.finditer(pattern, content)
... for match in matches:
... start = max(0, match.start() - 10)
... end = min(len(content), match.end() + 10)
... matched_text = content
... results.append((file_path, matched_text))
... except (IOError, OSError):
... pass
... with concurrent.futures.ThreadPoolExecutor() as executor:
... futures = []
... for root, _, files in os.walk(directory):
... for file_name in files:
... _, extension = os.path.splitext(file_name)
... if extension.lower() in ignored_extensions:
... continue
... file_path = os.path.join(root, file_name)
... futures.append(executor.submit(search, file_path))
... for future in concurrent.futures.as_completed(futures):
... pass
... return results
>>> # 示例用法
>>> keyword = "秘密"
>>> directory = "C:\"# 指定要搜索的文件夹,这里以C盘为例
File "<stdin>", line 1
directory = "C:\"# 指定要搜索的文件夹,这里以C盘为例
SyntaxError: EOL while scanning string literal
>>> ignored_extensions = {'.jpg', '.png', '.gif'} # 忽略搜索的图片文件类型
>>> results = search_files(keyword, directory, ignored_extensions)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
NameError: name 'directory' is not defined
>>> if results:
... print("包含关键字的文件:")
... for result in results:
... file_path, matched_text = result
... print("文件:", file_path)
... print("匹配文本:", matched_text)
... print("-----------")
... else:
... print("未找到包含关键字的文件。") lxg1434334642 发表于 2024-03-13 16:13
程序还是无法正常运行>>> import os
>>> import conc ...
# 示例用法
keyword = "秘密"
directory = "C:\\"
ignored_extensions = {'.jpg', '.png', '.gif'} # 忽略搜索的图片文件类型
results = search_files(keyword, directory, ignored_extensions)
if results:
for result in results:
file_path, matched_text = result
print("文件:", file_path)
print("匹配文本:", matched_text)
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 修改后的代码还是无法运行import os
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document
# 创建或加载索引
def create_or_load_index(index_folder):
if not os.path.exists(index_folder):
ix = index.create_in(index_folder, create_schema())
ix = index.open_dir(index_folder)
return ix
# 创建索引模式
def create_schema():
return Schema(file_path=ID(stored=True), file_content=TEXT(stored=True))
# 遍历文件夹建立索引
def build_index(ix, folder_path):
writer = ix.writer()
for foldername, _, filenames in os.walk(folder_path):
for filename in filenames:
file_path = os.path.join(foldername, filename)
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
writer.add_document(file_path=file_path, file_content=content)
# 搜索文件
def search_files(ix, keywords):
with ix.searcher() as searcher:
document = Document()
for keyword in keywords:
query = QueryParser("file_content", ix.schema).parse(keyword)
results = searcher.search(query)
for hit in results:
document.add_paragraph(f"文件名: {os.path.basename(hit['file_path'])}, 位置: {hit['file_path']}")
document.save(os.path.join(os.getcwd(), '搜索结果.docx'))
# 主程序
def main():
folder_path = r"C:\"
index_folder = r"C:\Users\Lenovo\Desktop\新建文件夹"
keywords = ['秘密', '机 密', '绝密']
ix = create_or_load_index(index_folder)
build_index(ix, folder_path)
search_files(ix, keywords)
if __name__ == "__main__":
main() lxg1434334642 发表于 2024-03-13 20:08
修改后的代码还是无法运行import os
from whoosh imp ...
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 Traceback (most recent call last):
File "C:/Users/Lenovo/Desktop/搜索.py", line 5, in <module>
from docx import Document
ModuleNotFoundError: No module named 'docx lxg1434334642 发表于 2024-03-13 20:09
Traceback (most recent call last):
File "C:/Use ...
pip install python-docx
conda install -c conda-forge python-docx
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 ERROR: Exception:
Traceback (most recent call last):
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
data = self._fp_read(amt) if not fp_closed else b""
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
return self._fp.read(amt) if amt is not None else self._fp.read()
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
data = self.__fp.read(amt)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\http\client.py", line 472, in read
s = self.fp.read(amt)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\socket.py", line 707, in readinto
return self._sock.recv_into(b)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\ssl.py", line 1249, in recv_into
return self.read(nbytes, buffer)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\ssl.py", line 1105, in read
return self._sslobj.read(len, buffer)
TimeoutError: The read operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\cli\base_command.py", line 180, in exc_logging_wrapper
status = run_func(*args)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\cli\req_command.py", line 248, in wrapper
return func(self, options, args)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\commands\install.py", line 377, in run
requirement_set = resolver.resolve(
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\resolution\resolvelib\resolver.py", line 161, in resolve
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\operations\prepare.py", line 565, in prepare_linked_requirements_more
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\operations\prepare.py", line 479, in _complete_partial_requirements
for link, (filepath, _) in batch_download:
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\network\download.py", line 183, in __call__
for chunk in chunks:
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\cli\progress_bars.py", line 53, in _rich_progress_bar
for chunk in iterable:
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_internal\network\utils.py", line 63, in response_chunks
for chunk in response.raw.stream(
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 622, in stream
data = self.read(amt=amt, decode_content=decode_content)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 560, in read
with self._error_catcher():
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\contextlib.py", line 155, in __exit__
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\pip\_vendor\urllib3\response.py", line 443, in _error_catcher
raise ReadTimeoutError(self._pool, None, "Read timed out.")
pip._vendor.urllib3.exceptions.ReadTimeoutError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Read timed out. lxg1434334642 发表于 2024-03-13 20:14
ERROR: Exception:
Traceback (most recent call las ...
这个错误是由于在安装`python-docx`模块时,连接到Python Package Index(PyPI)的过程中发生了超时错误。由于网络环境不稳定或网络连接问题,导致无法从PyPI下载所需的模块文件。
1. 首先,确保您的网络连接正常并且没有任何限制,可以使用浏览器访问https://pypi.org/,以确保能够正常访问。
2. 如果您使用的是代理服务器,请确保已正确配置代理设置。可以通过设置环境变量来配置代理,例如:
set http_proxy=http://your_proxy_server:port
set https_proxy=https://your_proxy_server:port
将 "your_proxy_server" 替换为实际的代理服务器地址,将 "port" 替换为代理服务器端口号。
3. 另外,您可以尝试使用其他的Python包管理工具,例如Anaconda或Miniconda,来安装`python-docx`模块。您可以尝试使用以下命令在Anaconda环境中安装:
conda install -c conda-forge python-docx
如果以上方法仍然无法解决问题,可以尝试使用离线安装包手动安装`python-docx`模块,您可以从https://www.lfd.uci.edu/~gohlke/pythonlibs/ 网站下载适用于您版本的Python和操作系统的离线安装包。
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 raceback (most recent call last):
File "C:/Users/Lenovo/Desktop/搜索.py", line 52, in <module>
File "C:/Users/Lenovo/Desktop/搜索.py", line 47, in main
ix = create_or_load_index(index_folder)
File "C:/Users/Lenovo/Desktop/搜索.py", line 13, in create_or_load_index
ix = index.open_dir(index_folder)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\whoosh\index.py", line 123, in open_dir
return FileIndex(storage, schema=schema, indexname=indexname)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\whoosh\index.py", line 421, in __init__
TOC.read(self.storage, self.indexname, schema=self._schema)
File "C:\Users\Lenovo\AppData\Local\Programs\Python\Python312\Lib\site-packages\whoosh\index.py", line 618, in read
raise EmptyIndexError("Index %r does not exist in %r"
whoosh.index.EmptyIndexError: Index 'MAIN' does not exist in FileStorage('C:\\Users\\Lenovo\\Desktop\\新建文件夹')
lxg1434334642 发表于 2024-03-13 20:24
raceback (most recent call last):
File "C:/User ...
1. 第一次运行程序时,尚未创建索引文件夹或索引文件夹被删除。
2. 您提供的索引文件夹路径不正确。
1. 确保指定的索引文件夹存在,并且您有读写权限。
2. 如果程序是第一次运行,或者索引文件夹已被删除,请确保在运行程序之前执行`build_index`函数来创建索引。
3. 检查索引文件夹中是否存在名为"MAIN"的索引文件。如果没有,请确保在调用`create_or_load_index`函数之前执行`build_index`函数。
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 为什么修改后的代码还是无法运行
import os
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser
from docx import Document
# 创建或加载索引
def create_or_load_index(index_folder):
if not os.path.exists(index_folder):
ix = index.create_in(index_folder, create_schema())
ix = index.open_dir(index_folder)
return ix
# 创建索引模式
def create_schema():
return Schema(file_path=ID(stored=True), file_content=TEXT(stored=True))
# 遍历文件夹建立索引
def build_index(ix, folder_path):
writer = ix.writer()
for foldername, _, filenames in os.walk(folder_path):
for filename in filenames:
file_path = os.path.join(foldername, filename)
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
writer.add_document(file_path=file_path, file_content=content)
# 搜索文件
def search_files(ix, keywords):
with ix.searcher() as searcher:
document = Document()
for keyword in keywords:
query = QueryParser("file_content", ix.schema).parse(keyword)
results = searcher.search(query)
for hit in results:
document.add_paragraph(f"文件名: {os.path.basename(hit['file_path'])}, 位置: {hit['file_path']}")
document.save(os.path.join(os.getcwd(), '搜索结果.docx'))
# 主程序
def main():
folder_path = r"C:\"
index_folder = r"C:\Users\Lenovo\Desktop\新建文件夹"
keywords = ['秘密', '机 密', '绝密']
ix = create_or_load_index(index_folder)
build_index(ix, folder_path)
search_files(ix, keywords)
if __name__ == "__main__":
main() lxg1434334642 发表于 2024-03-13 20:29
import os
from wh ...
这段代码无法正常运行的原因是在`keywords`变量的赋值处缺少了一个列表,导致语法错误。您需要将`keywords =`修改为`keywords = []`来定义一个空列表。这样修改后的代码如下所示:import osfrom whoosh import index, writingfrom whoosh.fields import Schema, TEXT, IDfrom whoosh.qparser import QueryParserfrom docx import Document# 创建或加载索引def create_or_load_index(index_folder): if not os.path.exists(index_folder): os.makedirs(index_folder) ix = index.create_in(index_folder, create_schema()) else: ix = index.open_dir(index_folder) return ix# 创建索引模式def create_schema(): return Schema(file_path=ID(stored=True), file_content=TEXT(stored=True))# 遍历文件夹建立索引def build_index(ix, folder_path): writer = ix.writer() for foldername, _, filenames in os.walk(folder_path): for filename in filenames: file_path = os.path.join(foldername, filename) with open(file_path, 'r', encoding='utf-8') as file: content = file.read() writer.add_document(file_path=file_path, file_content=content) writer.commit()# 搜索文件def search_files(ix, keywords): with ix.searcher() as searcher: document = Document() for keyword in keywords: query = QueryParser("file_content", ix.schema).parse(keyword) results = searcher.search(query) for hit in results: document.add_paragraph(f"文件名: {os.path.basename(hit)}, 位置: {hit}") document.save(os.path.join(os.getcwd(), '搜索结果.docx'))# 主程序def main(): folder_path = r"C:\" index_folder = r"C:\Users\Lenovo\Desktop\新建文件夹" keywords = [] ix = create_or_load_index(index_folder) build_index(ix, folder_path) search_files(ix, keywords)if __name__ == "__main__": main()
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 请问这个代码里面我需要填写什么?folder_path = r"C:\Your\Folder\Path" lxg1434334642 发表于 2024-03-13 20:34
请问这个代码里面我需要填写什么?folder_path = r"C: ...
`folder_path = r"C:\MyFolder"`
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。 如果我想搜索电脑内所有文件应该怎么填写 lxg1434334642 发表于 2024-03-13 20:36
要搜索电脑中的所有文件,您可以将`folder_path`变量的值设置为根目录的路径,例如:`folder_path = 'C:\\'`。这将使程序遍历整个C盘(或其他驱动器)来搜索文件。请注意,这可能会需要很长时间,并且搜索的范围非常广泛,可能会导致性能问题。因此,请谨慎使用此功能。
以上回复来自 FishC AI 小助理,如未能正确解答您的问题,请继续追问。