双文档同步测试集划分
我所用的模型是对比学习模型,输入数据集的路径由两个 txt 文件给出:一个用于训练,一个用于评估。两个文件的生成代码如下:
代码1
import os
def file_paths_generator(folder_path):
    """Yield ``(index, relative_path)`` for every file below *folder_path*.

    Directories are visited in sorted order and files inside each directory
    are sorted by name, so the numbering is reproducible between runs.

    Args:
        folder_path: Root directory to scan recursively.

    Yields:
        Tuples ``(index, path)`` where ``index`` starts at 0 and ``path`` is
        the file's path relative to *folder_path*.
    """
    id_counter = 0
    for root, dirs, files in os.walk(folder_path):
        # Sorting ``dirs`` in place pins the order in which os.walk descends;
        # without it the traversal order depends on the file system.
        dirs.sort()
        for filename in sorted(files):
            file_path = os.path.join(root, filename)
            yield id_counter, os.path.relpath(file_path, folder_path)
            # NOTE(review): the pasted source lost its indentation, so the
            # nesting of this increment is an assumption (one id per file).
            # If ids are meant to label sub-folders, move it out one level.
            id_counter += 1
def generate_txt_file(folder_path, output_file):
    """Write one ``idNNNN <relative path>`` line per file under *folder_path*.

    Each line is also echoed to stdout. An existing *output_file* is
    overwritten.

    Args:
        folder_path: Root directory handed to ``file_paths_generator``.
        output_file: Path of the text file to create.
    """
    # Explicit UTF-8 so non-ASCII file names are written the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        # ``index`` rather than ``id`` to avoid shadowing the builtin.
        for index, path in file_paths_generator(folder_path):
            line = f"id{index:04d} {path}"
            f.write(line + "\n")
            print(line)
# Script entry for the training list: scan the data folder and write the
# id/path listing next to the script.
folder_path, output_file = '/home/data/pxy/ceshi2.3.1', 'data_list.txt'
generate_txt_file(folder_path, output_file)
代码2
import os
import random
def get_file_paths(folder_path):
    """Return ``<parent folder name>/<file name>`` for every file found.

    The tree below *folder_path* is walked recursively. Directories are
    visited in sorted order and files are sorted by name, so the returned
    list is reproducible between runs.

    Args:
        folder_path: Root directory to scan.

    Returns:
        A list of strings, each the name of the directory that directly
        contains the file joined with the file name.
    """
    file_paths = []
    for root, dirs, files in os.walk(folder_path):
        # Sorting ``dirs`` in place pins the order in which os.walk descends.
        dirs.sort()
        # Only the immediate parent's name is kept, not the full path.
        parent_directory = os.path.basename(root)
        for filename in sorted(files):
            file_paths.append(os.path.join(parent_directory, filename))
    return file_paths
def generate_txt_file(folder_path, output_file, pairs_per_file=4, seed=None):
    """Write positive/negative file pairs for evaluation to *output_file*.

    For every file returned by ``get_file_paths`` up to *pairs_per_file*
    rounds are written. Each round emits a positive line
    ``1 <file> <other file from the same folder>`` followed by a negative
    line ``0 <file> <file from a different folder>``.

    A positive line is skipped when the file is alone in its folder, and a
    negative line is skipped when every file lives in the same folder;
    sampling by rejection in those cases would never terminate.

    Args:
        folder_path: Root directory handed to ``get_file_paths``.
        output_file: Path of the text file to create (overwritten).
        pairs_per_file: Number of positive/negative rounds per file.
        seed: Optional seed for reproducible pairs. When ``None`` the
            module-level ``random`` generator is used.
    """
    file_paths = get_file_paths(folder_path)
    total = len(file_paths)
    rng = random if seed is None else random.Random(seed)

    # Index the files by folder once so positives can be drawn directly from
    # the anchor's own folder.
    indices_by_dir = {}
    for index, path in enumerate(file_paths):
        indices_by_dir.setdefault(os.path.dirname(path), []).append(index)

    with open(output_file, 'w', encoding='utf-8') as f:
        for i, anchor in enumerate(file_paths):
            anchor_dir = os.path.dirname(anchor)
            same_dir = indices_by_dir[anchor_dir]
            has_positive = len(same_dir) > 1
            has_negative = len(same_dir) < total

            for _ in range(pairs_per_file):
                if has_positive:
                    # Positive: a different file from the same folder.
                    j = rng.choice(same_dir)
                    while j == i:
                        j = rng.choice(same_dir)
                    f.write(f"1 {anchor} {file_paths[j]}\n")

                if has_negative:
                    # Negative: any file from another folder (which also
                    # rules out the anchor itself).
                    j = rng.randrange(total)
                    while os.path.dirname(file_paths[j]) == anchor_dir:
                        j = rng.randrange(total)
                    f.write(f"0 {anchor} {file_paths[j]}\n")

    # NOTE(review): the pasted source lost its indentation and subscripts;
    # printing the whole list once at the end is an assumption.
    print(file_paths)
# Script entry for the evaluation list: scan the data folder and write the
# labelled pair listing next to the script.
folder_path, output_file = '/home/data/pxy/ceshi2.3.3', 'val_list.txt'
generate_txt_file(folder_path, output_file)
现在我想要实现以下功能,需要求助:
1. 将两个代码合并为一个。我试过改函数名后直接合并,想看看有没有更好的办法。
2. 对数据集进行划分:80% 作为训练集,20% 作为测试集。(要求两份用于训练的 txt 文件中,训练集所选取的文件相同,测试集所选取的文件也相同。)
页:
[1]