"""Sort GMAI benchmark samples into per-department folders.

Each sample is a JSON file holding a multiple-choice question (``answer`` is
the letter of the correct option, ``options`` is a list such as
``"A. Melanoma"``).  The text of the correct option is looked up in a keyword
table that maps it to a department; when that department is one of
``departments`` the JSON file and the images it references are copied below
``destination_root/<department directory>/``, keeping their path relative to
``source_root``.
"""
import json
import os
import shutil

# Keyword table: one "keyword, department, task, modality" record per line.
keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'

# Root against which copied files are made relative.
source_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI'

# Departments whose samples are copied.
departments = [
    'Cardiovascular Surgery',
    'Dermatology',
    'Endocrinology',
    'Gastroenterology and Hepatology',
    'General Surgery',
    'Hematology',
    'Infectious Diseases',
    'Laboratory Medicine and Pathology',
    'Nephrology and Hypertension',
    'Neurosurgery',
    'Obstetrics and Gynecology',
    'Oncology (Medical)',
    'Ophthalmology',
    'Orthopedic Surgery',
    'Otolaryngology (ENT)/Head and Neck Surgery',
    'Pulmonary Medicine',
    'Sports Medicine',
    'Urology',
]

# Set form of the department list for fast membership tests.
departments_set = set(departments)

# Directories that are walked recursively for *.json samples.
source_dirs = [
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d',
]

# Base directory that receives the per-department copies.
destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'

# JSON keys whose values are image paths to copy along with the sample.
image_keys = ['img_mask_path', 'img_contour_path', 'img_bbox_path', 'img_path']


def get_department_dir_name(department):
    """Return the directory name used for *department*.

    Every department maps to itself except the ENT department, whose name is
    shortened (presumably because it contains a '/', which would otherwise be
    treated as a path separator when joined into the destination path).
    """
    if department == 'Otolaryngology (ENT)/Head and Neck Surgery':
        return 'Otolaryngology (ENT)'
    return department


def load_keyword_dict(path):
    """Read the keyword table at *path* into a dict keyed by keyword.

    Blank lines are ignored; lines that do not have exactly four
    comma-separated fields are reported and skipped.  Each value is a dict
    with the keys ``department``, ``task`` and ``modality``.
    """
    keyword_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            parts = [p.strip() for p in line.split(',')]
            if len(parts) != 4:
                print(f"格式错误,跳过此行:{line}")
                continue
            keyword, department, task, modality = parts
            keyword_dict[keyword] = {
                'department': department,
                'task': task,
                'modality': modality,
            }
    return keyword_dict


def parse_options(options, source_file_path):
    """Map option letters to option texts.

    An option is accepted when it is a string whose second character is a
    period, e.g. ``"A. Melanoma"`` or ``"A.Melanoma"``.  Anything else is
    reported (mentioning *source_file_path*) and left out of the result.
    """
    option_dict = {}
    for opt in options:
        if isinstance(opt, str) and len(opt) > 2 and opt[1] == '.':
            # Slice right after the period and strip, so the text survives
            # whether or not a space follows the period.
            option_dict[opt[0]] = opt[2:].strip()
        else:
            print(f"选项格式错误,文件:{source_file_path},选项:{opt}")
    return option_dict


def copy_with_parents(src, dst, created_label):
    """Copy *src* to *dst*, creating the parent directory of *dst* if needed.

    *created_label* is the message prefix printed when the directory had to
    be created.
    """
    dst_dir = os.path.dirname(dst)
    if not os.path.exists(dst_dir):
        # exist_ok guards against the directory appearing between the check
        # and the call.
        os.makedirs(dst_dir, exist_ok=True)
        print(f"{created_label}{dst_dir}")
    shutil.copy2(src, dst)


def process_json_file(source_file_path, source_dir, keyword_dict, dest_root,
                      src_root, stats):
    """Classify one sample and copy it, with its images, when it matches.

    *stats* is updated in place (``files_matched``, ``images_copied`` and
    ``department_file_counts``) so that work done before an exception is
    still counted.  Exceptions from reading, parsing or copying propagate to
    the caller.
    """
    with open(source_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    answer = data.get('answer', '')
    # A non-string answer (e.g. JSON null) is treated like a missing one.
    answer_letter = answer.strip() if isinstance(answer, str) else ''
    options = data.get('options', [])
    if not answer_letter or not options:
        print(f"文件缺少 'answer' 或 'options' 字段,跳过:{source_file_path}")
        return

    # The text of the correct option is the keyword to look up.
    keyword = parse_options(options, source_file_path).get(answer_letter)
    if not keyword:
        print(f"答案字母 '{answer_letter}' 在选项中未找到,文件:{source_file_path}")
        return

    print(f"处理文件:{source_file_path}")
    print(f"关键词:'{keyword}'")

    if keyword not in keyword_dict:
        print(f"关键词 '{keyword}' 不在关键词列表中。")
        return

    department = keyword_dict[keyword]['department']
    print(f"关键词 '{keyword}' 的科室为:'{department}'")
    if department not in departments_set:
        print(f"科室 '{department}' 不在处理列表中,不复制文件。")
        return

    destination_base = os.path.join(dest_root, get_department_dir_name(department))

    # Copy the JSON file, keeping its path relative to the source root.
    relative_path = os.path.relpath(source_file_path, src_root)
    destination_file_path = os.path.join(destination_base, relative_path)
    copy_with_parents(source_file_path, destination_file_path, "创建目录:")
    print(f"已复制文件到:{destination_file_path}")

    # Count only after the JSON copy succeeded, and update both counters
    # together so the per-department numbers always add up to the total.
    stats['files_matched'] += 1
    stats['department_file_counts'][department] += 1

    # Copy every referenced image that exists.
    for image_key in image_keys:
        if image_key not in data:
            continue
        # Image paths are relative to "<source_dir>/images".
        source_image_path = os.path.join(source_dir, 'images', data[image_key])
        if not os.path.exists(source_image_path):
            print(f"源图片不存在,跳过:{source_image_path}")
            continue
        # Relative to the source root, so the 'images' directory is kept.
        relative_image_path = os.path.relpath(source_image_path, src_root)
        destination_image_path = os.path.join(destination_base, relative_image_path)
        copy_with_parents(source_image_path, destination_image_path, "创建图片目录:")
        stats['images_copied'] += 1
        print(f"已复制图片到:{destination_image_path}")


def main(keyword_path=None, sources=None, dest_root=None, src_root=None):
    """Run the whole sorting job and return its counters.

    Every argument defaults to the corresponding module-level setting
    (``keyword_file``, ``source_dirs``, ``destination_root``,
    ``source_root``).

    Returns a dict with ``total_files_processed``, ``files_matched``,
    ``images_copied`` and ``department_file_counts``.
    """
    keyword_path = keyword_file if keyword_path is None else keyword_path
    sources = source_dirs if sources is None else sources
    dest_root = destination_root if dest_root is None else dest_root
    src_root = source_root if src_root is None else src_root

    keyword_dict = load_keyword_dict(keyword_path)
    print(f"总共加载了 {len(keyword_dict)} 个关键词。")

    # Counters for the summary printed at the end.
    stats = {
        'total_files_processed': 0,
        'files_matched': 0,
        'images_copied': 0,
        'department_file_counts': {dept: 0 for dept in departments},
    }

    for source_dir in sources:
        print(f"正在遍历目录:{source_dir}")
        for root, _dirs, files in os.walk(source_dir):
            for file in files:
                if not file.endswith('.json'):
                    continue
                stats['total_files_processed'] += 1
                source_file_path = os.path.join(root, file)
                try:
                    process_json_file(source_file_path, source_dir, keyword_dict,
                                      dest_root, src_root, stats)
                except Exception as e:
                    # Best effort: one unreadable or malformed sample must
                    # not abort the whole run.
                    print(f"处理文件 {source_file_path} 时发生错误:{e}")

    print(f"总共处理了 {stats['total_files_processed']} 个 JSON 文件。")
    print(f"总共匹配并复制了 {stats['files_matched']} 个 JSON 文件。")
    print(f"总共复制了 {stats['images_copied']} 张图片。")

    # Per-department breakdown.
    print("每个科室匹配并复制的文件数量:")
    for dept in departments:
        count = stats['department_file_counts'][dept]
        dept_dir_name = get_department_dir_name(dept)
        print(f"{dept_dir_name}: {count} 个文件")

    return stats


if __name__ == "__main__":
    main()