santiago committed on
Commit
09ad451
1 Parent(s): d572f04

feat: add data preprocessing pipeline

Browse files
Files changed (1) hide show
  1. prepare_data.py +141 -0
prepare_data.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ from typing import Dict, List
5
+
6
+ import argparse
7
+ import json
8
+ from functools import partial
9
+ import pathlib
10
+ import shutil
11
+ import re
12
+
13
+ from tqdm import tqdm
14
+ from PIL import Image
15
+ import pandas as pd
16
+
17
+ ImageCaptionMap = Dict[str, Dict[str, str]]
18
+
19
def _get_image_path(row: pd.Series, root_dir: str = '.') -> str:
    """Build the path of the JPEG file that belongs to one table row.

    The result has the form
    ``<root_dir>/files/<prefix>/p<subject_id>/s<study_id>/<dicom_id>.jpg``
    where ``<prefix>`` is the first three characters of ``p<subject_id>``.

    Args:
        row: Record exposing ``subject_id``, ``study_id`` and ``dicom_id``
            as attributes.
        root_dir: Directory that contains the ``files`` folder. Besides
            plain strings, path-like objects such as ``pathlib.Path`` are
            accepted.

    Returns:
        The image path as a string, joined with ``/``.
    """
    patient_dir = f'p{row.subject_id}'
    parts = [
        # str() lets callers pass a pathlib.Path directly; str.join would
        # otherwise raise TypeError on a non-str element.
        str(root_dir),
        'files',
        patient_dir[:3],
        patient_dir,
        f's{row.study_id}',
        f'{row.dicom_id}.jpg',
    ]

    return '/'.join(parts)
30
+
31
def _clean_caption_text(text) -> str:
    """Normalise one caption cell: squeeze ``_`` runs, flatten, lowercase."""
    if not isinstance(text, str):
        # Missing cells (None / NaN) carry no text; anything else is
        # stringified so re.sub does not raise TypeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    collapsed = re.sub(r'_+', '_', text)
    return collapsed.replace('\n', ' ').lower().rstrip('.')


def _prepare_dataframe(
    captions: pd.DataFrame,
    metadata: pd.DataFrame,
    row: pd.Series
) -> pd.Series:
    """Fill one row with its cleaned captions and its view position.

    Args:
        captions: Table indexed by study key (``s<study_id>``); every column
            is copied into ``row`` after cleaning.
        metadata: Table indexed by ``dicom_id`` with a ``ViewPosition``
            column.
        row: Record with ``study_id`` and ``dicom_id``. It is modified in
            place and also returned.

    Returns:
        ``row`` with the caption columns and ``view_position`` filled in.
        Rows whose study or image is not present in the lookup tables keep
        their existing values for the corresponding fields.
    """
    study_key = f's{row.study_id}'
    if study_key in captions.index:
        study_captions = captions.loc[study_key]
        for column in captions.columns:
            row[column] = _clean_caption_text(study_captions[column])

    if row.dicom_id in metadata.index:
        view_position = metadata.loc[row.dicom_id, 'ViewPosition']
        # A missing ViewPosition is read as NaN, which json.dumps would later
        # write as the non-standard token ``NaN``; store '' instead.
        row['view_position'] = '' if pd.isna(view_position) else view_position

    return row
51
+
52
def copy_image(
    row: pd.Series,
    target_path: pathlib.Path,
    split: str,
    size: int = 224
) -> str:
    """Write a square, resized copy of the row's image into the split folder.

    Args:
        row: Record with ``path`` (source image file) and ``dicom_id``
            (used as the output file name).
        target_path: Root of the output tree.
        split: Name of the sub-folder of ``target_path`` to write into.
        size: Edge length in pixels; the image is resized to
            ``(size, size)``.

    Returns:
        The resolved path of the written file, as a string.
    """
    destination = (target_path / split / f'{row.dicom_id}.jpg').resolve()
    # Create the folder here so the function also works when the caller has
    # not prepared the output tree beforehand.
    destination.parent.mkdir(parents=True, exist_ok=True)

    # The context manager closes the source file handle deterministically
    # instead of leaving it to garbage collection.
    with Image.open(row.path) as source:
        source.resize((size, size)).save(str(destination))

    return str(destination)
66
+
67
def generate_dataset(
    root_dir: pathlib.Path,
    target_dir: pathlib.Path,
    split: str = 'validate'
) -> ImageCaptionMap:
    """Resize the images of one split and collect their captions.

    Reads ``mimic-cxr-2.0.0-metadata.csv``, ``mimic-cxr-2.0.0-split.csv`` and
    ``mimic_cxr_sectioned.csv`` from ``root_dir / 'metadata'``, keeps the rows
    of the split table whose ``split`` column equals ``split``, writes a
    resized copy of every image to ``target_dir / split`` and maps each
    written file to its caption fields.

    Args:
        root_dir: Dataset root containing the ``metadata`` folder and the
            image tree.
        target_dir: Root of the output tree; ``target_dir / split`` is
            created if necessary.
        split: Value of the ``split`` column to process.

    Returns:
        Mapping from the written image path to a dict with the keys
        ``impression``, ``findings``, ``last_paragraph``, ``comparison`` and
        ``view_position``.

    Raises:
        ValueError: If ``split`` does not occur in the split table.
    """
    meta_dir = root_dir / 'metadata'

    metadata = pd.read_csv(meta_dir / 'mimic-cxr-2.0.0-metadata.csv')
    df_split = pd.read_csv(meta_dir / 'mimic-cxr-2.0.0-split.csv')
    captions = pd.read_csv(meta_dir / 'mimic_cxr_sectioned.csv')

    # Missing caption cells become empty strings so text cleaning can treat
    # every cell as a string.
    captions = captions.where(~captions.isna(), '').set_index('study')
    metadata = metadata.set_index('dicom_id')

    if split not in df_split['split'].unique():
        raise ValueError('bad split')

    # Work on an explicit copy: adding columns to a filtered slice of
    # df_split triggers pandas' SettingWithCopy warning.
    current_split = df_split[df_split['split'] == split].copy()

    get_abs_path = partial(_get_image_path, root_dir=str(root_dir.resolve()))
    current_split['path'] = current_split.apply(get_abs_path, axis=1)

    # Pre-create the target columns so rows without a matching study or
    # image keep an empty string.
    current_split['view_position'] = ''
    for col in captions.columns:
        current_split[col] = ''

    preprocess_func = partial(_prepare_dataframe, captions, metadata)
    df = current_split.apply(preprocess_func, axis=1)

    (target_dir / split).mkdir(exist_ok=True, parents=True)

    caption_keys = (
        'impression',
        'findings',
        'last_paragraph',
        'comparison',
        'view_position',
    )
    image_path_to_caption = {}

    # iterrows() has no length, so pass the total for a real progress bar.
    for _, element in tqdm(df.iterrows(), total=len(df)):
        caption = {key: element[key] for key in caption_keys}
        image_path = copy_image(element, target_dir, split)
        image_path_to_caption[image_path] = caption

    return image_path_to_caption
114
+
115
def dump_dataset(image_path_to_caption: 'ImageCaptionMap') -> List[str]:
    """Serialise every image/caption pair to one JSON document per entry.

    Args:
        image_path_to_caption: Mapping from image path to its caption dict.

    Returns:
        One JSON string per mapping entry, in the mapping's iteration order.
        Each string encodes an object with the keys ``image_path`` and
        ``caption``. An empty mapping yields an empty list.
    """
    return [
        json.dumps({
            'image_path': image_path,
            'caption': captions,
        })
        for image_path, captions in image_path_to_caption.items()
    ]
125
+
126
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser of the preprocessing script."""
    parser = argparse.ArgumentParser(description='Preprocess MIMIC-CXR dataset')
    # add_argument() takes ``help``, not ``description``; the latter raises
    # TypeError as soon as the parser is built. Both paths are mandatory
    # because pathlib.Path(None) would fail further down.
    parser.add_argument('--data_dir', required=True, help='MIMIC-CXR path')
    parser.add_argument('--target_dir', required=True, help='output path')
    return parser


def main(argv=None) -> None:
    """Preprocess the test, validate and train splits.

    For every split the images are written below ``--target_dir`` and a file
    ``<split>_dataset.json`` is created there, holding one JSON document per
    line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    data_dir = pathlib.Path(args.data_dir)
    target_dir = pathlib.Path(args.target_dir)

    for split in ('test', 'validate', 'train'):
        image_path_to_caption = generate_dataset(data_dir, target_dir, split)
        lines = dump_dataset(image_path_to_caption)

        output_file = target_dir / f'{split}_dataset.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))


if __name__ == '__main__':
    main()