santiago
committed on
Commit
•
09ad451
1
Parent(s):
d572f04
feat: add data preprocessing pipeline
Browse files- prepare_data.py +141 -0
prepare_data.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
from typing import Dict, List
|
5 |
+
|
6 |
+
import argparse
|
7 |
+
import json
|
8 |
+
from functools import partial
|
9 |
+
import pathlib
|
10 |
+
import shutil
|
11 |
+
import re
|
12 |
+
|
13 |
+
from tqdm import tqdm
|
14 |
+
from PIL import Image
|
15 |
+
import pandas as pd
|
16 |
+
|
17 |
+
# Maps an output image path to its caption fields (field name -> text).
ImageCaptionMap = Dict[str, Dict[str, str]]
|
18 |
+
|
19 |
+
def _get_image_path(row: pd.Series, root_dir: str = '.') -> str:
    """Build the path of the JPEG file that belongs to one split row.

    The result has the form
    ``<root_dir>/files/<bucket>/p<subject_id>/s<study_id>/<dicom_id>.jpg``
    where ``<bucket>`` is the first three characters of ``p<subject_id>``.
    Components are joined with a literal ``/``.

    Args:
        row: Row exposing ``subject_id``, ``study_id`` and ``dicom_id``.
        root_dir: Directory that contains the ``files`` folder.

    Returns:
        The assembled path as a string.
    """
    patient_dir = f'p{row.subject_id}'
    # The top-level folder is the patient folder name cut to three characters.
    bucket_dir = patient_dir[:3]
    study_dir = f's{row.study_id}'
    file_name = f'{row.dicom_id}.jpg'

    components = (root_dir, 'files', bucket_dir, patient_dir, study_dir, file_name)
    return '/'.join(components)
|
30 |
+
|
31 |
+
def _prepare_dataframe(
    captions: pd.DataFrame,
    metadata: pd.DataFrame,
    row: pd.Series
) -> pd.Series:
    """Fill one split row with its cleaned caption texts and view position.

    Args:
        captions: Caption table indexed by study key (``s<study_id>``); every
            cell is expected to be a string.
        metadata: Image metadata indexed by ``dicom_id`` with a
            ``ViewPosition`` column (assumes ``dicom_id`` values are unique).
        row: Split row exposing ``study_id`` and ``dicom_id``; it is modified
            in place.

    Returns:
        The same ``row`` object. Caption columns are overwritten when the
        study is present in ``captions``; ``view_position`` is overwritten
        when the image is present in ``metadata`` with a non-missing value.
    """
    study_key = f's{row.study_id}'
    if study_key in captions.index:
        # Normalise each text: collapse runs of underscores, flatten
        # newlines, lower-case, and drop trailing full stops.
        row[captions.columns] = (
            captions
            .loc[study_key]
            .apply(lambda text: (
                re.sub('_+', '_', text)
                .replace('\n', ' ')
                .lower().rstrip('.')
            ))
        )

    if row.dicom_id in metadata.index:
        view_position = metadata.loc[row.dicom_id, 'ViewPosition']
        # A missing ViewPosition is read as NaN; keeping the value already in
        # the row avoids json.dumps later emitting a bare ``NaN`` token,
        # which is not valid JSON.
        if pd.notna(view_position):
            row['view_position'] = view_position

    return row
|
51 |
+
|
52 |
+
def copy_image(
    row: pd.Series,
    target_path: pathlib.Path,
    split: str,
    size: int = 224
) -> str:
    """Resize one source image to a square and save it in the split folder.

    The destination directory is not created here; it must already exist.

    Args:
        row: Row whose ``path`` is the source image and whose ``dicom_id``
            names the output file.
        target_path: Root output directory.
        split: Name of the sub-directory of ``target_path`` to write into.
        size: Width and height, in pixels, of the saved image.

    Returns:
        The resolved path of the written ``<dicom_id>.jpg`` file as a string.
    """
    destination = target_path / split / f'{row.dicom_id}.jpg'
    target_img_path = str(destination.resolve())

    # Open the source in a context manager so its file handle is closed
    # deterministically instead of whenever the object is garbage-collected.
    with Image.open(row.path) as source_img:
        source_img.resize((size, size)).save(target_img_path)

    return target_img_path
|
66 |
+
|
67 |
+
def generate_dataset(
    root_dir: pathlib.Path,
    target_dir: pathlib.Path,
    split: str = 'validate'
) -> ImageCaptionMap:
    """Resize the images of one split and collect their caption fields.

    Reads ``mimic-cxr-2.0.0-metadata.csv``, ``mimic-cxr-2.0.0-split.csv`` and
    ``mimic_cxr_sectioned.csv`` from ``<root_dir>/metadata``, selects the rows
    of ``split``, writes a resized copy of every image to
    ``<target_dir>/<split>`` and returns the captions keyed by output path.

    Args:
        root_dir: Dataset root containing the ``metadata`` and ``files``
            folders.
        target_dir: Output root; ``<target_dir>/<split>`` is created if
            missing.
        split: Value of the ``split`` column to select.

    Returns:
        Mapping from written image path to a dict with the keys
        ``impression``, ``findings``, ``last_paragraph``, ``comparison`` and
        ``view_position``.

    Raises:
        ValueError: If ``split`` does not occur in the split CSV.
    """
    meta_dir = root_dir / 'metadata'

    metadata = pd.read_csv(meta_dir / 'mimic-cxr-2.0.0-metadata.csv')
    df_split = pd.read_csv(meta_dir / 'mimic-cxr-2.0.0-split.csv')
    captions = pd.read_csv(meta_dir / 'mimic_cxr_sectioned.csv')

    # Missing caption sections become empty strings so the text clean-up
    # always receives a str.
    captions = captions.where(~captions.isna(), '').set_index('study')
    metadata = metadata.set_index('dicom_id')

    # Use item access for the column: ``split`` is also the name of the
    # parameter and attribute access on a DataFrame is easy to misread.
    if split not in df_split['split'].unique():
        raise ValueError('bad split')

    # Take an explicit copy: adding columns to a boolean-filtered slice is
    # chained assignment and triggers pandas' SettingWithCopy warning.
    current_split = df_split[df_split['split'] == split].copy()
    get_abs_path = partial(_get_image_path, root_dir=str(root_dir.resolve()))

    current_split['path'] = current_split.apply(get_abs_path, axis=1)
    # Pre-create the output columns with empty defaults; rows without a
    # caption or metadata entry keep these values.
    current_split['view_position'] = ''
    for col in captions.columns:
        current_split[col] = ''

    preprocess_func = partial(_prepare_dataframe, captions, metadata)
    df = current_split.apply(preprocess_func, axis=1)

    image_path_to_caption = {}
    (target_dir / split).mkdir(exist_ok=True, parents=True)

    caption_keys = (
        'impression',
        'findings',
        'last_paragraph',
        'comparison',
        'view_position',
    )

    # iterrows() has no length, so pass the total for a real progress bar.
    for _, element in tqdm(df.iterrows(), total=len(df)):
        caption = {key: element[key] for key in caption_keys}
        image_path = copy_image(element, target_dir, split)
        image_path_to_caption[image_path] = caption

    return image_path_to_caption
|
114 |
+
|
115 |
+
def dump_dataset(image_path_to_caption: ImageCaptionMap) -> List[str]:
    """Serialise every image/caption pair as one JSON object string.

    Args:
        image_path_to_caption: Mapping from image path to its caption fields.

    Returns:
        One JSON string per entry, in the mapping's iteration order, each
        holding the keys ``image_path`` and ``caption``.
    """
    return [
        json.dumps({'image_path': path, 'caption': fields})
        for path, fields in image_path_to_caption.items()
    ]
|
125 |
+
|
126 |
+
if __name__ == '__main__':
    # Command-line entry point: preprocess every split and write one
    # ``<split>_dataset.json`` file (one JSON object per line) per split.
    parser = argparse.ArgumentParser(description='Preprocess MIMIC-CXR dataset')
    # add_argument() takes ``help``; passing ``description`` raises TypeError.
    # Both paths are required because pathlib.Path(None) would fail below.
    parser.add_argument('--data_dir', required=True, help='MIMIC-CXR path')
    parser.add_argument('--target_dir', required=True, help='output path')

    args = parser.parse_args()

    data_dir = pathlib.Path(args.data_dir)
    target_dir = pathlib.Path(args.target_dir)

    for split in ['test', 'validate', 'train']:
        image_path_to_caption = generate_dataset(data_dir, target_dir, split)
        lines = dump_dataset(image_path_to_caption)

        with open(target_dir / f'{split}_dataset.json', 'w') as f:
            f.write('\n'.join(lines))
|