mohdelgaar committed
Commit c47c7dc
0 Parent(s)

Initial commit

Files changed (13):
  1. .gitattributes +34 -0
  2. README.md +12 -0
  3. app.py +81 -0
  4. data.py +487 -0
  5. demo.py +241 -0
  6. demo_assets.py +20 -0
  7. electra-base.pt +3 -0
  8. examples/note1.txt +17 -0
  9. examples/note2.txt +17 -0
  10. examples/note3.txt +17 -0
  11. examples/note4.txt +17 -0
  12. model.py +206 -0
  13. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Clinical Decisions
+ emoji: ⚕️
+ colorFrom: indigo
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 4.40.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,81 @@
+ import argparse
+ import torch
+ from data import load_tokenizer
+ from model import load_model
+ from demo import run_gradio
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--data_dir', default='/data/mohamed/data')
+ parser.add_argument('--aim_repo', default='/data/mohamed/')
+ parser.add_argument('--ckpt', default='electra-base.pt')
+ parser.add_argument('--aim_exp', default='mimic-decisions-1215')
+ parser.add_argument('--label_encoding', default='multiclass')
+ parser.add_argument('--multiclass', action='store_true')
+ parser.add_argument('--debug', action='store_true')
+ parser.add_argument('--save_losses', action='store_true')
+ parser.add_argument('--task', default='token', choices=['seq', 'token'])
+ parser.add_argument('--max_len', type=int, default=512)
+ parser.add_argument('--num_layers', type=int, default=3)
+ parser.add_argument('--kernels', nargs=3, type=int, default=[1,2,3])
+ parser.add_argument('--model', default='roberta-base')
+ parser.add_argument('--model_name', default='google/electra-base-discriminator')
+ parser.add_argument('--gpu', default='0')
+ parser.add_argument('--grad_accumulation', default=2, type=int)
+ parser.add_argument('--pheno_id', type=int)
+ parser.add_argument('--unseen_pheno', type=int)
+ parser.add_argument('--text_subset')
+ parser.add_argument('--pheno_n', type=int, default=500)
+ parser.add_argument('--hidden_size', type=int, default=100)
+ parser.add_argument('--emb_size', type=int, default=400)
+ parser.add_argument('--total_steps', type=int, default=5000)
+ parser.add_argument('--train_log', type=int, default=500)
+ parser.add_argument('--val_log', type=int, default=1000)
+ parser.add_argument('--seed', default='0')
+ parser.add_argument('--num_phenos', type=int, default=10)
+ parser.add_argument('--num_decs', type=int, default=9)
+ parser.add_argument('--num_umls_tags', type=int, default=33)
+ parser.add_argument('--batch_size', type=int, default=8)
+ parser.add_argument('--pos_weight', type=float, default=1.25)
+ parser.add_argument('--alpha_distil', type=float, default=1)
+ parser.add_argument('--distil', action='store_true')
+ parser.add_argument('--distil_att', action='store_true')
+ parser.add_argument('--distil_ckpt')
+ parser.add_argument('--use_umls', action='store_true')
+ parser.add_argument('--include_nolabel', action='store_true')
+ parser.add_argument('--truncate_train', action='store_true')
+ parser.add_argument('--truncate_eval', action='store_true')
+ parser.add_argument('--load_ckpt', action='store_true')
+ parser.add_argument('--gradio', action='store_true')
+ parser.add_argument('--optuna', action='store_true')
+ parser.add_argument('--mimic_data', action='store_true')
+ parser.add_argument('--eval_only', action='store_true')
+ parser.add_argument('--lr', type=float, default=4e-5)
+ parser.add_argument('--resample', default='')
+ parser.add_argument('--verbose', type=bool, default=True)
+ parser.add_argument('--use_crf', type=bool)
+ parser.add_argument('--print_spans', action='store_true')
+ args = parser.parse_args()
+
+ if args.task == 'seq' and args.pheno_id is not None:
+     args.num_labels = 1
+ elif args.task == 'seq':
+     args.num_labels = args.num_phenos
+ elif args.task == 'token':
+     if args.use_umls:
+         args.num_labels = args.num_umls_tags
+     else:
+         args.num_labels = args.num_decs
+     if args.label_encoding == 'multiclass':
+         args.num_labels = args.num_labels * 2 + 1
+     elif args.label_encoding == 'bo':
+         args.num_labels *= 2
+     elif args.label_encoding == 'boe':
+         args.num_labels *= 3
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ tokenizer = load_tokenizer(args.model_name)
+ model = load_model(args, device)[0]
+ model.eval()
+ torch.set_grad_enabled(False)
+ run_gradio(model, tokenizer)
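
For readers who want to exercise the pipeline outside the Gradio UI, here is a minimal sketch (not part of the committed files) that mirrors app.py's defaults: token-level task, multiclass label encoding, and the bundled ELECTRA checkpoint. The Namespace values are copied from the defaults above, and the input sentence is made up.

```python
# Sketch only: label one note programmatically with the same defaults app.py uses.
# Assumes electra-base.pt sits in the working directory and the modules above import cleanly.
import torch
from argparse import Namespace
from data import load_tokenizer
from model import load_model

args = Namespace(
    task='token', label_encoding='multiclass',
    num_labels=9 * 2 + 1,                     # 9 decision categories x (begin, inside) + "no decision"
    model='electra', model_name='google/electra-base-discriminator',
    ckpt='electra-base.pt', distil=False, distil_att=False,
    use_crf=False, lr=4e-5, total_steps=5000, max_len=512)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = load_tokenizer(args.model_name)
model = load_model(args, device)[0].eval()

with torch.no_grad():
    enc = tokenizer.encode_plus('Patient admitted for hypoxemia; started on zosyn.')  # made-up note
    x = torch.tensor(enc['input_ids']).unsqueeze(0)
    logits = model.generate(x, torch.ones_like(x))   # shape (1, num_tokens, 19)
    print(logits.argmax(-1))                         # per-token decision-category ids
```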
data.py ADDED
@@ -0,0 +1,487 @@
+ import torch
+ import json
+ import os
+ import pandas as pd
+ import numpy as np
+ from torch.utils.data import Dataset, DataLoader
+ from transformers import AutoTokenizer
+ from glob import glob
+ from collections.abc import Iterable
+ from collections import defaultdict
+
+
+ pheno_map = {'alcohol.abuse': 0,
+              'advanced.lung.disease': 1,
+              'advanced.heart.disease': 2,
+              'chronic.pain.fibromyalgia': 3,
+              'other.substance.abuse': 4,
+              'psychiatric.disorders': 5,
+              'obesity': 6,
+              'depression': 7,
+              'advanced.cancer': 8,
+              'chronic.neurological.dystrophies': 9,
+              'none': -1}
+ rev_pheno_map = {v: k for k, v in pheno_map.items()}
+ valid_cats = range(0, 9)
+
+ umls_cats = ['T114', 'T029', 'T073', 'T058', 'T191', 'T200', 'T048', 'T019', 'T046', 'T023', 'T041', 'T059', 'T184', 'T034', 'T116', 'T039', 'T127', 'T201', 'T129', 'T067', 'T109', 'T197', 'T131', 'T130', 'T126', 'T061', 'T203', 'T047', 'T037', 'T074', 'T031', 'T195', 'T168']
+ umls_map = {s: i for i, s in enumerate(umls_cats)}
+
+ def gen_splits(args, phenos):
+     np.random.seed(0)
+     if args.task == 'token':
+         files = glob(os.path.join(args.data_dir, 'mimic_decisions/data/**/*'))
+         if args.use_umls:
+             files = ["/".join(x.split('/')[-1:]) for x in files]
+         else:
+             files = ["/".join(x.split('/')[-2:]) for x in files]
+         subjects = np.unique([os.path.basename(x).split('_')[0] for x in files])
+     elif phenos is not None:
+         subjects = phenos['subject_id'].unique()
+     else:
+         raise ValueError
+
+     phenos['phenotype_label'] = phenos['phenotype_label'].apply(lambda x: x.lower())
+
+     n = len(subjects)
+     train_count = int(0.8*n)
+     val_count = int(0.9*n) - int(0.8*n)
+     test_count = n - int(0.9*n)
+
+     train, val, test = [], [], []
+     np.random.shuffle(subjects)
+     subjects = list(subjects)
+     pheno_list = set(pheno_map.keys())
+     if args.unseen_pheno is not None:
+         test_phenos = {rev_pheno_map[args.unseen_pheno]}
+         unseen_pheno = rev_pheno_map[args.unseen_pheno]
+         train_phenos = pheno_list - test_phenos
+     else:
+         test_phenos = pheno_list
+         train_phenos = pheno_list
+         unseen_pheno = 'null'
+     while len(subjects) > 0:
+         if len(pheno_list) > 0:
+             for pheno in pheno_list:
+                 if len(train) < train_count and pheno in train_phenos:
+                     el = None
+                     for i, subj in enumerate(subjects):
+                         row = phenos[phenos.subject_id == subj]
+                         if row['phenotype_label'].apply(lambda x: pheno in x and not unseen_pheno in x).any():
+                             el = subjects.pop(i)
+                             break
+                     if el is not None:
+                         train.append(el)
+                     elif el is None:
+                         pheno_list.remove(pheno)
+                         break
+                 if len(val) < val_count and (not args.pheno_id or len(val) <= (0.5*val_count)):
+                     el = None
+                     for i, subj in enumerate(subjects):
+                         row = phenos[phenos.subject_id == subj]
+                         if row['phenotype_label'].apply(lambda x: pheno in x).any():
+                             el = subjects.pop(i)
+                             break
+                     if el is not None:
+                         val.append(el)
+                     elif el is None:
+                         pheno_list.remove(pheno)
+                         break
+                 if len(test) < test_count or (args.unseen_pheno is not None and pheno in test_phenos):
+                     el = None
+                     for i, subj in enumerate(subjects):
+                         row = phenos[phenos.subject_id == subj]
+                         if row['phenotype_label'].apply(lambda x: pheno in x).any():
+                             el = subjects.pop(i)
+                             break
+                     if el is not None:
+                         test.append(el)
+                     elif el is None:
+                         pheno_list.remove(pheno)
+                         break
+         else:
+             if len(train) < train_count:
+                 el = subjects.pop()
+                 if el is not None:
+                     train.append(el)
+             if len(val) < val_count:
+                 el = subjects.pop()
+                 if el is not None:
+                     val.append(el)
+             if len(test) < test_count:
+                 el = subjects.pop()
+                 if el is not None:
+                     test.append(el)
+
+     if args.task == 'token':
+         train = [x for x in files if os.path.basename(x).split('_')[0] in train]
+         val = [x for x in files if os.path.basename(x).split('_')[0] in val]
+         test = [x for x in files if os.path.basename(x).split('_')[0] in test]
+     elif phenos is not None:
+         train = phenos[phenos.subject_id.isin(train)]
+         val = phenos[phenos.subject_id.isin(val)]
+         test = phenos[phenos.subject_id.isin(test)]
+     return train, val, test
+
+ class MyDataset(Dataset):
+     def __init__(self, args, tokenizer, data_source, phenos, train=False):
+         super().__init__()
+         self.tokenizer = tokenizer
+         self.data = []
+         self.train = train
+         self.pheno_ids = defaultdict(list)
+         self.dec_ids = {k: [] for k in pheno_map.keys()}
+
+         if args.task == 'seq':
+             for i, row in data_source.iterrows():
+                 sample = self.load_phenos(args, row, i)
+                 self.data.append(sample)
+         else:
+             for i, fn in enumerate(data_source):
+                 sample = self.load_decisions(args, fn, i, phenos)
+                 self.data.append(sample)
+
+     def load_phenos(self, args, row, idx):
+         txt_candidates = glob(os.path.join(args.data_dir,
+             f'mimic_decisions/raw_text/{row["subject_id"]}_{row["hadm_id"]}*.txt'))
+         text = open(txt_candidates[0]).read()
+         if args.pheno_n == 500:
+             file_dir = glob(os.path.join(args.data_dir,
+                 f'mimic_decisions/data/*/{row["subject_id"]}_{row["hadm_id"]}*.json'))[0]
+             with open(file_dir) as f:
+                 data = json.load(f, strict=False)
+                 annots = data[0]['annotations']
+
+             if args.text_subset:
+                 unlabeled_text = np.ones(len(text), dtype=bool)
+                 labeled_text = np.zeros(len(text), dtype=bool)
+                 for annot in annots:
+                     cat = parse_cat(annot['category'])
+                     start, end = map(int, (annot['start_offset'], annot['end_offset']))
+                     if cat is not None:
+                         unlabeled_text[start:end] = 0
+                     if cat in args.text_subset:
+                         labeled_text[start:end] = 1
+
+                 combined_text = unlabeled_text | labeled_text if args.include_nolabel else labeled_text
+                 text = "".join([c for i, c in enumerate(text) if combined_text[i]])
+
+             encoding = self.tokenizer.encode_plus(text,
+                 truncation=args.truncate_train if self.train else args.truncate_eval)
+
+             ids = np.zeros((args.num_decs, len(encoding['input_ids'])))
+             for annot in annots:
+                 start = int(annot['start_offset'])
+
+                 enc_start = encoding.char_to_token(start)
+                 i = 1
+                 while enc_start is None:
+                     enc_start = encoding.char_to_token(start+i)
+                     i += 1
+
+                 end = int(annot['end_offset'])
+                 enc_end = encoding.char_to_token(end)
+                 j = 1
+                 while enc_end is None:
+                     enc_end = encoding.char_to_token(end-j)
+                     j += 1
+
+                 if enc_start is None or enc_end is None:
+                     raise ValueError
+
+                 cat = parse_cat(annot['category'])
+                 if not cat or cat not in valid_cats:
+                     continue
+                 ids[cat-1, enc_start:enc_end] = 1
+         else:
+             encoding = self.tokenizer.encode_plus(text,
+                 truncation=args.truncate_train if self.train else args.truncate_eval)
+             ids = None
+
+         labels = np.zeros(args.num_phenos)
+
+         if args.pheno_n in (500, 800):
+             sample_phenos = row['phenotype_label']
+             if sample_phenos != 'none':
+                 for pheno in sample_phenos.split(','):
+                     labels[pheno_map[pheno.lower()]] = 1
+
+         elif args.pheno_n == 1500:
+             for k, v in pheno_map.items():
+                 if row[k] == 1:
+                     labels[v] = 1
+
+         if args.pheno_id is not None:
+             if args.pheno_id == -1:
+                 labels = [0.0 if any(labels) else 1.0]
+             else:
+                 labels = [labels[args.pheno_id]]
+
+         return encoding['input_ids'], labels, ids
+
+     def load_decisions(self, args, fn, idx, phenos):
+         basename = os.path.basename(fn).split("-")[0]
+         if args.use_umls:
+             file_dir = os.path.join(args.data_dir, 'mimic_decisions/umls', basename)
+         else:
+             file_dir = os.path.join(args.data_dir, 'mimic_decisions/data', fn)
+
+         pheno_id = "_".join(basename.split("_")[:3]) + '.txt'
+         txt_candidates = glob(os.path.join(args.data_dir,
+             f'mimic_decisions/raw_text/{basename}*.txt'))
+         text = open(txt_candidates[0]).read()
+         encoding = self.tokenizer.encode_plus(text,
+             max_length=args.max_len,
+             truncation=args.truncate_train if self.train else args.truncate_eval,
+             padding='max_length',
+         )
+         if pheno_id in phenos.index:
+             sample_phenos = phenos.loc[pheno_id]['phenotype_label']
+             for pheno in sample_phenos.split(','):
+                 self.pheno_ids[pheno].append(idx)
+
+         with open(file_dir) as f:
+             data = json.load(f, strict=False)
+             if args.use_umls:
+                 annots = data
+             else:
+                 annots = data[0]['annotations']
+
+         if args.label_encoding == 'multiclass':
+             labels = np.full(len(encoding['input_ids']), args.num_labels-1, dtype=int)
+         else:
+             labels = np.zeros((len(encoding['input_ids']), args.num_labels))
+         for annot in annots:
+             start = int(annot['start_offset'])
+
+             enc_start = encoding.char_to_token(start)
+             i = 1
+             while enc_start is None and i < 10:
+                 enc_start = encoding.char_to_token(start+i)
+                 i += 1
+             if i == 10:
+                 break
+
+             end = int(annot['end_offset'])
+             enc_end = encoding.char_to_token(end)
+             j = 1
+             while enc_end is None and j < 10:
+                 enc_end = encoding.char_to_token(end-j)
+                 j += 1
+             if j == 10:
+                 enc_end = len(encoding.input_ids)
+
+             if enc_start is None or enc_end is None:
+                 raise ValueError
+
+             if args.label_encoding == 'multiclass' and any([x in [2*y for y in range(args.num_labels//2)] for x in labels[enc_start:enc_end]]):
+                 continue
+
+             if args.use_umls:
+                 cat = umls_map.get(annot['category'], None)
+             else:
+                 cat = parse_cat(annot['category'])
+                 if cat:
+                     cat -= 1
+             if cat is None or (not args.use_umls and cat not in valid_cats):
+                 continue
+             if args.label_encoding == 'multiclass':
+                 cat1 = cat * 2
+                 cat2 = cat * 2 + 1
+                 labels[enc_start] = cat1
+                 labels[enc_start+1:enc_end] = cat2
+             elif args.label_encoding == 'bo':
+                 cat1 = cat * 2
+                 cat2 = cat * 2 + 1
+                 labels[enc_start, cat1] = 1
+                 labels[enc_start+1:enc_end, cat2] = 1
+             elif args.label_encoding == 'boe':
+                 cat1 = cat * 3
+                 cat2 = cat * 3 + 1
+                 cat3 = cat * 3 + 2
+                 labels[enc_start, cat1] = 1
+                 labels[enc_start+1:enc_end-1, cat2] = 1
+                 labels[enc_end-1, cat3] = 1
+             else:
+                 labels[enc_start:enc_end, cat] = 1
+
+         return {'input_ids': encoding['input_ids'], 'labels': labels, 't2c': encoding.token_to_chars}
+
+     def __getitem__(self, idx):
+         return self.data[idx]
+
+     def __len__(self):
+         return len(self.data)
+
+ def parse_cat(cat):
+     for i, c in enumerate(cat):
+         if c.isnumeric():
+             # guard against a digit in the final position before looking ahead
+             if i + 1 < len(cat) and cat[i+1].isnumeric():
+                 return int(cat[i:i+2])
+             return int(c)
+     return None
+
+
+ def load_phenos(args):
+     if args.pheno_n == 500:
+         phenos = pd.read_csv(os.path.join(args.data_dir,
+             'mimic_decisions/phenos500'),
+             sep='\t').rename(lambda x: x.strip(), axis=1)
+         phenos['raw_text'] = phenos['raw_text'].apply(lambda x: os.path.basename(x))
+         phenos[['SUBJECT_ID', 'HADM_ID', 'ROW_ID']] = \
+             [os.path.splitext(x)[0].split('_')[:3] for x in phenos['raw_text']]
+         phenos = phenos[phenos['phenotype_label'] != '?']
+     elif args.pheno_n == 800:
+         phenos = pd.read_csv(os.path.join(args.data_dir, 'mimic_decisions/phenos800.csv'))
+         phenos.rename({'Ham_ID': 'HADM_ID'}, inplace=True, axis=1)
+         phenos = phenos[phenos.phenotype_label != '?']
+     elif args.pheno_n == 1500:
+         phenos = pd.read_csv(os.path.join(args.data_dir, 'mimic_decisions/phenos1500.csv'))
+         phenos.rename({'Hospital.Admission.ID': 'HADM_ID',
+                        'subject.id': 'SUBJECT_ID'}, inplace=True, axis=1)
+         phenos = phenos[phenos.Unsure != 1]
+         phenos['psychiatric.disorders'] = phenos['Dementia']\
+             | phenos['Developmental.Delay.Retardation']\
+             | phenos['Schizophrenia.and.other.Psychiatric.Disorders']
+     else:
+         raise ValueError
+     phenos.rename(lambda k: k.lower(), inplace=True, axis=1)
+     return phenos
+
+ def downsample(dataset):
+     from sklearn.utils import resample  # imported here so scikit-learn stays optional
+     data = dataset.data
+     class0 = [x for x in data if x[1][0] == 0]
+     class1 = [x for x in data if x[1][0] == 1]
+
+     if len(class0) > len(class1):
+         class0 = resample(class0, replace=False, n_samples=len(class1), random_state=0)
+     else:
+         class1 = resample(class1, replace=False, n_samples=len(class0), random_state=0)
+     dataset.data = class0 + class1
+
+ def upsample(dataset):
+     from sklearn.utils import resample  # imported here so scikit-learn stays optional
+     data = dataset.data
+     class0 = [x for x in data if x[1][0] == 0]
+     class1 = [x for x in data if x[1][0] == 1]
+
+     if len(class0) > len(class1):
+         class1 = resample(class1, replace=True, n_samples=len(class0), random_state=0)
+     else:
+         class0 = resample(class0, replace=True, n_samples=len(class1), random_state=0)
+     dataset.data = class0 + class1
+
+ def load_tokenizer(name):
+     return AutoTokenizer.from_pretrained(name)
+
+ def load_data(args):
+     def collate_segment(batch):
+         xs = []
+         ys = []
+         t2cs = []
+         has_ids = 'ids' in batch[0]
+         if has_ids:
+             idss = []
+         else:
+             ids = None
+         masks = []
+         for i in range(len(batch)):
+             x = batch[i]['input_ids']
+             y = batch[i]['labels']
+             if has_ids:
+                 ids = batch[i]['ids']
+             n = len(x)
+             if n > args.max_len:
+                 start = np.random.randint(0, n - args.max_len + 1)
+                 x = x[start:start + args.max_len]
+                 if args.task == 'token':
+                     y = y[start:start + args.max_len]
+                 if has_ids:
+                     new_ids = []
+                     ids = [x[start:start + args.max_len] for x in ids]
+                     for subids in ids:
+                         subids = [idx for idx, x in enumerate(subids) if x]
+                         new_ids.append(subids)
+                     all_ids = set([y for x in new_ids for y in x])
+                     nones = set(range(args.max_len)) - all_ids
+                     new_ids.append(list(nones))
+                 mask = [1] * args.max_len
+             elif n < args.max_len:
+                 x = np.pad(x, (0, args.max_len - n))
+                 if args.task == 'token':
+                     y = np.pad(y, ((0, args.max_len - n), (0, 0)))
+                 mask = [1] * n + [0] * (args.max_len - n)
+             else:
+                 mask = [1] * n
+             xs.append(x)
+             ys.append(y)
+             t2cs.append(batch[i]['t2c'])
+             if has_ids:
+                 idss.append(new_ids)
+             masks.append(mask)
+
+         xs = torch.tensor(xs)
+         ys = torch.tensor(ys)
+         masks = torch.tensor(masks)
+         return {'input_ids': xs, 'labels': ys, 'ids': ids, 'mask': masks, 't2c': t2cs}
+
+     def collate_full(batch):
+         lens = [len(x['input_ids']) for x in batch]
+         max_len = max(args.max_len, max(lens))
+         for i in range(len(batch)):
+             batch[i]['input_ids'] = np.pad(batch[i]['input_ids'], (0, max_len - lens[i]))
+             if args.task == 'token':
+                 if args.label_encoding == 'multiclass':
+                     batch[i]['labels'] = np.pad(batch[i]['labels'], (0, max_len - lens[i]), constant_values=-100)
+                 else:
+                     batch[i]['labels'] = np.pad(batch[i]['labels'], ((0, max_len - lens[i]), (0, 0)))
+             mask = [1] * lens[i] + [0] * (max_len - lens[i])
+             batch[i]['mask'] = mask
+
+         batch = {k: torch.tensor(np.array([sample[k] for sample in batch])) if isinstance(batch[0][k], Iterable) else
+                  [sample[k] for sample in batch]
+                  for k in batch[0].keys()}
+         return batch
+
+     tokenizer = load_tokenizer(args.model_name)
+     args.vocab_size = tokenizer.vocab_size
+     args.max_length = min(tokenizer.model_max_length, 512)
+
+     if args.mimic_data:
+         from datasets import Dataset
+         df = pd.read_csv('/data/mohamed/data/mimiciii/NOTEEVENTS.csv.gz',
+                          usecols=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'TEXT'])
+         data = Dataset.from_pandas(df)
+         return data, tokenizer
+     else:
+         phenos = load_phenos(args)
+         train_files, val_files, test_files = gen_splits(args, phenos)
+         phenos.set_index('raw_text', inplace=True)
+         train_dataset = MyDataset(args, tokenizer, train_files, phenos, train=True)
+
+         if args.resample == 'down':
+             downsample(train_dataset)
+         elif args.resample == 'up':
+             upsample(train_dataset)
+         val_dataset = MyDataset(args, tokenizer, val_files, phenos)
+         test_dataset = MyDataset(args, tokenizer, test_files, phenos)
+         print('Train dataset:', len(train_dataset))
+         print('Val dataset:', len(val_dataset))
+         print('Test dataset:', len(test_dataset))
+         train_ns = DataLoader(train_dataset, 1, False,
+                               collate_fn=collate_full,
+                               )
+         train_dataloader = DataLoader(train_dataset, args.batch_size, True,
+                                       collate_fn=collate_segment,
+                                       )
+         val_dataloader = DataLoader(val_dataset, 1, False, collate_fn=collate_full)
+         test_dataloader = DataLoader(test_dataset, 1, False, collate_fn=collate_full)
+
+         train_files = [os.path.basename(x).split('-')[0] for x in train_files]
+         val_files = [os.path.basename(x).split('-')[0] for x in val_files]
+         test_files = [os.path.basename(x).split('-')[0] for x in test_files]
+
+         return train_dataloader, val_dataloader, test_dataloader, train_ns, [train_files, val_files, test_files]
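
The 'multiclass' branch in load_decisions is essentially a begin/inside tagging scheme flattened into one class space. A small illustration (not part of the commit) of how a span for category c lands in the label vector, assuming the default num_decs = 9; the span indices and category index are hypothetical.

```python
# Sketch only: the 'multiclass' token-label layout used by MyDataset.load_decisions.
# With num_decs = 9, num_labels = 9*2 + 1 = 19: category c tags its first token as 2*c,
# the rest of the span as 2*c + 1, and 18 (num_labels - 1) means "no decision annotation".
import numpy as np

num_decs = 9
no_label = num_decs * 2                      # 18, the same value demo.py calls OTHERS_ID

labels = np.full(12, no_label, dtype=int)
cat, enc_start, enc_end = 4, 3, 7            # a hypothetical span for category index 4
labels[enc_start] = cat * 2                  # span-begin tag
labels[enc_start + 1:enc_end] = cat * 2 + 1  # span-inside tags
print(labels)   # [18 18 18  8  9  9  9 18 18 18 18 18]
```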
demo.py ADDED
@@ -0,0 +1,241 @@
+ import gradio as gr
+ import torch
+ from datetime import datetime
+ from dateutil import parser
+ from demo_assets import *
+ import re
+
+ categories = ['Contact related', 'Gathering additional information', 'Defining problem',
+               'Treatment goal', 'Drug related', 'Therapeutic procedure related', 'Evaluating test result',
+               'Deferment', 'Advice and precaution', 'Legal and insurance related']
+ unicode_symbols = [
+     "\U0001F91D",  # Handshake
+     "\U0001F50D",  # Magnifying glass
+     "\U0001F9E9",  # Puzzle piece
+     "\U0001F3AF",  # Target
+     "\U0001F48A",  # Pill
+     "\U00002702",  # Surgical scissors
+     "\U0001F9EA",  # Test tube
+     "\U000023F0",  # Alarm clock
+     "\U000026A0",  # Warning sign
+     "\U0001F4C4"   # Document
+ ]
+
+ OTHERS_ID = 18
+ def postprocess_labels(text, logits, t2c):
+     tags = [None for _ in text]
+     labels = logits.argmax(-1)
+     for i, cat in enumerate(labels):
+         if cat != OTHERS_ID:
+             char_ids = t2c(i)
+             if char_ids is None:
+                 continue
+             for idx in range(char_ids.start, char_ids.end):
+                 if tags[idx] is None and idx < len(text):
+                     tags[idx] = categories[cat // 2]
+     for i in range(len(text)-1):
+         if text[i] == ' ' and (text[i+1] == ' ' or tags[i-1] == tags[i+1]):
+             tags[i] = tags[i-1]
+     return tags
+
+ def indicators_to_spans(labels, t2c=None):
+     def add_span(c, start, end):
+         if t2c(start) is None or t2c(end) is None:
+             start, end = -1, -1
+         else:
+             start = t2c(start).start
+             end = t2c(end).end
+         span = (c, start, end)
+         spans.add(span)
+
+     spans = set()
+     num_tokens = len(labels)
+     num_classes = OTHERS_ID // 2
+     start = None
+     cls = None
+     for t in range(num_tokens):
+         if start and labels[t] == cls + 1:
+             continue
+         elif start:
+             add_span(cls // 2, start, t - 1)
+             start = None
+         # if not start and labels[t] in [2*x for x in range(num_classes)]:
+         if not start and labels[t] != OTHERS_ID:
+             start = t
+             cls = int(labels[t]) // 2 * 2
+     return spans
+
+ def extract_date(text):
+     pattern = r'(?<=Date: )\s*(\[\*\*.*?\*\*\]|\d{1,4}[-/]\d{1,2}[-/]\d{1,4})'
+     match = re.search(pattern, text).group(1)
+     start, end = None, None
+     for i, c in enumerate(match):
+         if start is None and c.isnumeric():
+             start = i
+         elif c.isnumeric():
+             end = i + 1
+     match = match[start:end]
+     return match
+
+
+ def run_gradio(model, tokenizer):
+     def predict(text):
+         encoding = tokenizer.encode_plus(text)
+         x = torch.tensor(encoding['input_ids']).unsqueeze(0).to(device)
+         mask = torch.ones_like(x)
+         output = model.generate(x, mask)[0]
+         return output, encoding.token_to_chars
+
+     def process(text):
+         if text is not None:
+             output, t2c = predict(text)
+             tags = postprocess_labels(text, output, t2c)
+             with open('log.csv', 'a') as f:
+                 f.write(f'{datetime.now()},{text}\n')
+             return list(zip(text, tags))
+         else:
+             return text
+
+     def process_sum(*inputs):
+         global sum_c
+         dates = {}
+         for i in range(sum_c):
+             text = inputs[i]
+             output, t2c = predict(text)
+             spans = indicators_to_spans(output.argmax(-1), t2c)
+             date = extract_date(text)
+             present_decs = set(cat for cat, _, _ in spans)
+             decs = {k: [] for k in sorted(present_decs)}
+             for c, s, e in spans:
+                 decs[c].append(text[s:e])
+             dates[date] = decs
+
+         out = ""
+         for date in sorted(dates.keys(), key=lambda x: parser.parse(x)):
+             out += f'## **[{date}]**\n\n'
+             decs = dates[date]
+             for c in decs:
+                 out += f'### {unicode_symbols[c]} ***{categories[c]}***\n\n'
+                 for dec in decs[c]:
+                     out += f'{dec}\n\n'
+
+         return out
+
+     global sum_c
+     sum_c = 1
+     SUM_INPUTS = 20
+     def update_inputs(inputs):
+         outputs = []
+         if inputs is None:
+             c = 0
+         else:
+             inputs = [open(f.name).read() for f in inputs]
+             for i, text in enumerate(inputs):
+                 outputs.append(gr.update(value=text, visible=True))
+             c = len(inputs)
+
+         n = SUM_INPUTS
+         for i in range(n - c):
+             outputs.append(gr.update(value='', visible=False))
+         global sum_c; sum_c = c
+         return outputs
+
+     def add_ex(*inputs):
+         global sum_c
+         new_idx = sum_c
+         if new_idx < SUM_INPUTS:
+             out = inputs[:new_idx] + (gr.update(visible=True),) + inputs[new_idx+1:]
+             sum_c += 1
+         else:
+             out = inputs
+         return out
+
+     def sub_ex(*inputs):
+         global sum_c
+         new_idx = sum_c - 1
+         if new_idx > 0:
+             out = inputs[:new_idx] + (gr.update(visible=False),) + inputs[new_idx+1:]
+             sum_c -= 1
+         else:
+             out = inputs
+         return out
+
+
+     device = model.backbone.device
+     # colors = ['aqua', 'blue', 'fuchsia', 'teal', 'green', 'olive', 'lime', 'silver', 'purple', 'red',
+     #           'yellow', 'navy', 'gray', 'white', 'maroon', 'black']
+     colors = ['#8dd3c7', '#ffffb3', '#bebada', '#fb8072', '#80b1d3', '#fdb462', '#b3de69', '#fccde5', '#d9d9d9', '#bc80bd']
+
+     color_map = {cat: colors[i] for i, cat in enumerate(categories)}
+
+     det_desc = ['Admit, discharge, follow-up, referral',
+                 'Ordering test, consulting colleague, seeking external information',
+                 'Diagnostic conclusion, evaluation of health state, etiological inference, prognostic judgment',
+                 'Quantitative or qualitative',
+                 'Start, stop, alter, maintain, refrain',
+                 'Start, stop, alter, maintain, refrain',
+                 'Positive, negative, ambiguous test results',
+                 'Transfer responsibility, wait and see, change subject',
+                 'Advice or precaution',
+                 'Sick leave, drug refund, insurance, disability']
+
+     desc = '### Zones (categories)\n'
+     desc += '| | |\n| --- | --- |\n'
+     for i, cat in enumerate(categories):
+         desc += f'| {unicode_symbols[i]} **{cat}** | {det_desc[i]}|\n'
+
+     # colors
+     # markdown labels
+     # legend and desc
+     # css font-size
+     css = '.category-legend {border:1px dashed black;}'\
+         '.text-sm {font-size: 1.5rem; line-height: 200%;}'\
+         '.gr-sample-textbox {width: 1000px; white-space: nowrap; overflow: hidden; text-overflow: ellipsis;}'\
+         '.text-limit label textarea {height: 150px !important; overflow: scroll; }'\
+         '.text-gray-500 {color: #111827; font-weight: 600; font-size: 1.25em; margin-top: 1.6em; margin-bottom: 0.6em;'\
+         'line-height: 1.6;}'\
+         '#sum-out {border: 2px solid #007bff; padding: 20px; border-radius: 10px; box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);}'
+     title = 'Clinical Decision Zoning'
+     with gr.Blocks(title=title, css=css) as demo:
+         gr.Markdown(f'# {title}')
+         with gr.Tab("Label a Clinical Note"):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("## Enter a Discharge Summary or Clinical Note")
+                     text_input = gr.Textbox(
+                         # value=examples[0],
+                         label="",
+                         placeholder="Enter text here...")
+                     text_btn = gr.Button('Run')
+                 with gr.Column():
+                     gr.Markdown("## Labeled Summary or Note")
+                     text_out = gr.Highlight(label="", combine_adjacent=True, show_legend=False, color_map=color_map)
+             gr.Examples(text_examples, inputs=text_input)
+         with gr.Tab("Summarize Patient History"):
+             with gr.Row():
+                 with gr.Column():
+                     sum_inputs = [gr.Text(label='Clinical Note 1', elem_classes='text-limit')]
+                     sum_inputs.extend([gr.Text(label='Clinical Note %d'%i, visible=False, elem_classes='text-limit')
+                                        for i in range(2, SUM_INPUTS + 1)])
+                     sum_btn = gr.Button('Run')
+                     with gr.Row():
+                         ex_add = gr.Button("+")
+                         ex_sub = gr.Button("-")
+                     upload = gr.File(label='Upload clinical notes', file_type='text', file_count='multiple')
+                     gr.Examples(sum_examples, inputs=upload,
+                                 fn=update_inputs, outputs=sum_inputs, run_on_click=True)
+                 with gr.Column():
+                     gr.Markdown("## Summarized Clinical Decision History")
+                     sum_out = gr.Markdown(elem_id='sum-out')
+         gr.Markdown(desc)
+
+         # Event bindings
+         text_input.submit(process, inputs=text_input, outputs=text_out)
+         text_btn.click(process, inputs=text_input, outputs=text_out)
+         upload.change(update_inputs, inputs=upload, outputs=sum_inputs)
+         ex_add.click(add_ex, inputs=sum_inputs, outputs=sum_inputs)
+         ex_sub.click(sub_ex, inputs=sum_inputs, outputs=sum_inputs)
+         sum_btn.click(process_sum, inputs=sum_inputs, outputs=sum_out)
+     # demo = gr.TabbedInterface([text_demo, sum_demo], ["Label a Clinical Note", "Summarize Patient History"])
+     demo.launch(share=False)
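
As a quick sanity check on the span logic above, the sketch below (not part of the commit) feeds indicators_to_spans a hand-made label sequence and a stand-in for the tokenizer's token_to_chars; fake_t2c, CharSpan, and the label values are illustrative assumptions where token i is pretended to cover characters 4*i to 4*i + 3.

```python
# Sketch only: recover (category, char_start, char_end) spans from per-token label ids.
from collections import namedtuple
from demo import indicators_to_spans

CharSpan = namedtuple('CharSpan', ['start', 'end'])

def fake_t2c(i):
    # stand-in for encoding.token_to_chars: token i covers characters [4*i, 4*i + 3)
    return CharSpan(4 * i, 4 * i + 3)

# 18 = OTHERS_ID; 8/9 = begin/inside of category 4; 2/3 = begin/inside of category 1
labels = [18, 8, 9, 9, 18, 18, 2, 3, 18]
print(indicators_to_spans(labels, fake_t2c))
# e.g. {(4, 4, 15), (1, 24, 31)} -> (category id, character start, character end)
```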
demo_assets.py ADDED
@@ -0,0 +1,20 @@
+ sum_examples = [
+     [['examples/note%d.txt'%i for i in range(1,n)]]
+     for n in range(5, 1, -1)
+ ]
+
+ text_examples = [
+     ["a 72 year old female with chronic indwelling foley. GNR identified in her blood and there is concern for possible resistant pseudomonas. vanco discontinued and ceftriaxone replaced with zosyn. "],
+     ["This is a 73 year old man with CMML with recent admission to OSH for emergent splenectomy after splenic rupture who is admitted for hypoxemia and worsening bilateral ground glass opacities. He was treated aggressively on the floor with antibiotics and other etiologies (PE, MI, etc) were appropriately addressed. He was fluid resusitated and continued on his CMML regimen. Despite this, the patient became progressively hypotensive and was transferred to the ICU for further care.\n"
+      "In the ICU the patient continued to deteriorate and developed progressive hypotension and acidosis despite aggressive fluid repletion, pressor support, and bicarbonate drip. He received >8L NS, 8amps bicarb, pressor support w/ levophed and vasopressin, and maximum ventilatory support. Despite these measures, his lactate continued to trend upwards and he became progressively more hypotensive on the PEEP settings required to adequately oxygenate him. Furthermore, the patient developed tumor lysis syndrome in the setting of his chemotherapy and became anuric producing only 40cc of urine over 8hr. Renal service was called emergently to consider dialysis but the family elected to change his code status to DNR/DNI and focus care on comfort as a priority, after discussion w/ his oncologist Dr [**First Name (STitle) 1557**] and to defer more aggressive therapy."],
+     ["48 year old male with complicated past medical history, multiple problems notably including ESRD s/p renal transplant complicated by collapsing FSGS, recent MRSA line sepsis, here with fevers and hypotension at dialysis, code sepsis."
+      "He met sepsis criteria with fever, tachycardia and likely source of infection at site of tunneled dialysis catheter. Also had leukocytosis with L shift. CXR clear, urine not produced for sample. No central line placed [**3-5**] lack of access. Treated with 2 doses linezolid PO; d/w renal team - preferred vanco use, patient switched to vanco by level and d/c on vanco at HD. Underwent stim test; failed, started on hydrocort at stress dose levels (50 q6), d/w renal, felt uneccessary, patient started on prednisone taper back to home dose of 5 mg PO qd. Held HTN meds in setting of sepsis. Received dose of vanco prior to d/c."
+      "Dialysis Catheter - noted morning after admission to be clotted; question whether this was related to blood draw. Instilled tPA in catheter overnight; were able to use cath in AM for HD. "
+      "ESRD - Started on prograf; monitored levels, d/c on home dose. As per pharm, must continue to monitor levels in context of using itraconazole. Continued patient on bactrim for prophylaxis given tacrolimus use. To go to dialysis. 7 point HCT drop noted during admission; thought elevated HCT hemoconcentration. Hemolysis labs neg, no stool to guaiac. "
+      "PTT elevation - noted on admission, resolved in ICU. DIC labs negative. PT/PTT elevation at discharge c/w warfarin/SC heparin use."
+      "Hypertension: History of HTN, on lopressor and diltiazem. "
+      "Pulmonary Aspergillus: Stable. On itraconazole and followed by pulmonary as an outpatient. Continued in house"
+      "Atrial fibrillation: He is normally rate controlled with metoprolol and anticoagulated with coumadin. NSR on EKG here, continued warfarin, held beta blocker . "
+      "..."
+      "Call your PCP or return to the ED for fevers/chills/shakes, chest pain, shortness of breath, pain at the site of your dialysis catheter, nausea, vomiting, or swelling in your legs/feet. "]
+ ]
electra-base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:64dc780def96ec19006340e180a60531dacd8db9e0e4e206ba57c720e79775d3
+ size 435711089
examples/note1.txt ADDED
@@ -0,0 +1,17 @@
+ Date: 2/12/2024
+ History: the patient is a 45 yo F here for nervousness. A few weeks ago she noticed that she was feeling more nervous than usual and that it has been worsening. It is exacerbated by family and work. She feels especially nervous on Sunday night and Monday morning when she is preparing for the week. She is unable to fall asleep and doesn’t want to eat anything, though she does make herself eat. Nothing helps her nervousness. She otherwise denies significant changes in appetite, weight loss, or overall wellbeing. She denies fevers, chills, nausea, constipation, diarrhea, skin changes, racing heart, shortness of breath, dizziness, headaches or rashes.
+ ROS: otherwise negative
+ PMH: None; PSH: None
+ Meds: Tylenol for occasional HA
+ FHX: Father had an MI, died at 65yo
+ Allergies: NKDA
+ SH: Lives at home with husband, mother, and youngest son. Is an English literature professor at a local college.
+ Has 2 drinks/mo, no tobacco or drug use.
+ Physical Examination:
+ VS: Blood Pressure: 130/85 mm Hg
+ Heart Rate: 96/min
+ Gen: No acute distress, conversational, thin
+ Neck: No thyromegaly, no lymphadenopathy
+ Heart: RRR, no murmurs, rubs or gallops. Radial pulses +2 bilaterally
+ Lungs: Clear to auscultation bilaterally, no wheezes
+ Psych: Well-groomed. Non-pressured speech, linear thought process.
examples/note2.txt ADDED
@@ -0,0 +1,17 @@
+ Date: 3/18/2024
+ History: the patient, a 45-year-old female, returns for a follow-up regarding her nervousness. She reports a slight improvement in her symptoms with reduced intensity of nervousness on Sunday nights and Monday mornings. However, she still experiences difficulty sleeping and occasional lack of appetite. She has tried meditation and deep breathing exercises, which have provided minimal relief. No new symptoms have emerged since the last visit. She continues to deny fevers, chills, nausea, or other systemic symptoms.
+ ROS: Negative except as noted.
+ PMH: No changes.
+ PSH: None.
+ Meds: Tylenol for occasional headaches. Started on a trial of low-dose sertraline since the last visit.
+ FHX: No changes.
+ Allergies: NKDA.
+ SH: No changes in social circumstances. Continues to work as an English literature professor.
+ Physical Examination:
+ VS: Blood Pressure: 128/82 mm Hg, Heart Rate: 92/min
+ Gen: Appears more relaxed than the previous visit.
+ Neck: No changes.
+ Heart: Unchanged.
+ Lungs: Clear to auscultation.
+ Psych: Appears slightly more at ease, maintains good eye contact, speech and thought process remain coherent.
+ Assessment/Plan: Improvement noted with sertraline. Will continue the current dose and re-evaluate in 3 months. Encouraged to continue non-pharmacological interventions like meditation and deep breathing exercises. Consider referral to therapy for additional support.
examples/note3.txt ADDED
@@ -0,0 +1,17 @@
+ Date: 6/2/2024
+ History: the patient, now 46, reports further improvement in her symptoms of nervousness. She has started seeing a therapist, which she finds helpful. Her sleep has improved, and she no longer experiences significant appetite loss. She has not developed any new symptoms and continues to deny systemic symptoms.
+ ROS: Negative except as noted.
+ PMH: No changes.
+ PSH: None.
+ Meds: Continues sertraline. No longer using Tylenol for headaches.
+ FHX: No changes.
+ Allergies: NKDA.
+ SH: Stable home and work life. Activities and responsibilities as an English literature professor are well-managed.
+ Physical Examination:
+ VS: Blood Pressure: 125/80 mm Hg, Heart Rate: 88/min
+ Gen: Appears comfortable and at ease.
+ Neck: No changes.
+ Heart: Unchanged.
+ Lungs: Clear to auscultation.
+ Psych: Noticeable improvement in mood and anxiety levels. Reports feeling more in control.
+ Assessment/Plan: Significant improvement with sertraline and therapy. Plan to continue current management and follow up in 6 months or as needed. Discuss potential for gradually reducing medication under supervision if improvement sustains.
examples/note4.txt ADDED
@@ -0,0 +1,17 @@
+ Date: 12/29/2024
+ History: the patient comes in for a scheduled follow-up. She feels much better and has managed to maintain her improvements. She expresses a desire to start tapering off sertraline under medical supervision. No new health concerns have been noted. She remains active at work and home.
+ ROS: Entirely negative.
+ PMH: No changes.
+ PSH: None.
+ Meds: Sertraline, with a plan to taper.
+ FHX: No changes.
+ Allergies: NKDA.
+ SH: Stable and positive home and work environment.
+ Physical Examination:
+ VS: Blood Pressure: 122/78 mm Hg, Heart Rate: 84/min
+ Gen: Looks healthy and content.
+ Neck: No changes.
+ Heart: Unchanged.
+ Lungs: Clear.
+ Psych: Maintained improvement in mental health. Ready for gradual medication reduction.
+ Assessment/Plan: Patient has shown sustained improvement and is interested in tapering off medication. Will initiate a slow tapering process of sertraline and monitor closely for any recurrence of symptoms. Continue therapy and supportive measures. Next follow-up scheduled in 3 months to assess progress.
model.py ADDED
@@ -0,0 +1,206 @@
+ import copy
+ import torch
+ from torch import nn
+ from transformers import AutoModel
+ from torch.optim import AdamW
+ from transformers import get_linear_schedule_with_warmup
+ # from torchcrf import CRF
+
+ class MyModel(nn.Module):
+     def __init__(self, args, backbone):
+         super().__init__()
+         self.args = args
+         self.backbone = backbone
+         self.cls_id = 0
+         hidden_dim = self.backbone.config.hidden_size
+         self.classifier = nn.Sequential(
+             nn.Dropout(0.1),
+             nn.Linear(hidden_dim, args.num_labels)
+         )
+
+         if args.distil_att:
+             self.distil_att = nn.Parameter(torch.ones(self.backbone.config.hidden_size))
+
+     def forward(self, x, mask):
+         x = x.to(self.backbone.device)
+         mask = mask.to(self.backbone.device)
+         out = self.backbone(x, attention_mask=mask, output_attentions=True)
+         return out, self.classifier(out.last_hidden_state)
+
+     def decisions(self, x, mask):
+         x = x.to(self.backbone.device)
+         mask = mask.to(self.backbone.device)
+         out = self.backbone(x, attention_mask=mask, output_attentions=False)
+         return out, self.classifier(out.last_hidden_state)
+
+     def phenos(self, x, mask):
+         x = x.to(self.backbone.device)
+         mask = mask.to(self.backbone.device)
+         out = self.backbone(x, attention_mask=mask, output_attentions=True)
+         return out, self.classifier(out.pooler_output)
+
+     def generate(self, x, mask, choice=None):
+         outs = []
+         if self.args.task == 'seq' or choice == 'seq':
+             for i, offset in enumerate(range(0, x.shape[1], self.args.max_len-1)):
+                 if i == 0:
+                     segment = x[:, offset:offset + self.args.max_len-1]
+                     segment_mask = mask[:, offset:offset + self.args.max_len-1]
+                 else:
+                     segment = torch.cat((torch.ones((x.shape[0], 1), dtype=int).to(x.device)
+                                          * self.cls_id,
+                                          x[:, offset:offset + self.args.max_len-1]), axis=1)
+                     segment_mask = torch.cat((torch.ones((mask.shape[0], 1)).to(mask.device),
+                                               mask[:, offset:offset + self.args.max_len-1]), axis=1)
+                 logits = self.phenos(segment, segment_mask)[1]
+                 outs.append(logits)
+
+             return torch.max(torch.stack(outs, 1), 1).values
+         elif self.args.task == 'token':
+             for i, offset in enumerate(range(0, x.shape[1], self.args.max_len)):
+                 segment = x[:, offset:offset + self.args.max_len]
+                 segment_mask = mask[:, offset:offset + self.args.max_len]
+                 h = self.decisions(segment, segment_mask)[0].last_hidden_state
+                 outs.append(h)
+             h = torch.cat(outs, 1)
+             return self.classifier(h)
+
+ class CNN(nn.Module):
+     def __init__(self, args):
+         super().__init__()
+         self.emb = nn.Embedding(args.vocab_size, args.emb_size)
+         self.model = nn.Sequential(
+             nn.Conv1d(args.emb_size, args.hidden_size, args.kernels[0],
+                       padding='same' if args.task == 'token' else 'valid'),
+             nn.ReLU(),
+             nn.MaxPool1d(1),
+             nn.Conv1d(args.hidden_size, args.hidden_size, args.kernels[1],
+                       padding='same' if args.task == 'token' else 'valid'),
+             nn.ReLU(),
+             nn.MaxPool1d(1),
+             nn.Conv1d(args.hidden_size, args.hidden_size, args.kernels[2],
+                       padding='same' if args.task == 'token' else 'valid'),
+             nn.ReLU(),
+             nn.MaxPool1d(1),
+         )
+         if args.task == 'seq':
+             out_shape = 512 - args.kernels[0] - args.kernels[1] - args.kernels[2] + 3
+         elif args.task == 'token':
+             out_shape = 1
+         self.classifier = nn.Linear(args.hidden_size*out_shape, args.num_labels)
+         self.dropout = nn.Dropout()
+         self.args = args
+         self.device = None
+
+     def forward(self, x, _):
+         x = x.to(self.device)
+         bs = x.shape[0]
+         x = self.emb(x)
+         x = x.transpose(1,2)
+         x = self.model(x)
+         x = self.dropout(x)
+         if self.args.task == 'token':
+             x = x.transpose(1,2)
+             h = self.classifier(x)
+             return x, h
+         elif self.args.task == 'seq':
+             x = x.reshape(bs, -1)
+             x = self.classifier(x)
+             return x
+
+     def generate(self, x, _):
+         outs = []
+         for i, offset in enumerate(range(0, x.shape[1], self.args.max_len)):
+             segment = x[:, offset:offset + self.args.max_len]
+             n = segment.shape[1]
+             if n != self.args.max_len:
+                 segment = torch.nn.functional.pad(segment, (0, self.args.max_len - n))
+             if self.args.task == 'seq':
+                 logits = self(segment, None)
+                 outs.append(logits)
+             elif self.args.task == 'token':
+                 h = self(segment, None)[0]
+                 h = h[:,:n]
+                 outs.append(h)
+         if self.args.task == 'seq':
+             return torch.max(torch.stack(outs, 1), 1).values
+         elif self.args.task == 'token':
+             h = torch.cat(outs, 1)
+             return self.classifier(h)
+
+ class LSTM(nn.Module):
+     def __init__(self, args):
+         super().__init__()
+         self.emb = nn.Embedding(args.vocab_size, args.emb_size)
+         self.model = nn.LSTM(args.emb_size, args.hidden_size, num_layers=args.num_layers,
+                              batch_first=True, bidirectional=True)
+         dim = 2*args.num_layers*args.hidden_size if args.task == 'seq' else 2*args.hidden_size
+         self.classifier = nn.Linear(dim, args.num_labels)
+         self.dropout = nn.Dropout()
+         self.args = args
+         self.device = None
+
+     def forward(self, x, _):
+         x = x.to(self.device)
+         x = self.emb(x)
+         o, (x, _) = self.model(x)
+         o_out = self.classifier(o) if self.args.task == 'token' else None
+         if self.args.task == 'seq':
+             x = torch.cat([h for h in x], 1)
+             x = self.dropout(x)
+             x = self.classifier(x)
+         return (x, o), o_out
+
+     def generate(self, x, _):
+         outs = []
+         for i, offset in enumerate(range(0, x.shape[1], self.args.max_len)):
+             segment = x[:, offset:offset + self.args.max_len]
+             if self.args.task == 'seq':
+                 logits = self(segment, None)[0][0]
+                 outs.append(logits)
+             elif self.args.task == 'token':
+                 h = self(segment, None)[0][1]
+                 outs.append(h)
+         if self.args.task == 'seq':
+             return torch.max(torch.stack(outs, 1), 1).values
+         elif self.args.task == 'token':
+             h = torch.cat(outs, 1)
+             return self.classifier(h)
+
+ def load_model(args, device):
+     if args.model == 'lstm':
+         model = LSTM(args).to(device)
+         model.device = device
+     elif args.model == 'cnn':
+         model = CNN(args).to(device)
+         model.device = device
+     else:
+         model = MyModel(args, AutoModel.from_pretrained(args.model_name)).to(device)
+     if args.ckpt:
+         model.load_state_dict(torch.load(args.ckpt, map_location=device), strict=False)
+     if args.distil:
+         args2 = copy.deepcopy(args)
+         args2.task = 'token'
+         # args2.num_labels = args.num_decs
+         args2.num_labels = args.num_umls_tags
+         model_B = MyModel(args2, AutoModel.from_pretrained(args.model_name)).to(device)
+         model_B.load_state_dict(torch.load(args.distil_ckpt, map_location=device), strict=False)
+         for p in model_B.parameters():
+             p.requires_grad = False
+     else:
+         model_B = None
+     if args.label_encoding == 'multiclass':
+         if args.use_crf:
+             crit = CRF(args.num_labels, batch_first=True).to(device)
+         else:
+             crit = nn.CrossEntropyLoss(reduction='none')
+     else:
+         crit = nn.BCEWithLogitsLoss(
+             pos_weight=torch.ones(args.num_labels).to(device)*args.pos_weight,
+             reduction='none'
+         )
+     optimizer = AdamW(model.parameters(), lr=args.lr)
+     lr_scheduler = get_linear_schedule_with_warmup(optimizer,
+         int(0.1*args.total_steps), args.total_steps)
+
+     return model, crit, optimizer, lr_scheduler, model_B
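
All three model classes share the same generate() idea: slice a long note into max_len windows, encode each window, and stitch the token-level outputs back together before the final classifier. The sketch below (not part of the commit) shows the shape bookkeeping with the lightweight CNN variant so it runs without downloading a pretrained backbone; every value in the Namespace is an illustrative assumption.

```python
# Sketch only: sliding-window inference over a sequence longer than max_len.
import torch
from argparse import Namespace
from model import CNN

args = Namespace(task='token', vocab_size=1000, emb_size=32, hidden_size=16,
                 kernels=[1, 2, 3], num_labels=19, max_len=512)
model = CNN(args).eval()
model.device = torch.device('cpu')

x = torch.randint(0, args.vocab_size, (1, 1300))   # a dummy "note" of 1300 token ids
with torch.no_grad():
    logits = model.generate(x, None)               # windows of 512, 512, and 276 tokens
print(logits.shape)                                # torch.Size([1, 1300, 19])
```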
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ argparse
+ numpy
+ pandas
+ torch==1.13.1
+ transformers==4.38.1