Spaces:

uragankatrrin
/

MHN-React

Runtime error

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+File contains functions that help prepare and download USPTO-related datasets
+"""
+import os
+import gzip
+import pickle
+import requests
+import subprocess
+import pandas as pd
+import numpy as np
+from scipy import sparse
+import json
def download_temprel_repo(save_path='data/temprel-fortunato', chunk_size=128):
    """Download the template-relevance master branch as a zip archive.

    Args:
        save_path: file the archive bytes are written to. Missing parent
            directories are created.
        chunk_size: number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously the error page would have been saved as the archive).
    """
    url = "https://gitlab.com/mefortunato/template-relevance/-/archive/master/template-relevance-master.zip"
    parent = os.path.dirname(save_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    # the context manager releases the connection even if writing fails
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)
def unzip(path):
    """Extract a zip archive into a directory next to it.

    ``<name>.zip`` is extracted into ``<name>``. Only a trailing ``.zip`` is
    stripped, so directory names that merely contain ``.zip`` are left alone.
    A path without the suffix is extracted into ``<path>_unzipped`` because
    extracting onto the archive file itself cannot work.

    Args:
        path: path of the zip archive.
    """
    import zipfile
    suffix = '.zip'
    if path.endswith(suffix):
        target = path[:-len(suffix)]
    else:
        target = path + '_unzipped'
    with zipfile.ZipFile(path, 'r') as zip_ref:
        zip_ref.extractall(target)
def download_file(url, output_path=None):
    """Stream the resource at ``url`` into a local file.

    Adapted from the fortunato template-relevance code
    (cf. ``temprel.data.download``), slightly altered.

    Args:
        url: address of the resource to fetch.
        output_path: destination file; when falsy, the last path segment of
            ``url`` is used.

    Raises:
        requests.HTTPError: if the response carries an error status.
    """
    destination = output_path if output_path else url.split('/')[-1]
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(destination, 'wb') as out_file:
            for block in response.iter_content(chunk_size=8192):
                if not block:
                    # skip empty chunks
                    continue
                out_file.write(block)
def get_uspto_480k():
    """Download the rexgen_direct train/valid/test reaction files and merge them.

    The archives are downloaded and unpacked inside ``data/raw`` (created if
    missing). The combined frame is written to
    ``data/raw/uspto_lg_reactions.json.gz``.

    The working directory is never changed, so a failing download or
    extraction can no longer leave the process stranded in ``data/raw``.

    Returns:
        pandas.DataFrame with the columns ``index`` (row position within its
        split, produced by ``reset_index``), ``reaction_smiles`` and ``split``.

    Raises:
        subprocess.CalledProcessError: if ``tar`` fails to unpack an archive.
    """
    raw_dir = os.path.join('data', 'raw')
    os.makedirs(raw_dir, exist_ok=True)
    base_url = 'https://github.com/connorcoley/rexgen_direct/raw/master/rexgen_direct/data/'
    frames = []
    for split in ('train', 'valid', 'test'):
        archive = f'{split}.txt.tar.gz'
        download_file(base_url + archive, os.path.join(raw_dir, archive))
        # cwd= makes tar unpack into data/raw without a global chdir
        subprocess.run(['tar', 'zxf', archive], cwd=raw_dir, check=True)
        with open(os.path.join(raw_dir, f'{split}.txt')) as f:
            rows = [
                {'reaction_smiles': line.strip(), 'split': split}
                for line in f
            ]
        frames.append(pd.DataFrame(rows))
    df = pd.concat(frames).reset_index()
    df.to_json(os.path.join(raw_dir, 'uspto_lg_reactions.json.gz'), compression='gzip')
    return df
def get_uspto_50k():
    """Fetch the USPTO-50k reactions from the supporting information of
    Nadine Schneider; Daniel M. Lowe; Roger A. Sayle; Gregory A. Landrum.
    J. Chem. Inf. Model. 2015, 55, 39-53.

    The archive is downloaded and unpacked inside ``data/raw`` (created if
    missing). The result is written to ``data/raw/uspto_sm_reactions.json.gz``.
    The working directory is never changed.

    Returns:
        pandas.DataFrame with the columns ``reaction_smiles``,
        ``reaction_reference`` and ``reaction_class``.

    Raises:
        subprocess.CalledProcessError: if the download via ``wget`` fails.
    """
    raw_dir = os.path.join('data', 'raw')
    os.makedirs(raw_dir, exist_ok=True)
    zip_name = 'ci5006614_si_002.zip'
    # wget never overwrites an existing file (it saves a ".1" copy instead),
    # so a second download would only leave an unused duplicate behind
    if not os.path.exists(os.path.join(raw_dir, zip_name)):
        subprocess.run(
            ['wget', 'https://pubs.acs.org/doi/suppl/10.1021/ci5006614/suppl_file/ci5006614_si_002.zip'],
            cwd=raw_dir, check=True,
        )
    # unzip's exit status is not checked because it is also non-zero for mere
    # warnings; a missing file surfaces below as FileNotFoundError
    subprocess.run(['unzip', '-o', zip_name], cwd=raw_dir)
    data = []
    pkl_path = os.path.join(
        raw_dir, 'ChemReactionClassification/data/training_test_set_patent_data.pkl.gz')
    # NOTE(security): unpickling executes code from the downloaded archive;
    # only use this with the trusted publisher source above.
    with gzip.open(pkl_path) as f:
        # the file is a sequence of pickled records terminated by EOF
        while True:
            try:
                data.append(pickle.load(f))
            except EOFError:
                break
    df = pd.DataFrame()
    df['reaction_smiles'] = [d[0] for d in data]
    df['reaction_reference'] = [d[1] for d in data]
    df['reaction_class'] = [d[2] for d in data]
    df.to_json(os.path.join(raw_dir, 'uspto_sm_reactions.json.gz'), compression='gzip')
    return df
def get_uspto_golden():
    """Get the USPTO golden dataset and convert it to a SMILES dataframe.

    Source:
    Lin, Arkadii; Dyubankova, Natalia; Madzhidov, Timur; Nugmanov, Ramil;
    Rakhimbekova, Assima; Ibragimova, Zarina; Akhmetshin, Tagir; Gimadiev,
    Timur; Suleymanov, Rail; Verhoeven, Jonas; Wegner, Jörg Kurt;
    Ceulemans, Hugo; Varnek, Alexandre (2020):
    Atom-to-Atom Mapping: A Benchmarking Study of Popular Mapping Algorithms and Consensus Strategies.
    ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.13012679.v1

    A previously converted ``data/raw/uspto_golden.json.gz`` is loaded
    directly. Otherwise the RDF file is downloaded into ``data/raw``,
    converted and cached there. The working directory is never changed.

    Returns:
        pandas.DataFrame with the columns ``reaction_smiles`` and
        ``reaction_reference`` (the ``Reaction_ID`` of the RDF record).

    Raises:
        subprocess.CalledProcessError: if the download via ``wget`` fails.
    """
    raw_dir = os.path.join('data', 'raw')
    out_path = os.path.join(raw_dir, 'uspto_golden.json.gz')
    if os.path.exists('data/raw/uspto_golden.json.gz'):
        print('loading precomputed')
        return pd.read_json('data/raw/uspto_golden.json.gz', compression='gzip')
    os.makedirs(raw_dir, exist_ok=True)
    zip_name = 'golden_dataset.zip'
    # wget never overwrites an existing file, so skip a redundant download
    if not os.path.exists(os.path.join(raw_dir, zip_name)):
        subprocess.run(
            ['wget', 'https://github.com/Laboratoire-de-Chemoinformatique/Reaction_Data_Cleaning/raw/master/data/golden_dataset.zip'],
            cwd=raw_dir, check=True,
        )
    # yields golden_dataset.rdf; exit status unchecked (non-zero on warnings)
    subprocess.run(['unzip', '-o', zip_name], cwd=raw_dir)

    # heavy optional dependencies are imported only when a conversion is needed
    from CGRtools.files import RDFRead
    import CGRtools
    from rdkit.Chem import AllChem

    def cgr2rxnsmiles(cgr_rx):
        """Render a CGRtools reaction as 'reactants>>products' SMILES."""
        smiles_rx = '.'.join([AllChem.MolToSmiles(CGRtools.to_rdkit_molecule(m)) for m in cgr_rx.reactants])
        smiles_rx += '>>'+'.'.join([AllChem.MolToSmiles(CGRtools.to_rdkit_molecule(m)) for m in cgr_rx.products])
        return smiles_rx

    data = {}
    input_file = os.path.join(raw_dir, 'golden_dataset.rdf')
    do_basic_standardization = True
    print('reading and converting the rdf-file')
    with RDFRead(input_file) as f:
        while True:
            try:
                r = next(f)
            except StopIteration:
                break
            key = r.meta['Reaction_ID']
            if do_basic_standardization:
                r.thiele()
                r.standardize()
            data[key] = cgr2rxnsmiles(r)
    # the message now names the file that is actually written
    print('saving as a dataframe to data/raw/uspto_golden.json.gz')
    df = pd.DataFrame([data], index=['reaction_smiles']).T
    df['reaction_reference'] = df.index
    df.index = range(len(df))  # reindex
    df.to_json(out_path, compression='gzip')
    return df
def load_USPTO_fortu(path='data/processed', which='uspto_sm_', is_appl_matrix=False):
    """Load the fortunato-preprocessed inputs and labels for every split.

    Args:
        path: directory holding the preprocessed files.
        which: file-name prefix of the dataset, e.g. ``'uspto_sm_'``.
        is_appl_matrix: load the sparse ``.appl_matrix.npz`` labels instead
            of the ``.labels.classes.npy`` class labels.

    Returns:
        ``(X, y)``: two dicts keyed by ``'train'``, ``'valid'`` and ``'test'``.
        ``X[split]`` is a list with one list of '.'-separated SMILES parts per
        sample; ``y[split]`` holds the labels of that split.
    """
    X, y = {}, {}
    for split in ('train', 'valid', 'test'):
        prefix = f'{path}/{which}{split}'
        smiles = np.load(f'{prefix}.input.smiles.npy', allow_pickle=True)
        X[split] = [entry.split('.') for entry in smiles]
        if is_appl_matrix:
            labels = sparse.load_npz(f'{prefix}.appl_matrix.npz')
            max_label = labels.shape[1]
        else:
            labels = np.load(f'{prefix}.labels.classes.npy', allow_pickle=True)
            max_label = labels.max()
        y[split] = labels
        print(split, labels.shape[0], 'samples (', max_label, 'max label)')
    return X, y
+#TODO one should load in this file pd.read_json('uspto_R_retro.templates.uspto_R_.json.gz')
+# this only holds the templates.. the other holds everything
def load_templates_sm(path = 'data/processed/uspto_sm_templates.df.json.gz', get_complete_df=False):
    """Read the templates dataframe stored at ``path``.

    Args:
        path: JSON file readable by ``pandas.read_json``.
        get_complete_df: return the whole dataframe instead of the mapping.

    Returns:
        The dataframe itself when ``get_complete_df`` is set, otherwise a
        dict mapping each row's ``index`` value to its ``reaction_smarts``.
    """
    templates_df = pd.read_json(path)
    if get_complete_df:
        return templates_df
    return {
        record['index']: record.reaction_smarts
        for _, record in templates_df.iterrows()
    }
def load_templates_lg(path = 'data/processed/uspto_lg_templates.df.json.gz', get_complete_df=False):
    """Same as ``load_templates_sm`` but defaulting to the large (lg) templates file."""
    return load_templates_sm(
        path=path,
        get_complete_df=get_complete_df,
    )
def load_USPTO_sm():
    """Load the ``uspto_sm_`` splits with class labels via ``load_USPTO_fortu``."""
    dataset_prefix = 'uspto_sm_'
    return load_USPTO_fortu(which=dataset_prefix)
def load_USPTO_lg():
    """Load the ``uspto_lg_`` splits with class labels via ``load_USPTO_fortu``."""
    dataset_prefix = 'uspto_lg_'
    return load_USPTO_fortu(which=dataset_prefix)
def load_USPTO_sm_pretraining():
    """Load the ``uspto_sm_`` splits with the applicability matrix as labels."""
    dataset_prefix = 'uspto_sm_'
    return load_USPTO_fortu(which=dataset_prefix, is_appl_matrix=True)
def load_USPTO_lg_pretraining():
    """Load the ``uspto_lg_`` splits with the applicability matrix as labels."""
    dataset_prefix = 'uspto_lg_'
    return load_USPTO_fortu(which=dataset_prefix, is_appl_matrix=True)
def load_USPTO_df_sm():
    """Read the raw USPTO small (sm) reactions dataframe from ``data/raw``."""
    sm_reactions_path = 'data/raw/uspto_sm_reactions.json.gz'
    return pd.read_json(sm_reactions_path)
def load_USPTO_df_lg():
    """Read the raw USPTO large (lg) reactions dataframe from ``data/raw``.

    Reads ``uspto_lg_reactions.json.gz``, the file written by
    ``get_uspto_480k``. The previous version loaded the small (sm) file by
    mistake.
    """
    return pd.read_json('data/raw/uspto_lg_reactions.json.gz')
def load_USPTO_golden():
    """Load the ``uspto_golden_`` splits with class labels via ``load_USPTO_fortu``."""
    dataset_prefix = 'uspto_golden_'
    return load_USPTO_fortu(which=dataset_prefix, is_appl_matrix=False)
def load_USPTO(which = 'sm', is_appl_matrix=False):
    """Load the splits of dataset ``uspto_<which>_`` via ``load_USPTO_fortu``.

    Args:
        which: dataset tag inserted into the file prefix (e.g. ``'sm'``).
        is_appl_matrix: forwarded to ``load_USPTO_fortu``.
    """
    dataset_prefix = f'uspto_{which}_'
    return load_USPTO_fortu(which=dataset_prefix, is_appl_matrix=is_appl_matrix)
def load_templates(which = 'sm',fdir='data/processed', get_complete_df=False):
    """Load the templates of dataset ``uspto_<which>`` from directory ``fdir``.

    Args:
        which: dataset tag inserted into the file name (e.g. ``'sm'``).
        fdir: directory containing ``uspto_<which>_templates.df.json.gz``.
        get_complete_df: forwarded to ``load_templates_sm``.
    """
    templates_path = f'{fdir}/uspto_{which}_templates.df.json.gz'
    return load_templates_sm(path=templates_path, get_complete_df=get_complete_df)
def load_data(dataset, path):
    """Load all per-split arrays and the template mapping of ``dataset``.

    Args:
        dataset: file-name prefix of the dataset inside ``path``.
        path: directory containing the preprocessed files.

    Returns:
        Tuple ``(split2smiles, split2label, split2reactants, split2appl,
        split2prod_idx_reactants, label2template)``. The first four are dicts
        keyed by ``'train'``/``'valid'``/``'test'``.
        ``split2prod_idx_reactants`` only holds the splits whose optional
        pickle file exists and is ``None`` if none exists.
        ``label2template`` maps integer labels to templates.
    """
    split2smiles, split2label = {}, {}
    split2reactants, split2appl = {}, {}
    split2prod_idx_reactants = {}

    def in_dir(file_name):
        return os.path.join(path, file_name)

    for split in ['train', 'valid', 'test']:
        stem = f'{dataset}_{split}'
        split2label[split] = np.load(in_dir(f'{stem}.labels.classes.npy'), allow_pickle=True)
        split2smiles[split] = np.load(in_dir(f'{stem}.input.smiles.npy'), allow_pickle=True)
        # NOTE(review): the reactants file uses the fixed prefix 'uspto_R_'
        # rather than `dataset` -- confirm that this is intended.
        split2reactants[split] = np.load(
            in_dir(f'uspto_R_{split}.reactants.canonical.npy'), allow_pickle=True)
        split2appl[split] = np.load(in_dir(f'{stem}.applicability.npy'))
        pir_fn = in_dir(f'{stem}.prod.idx.reactants.p')
        if not os.path.isfile(pir_fn):
            continue  # optional file
        with open(pir_fn, 'rb') as handle:
            split2prod_idx_reactants[split] = pickle.load(handle)

    if len(split2prod_idx_reactants) == 0:
        split2prod_idx_reactants = None

    with open(in_dir(f'{dataset}_templates.json'), 'r') as handle:
        raw_templates = json.load(handle)
    # JSON object keys are strings; labels are used as ints
    label2template = {int(label): template for label, template in raw_templates.items()}

    return split2smiles, split2label, split2reactants, split2appl, split2prod_idx_reactants, label2template
+def load_dataset_from_csv(csv_path='', split_col='split', input_col='prod_smiles', ssretroeval=False, reactants_col='reactants_can', ret_df=False, **kwargs):
+    """loads the dataset from a CSV file containing a split-column, and input-column which can be defined,
+    as well as a 'reaction_smarts' column containing the extracted template, a 'label' column (the index of the template)
+    :returns
+    """
+    print('loading X, y from csv')
+    df = pd.read_csv(csv_path)
+    X = {}
+    y = {}
+    for spli in set(df[split_col]):
+        #X[spli] = list(df[df[split_col]==spli]['prod_smiles'].apply(lambda k: [k]))
+        X[spli] = list(df[df[split_col]==spli][input_col].apply(lambda k: [k]))
+        y[spli] = (df[df[split_col]==spli]['label']).values
+        print(spli, len(X[spli]), 'samples')
+    # template to dict
+    tmp = df[['reaction_smarts','label']].drop_duplicates(subset=['reaction_smarts','label']).sort_values('label')
+    tmp.index= tmp.label
+    template_list = tmp['reaction_smarts'].to_dict()
+    print(len(template_list),'templates')
+    if ssretroeval:
+        # setup for ttest
+        test_reactants_can = list(df[df[split_col]=='test'][reactants_col])
+        only_in_test = set(y['test']) - set(y['train']).union(set(y['valid']))
+        print('obfuscating', len(only_in_test), 'templates because they are only in test')
+        for ii in only_in_test:
+                template_list[ii] = 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC.CCCCCCCCCCCCCCCCCCCCCCCCCCC.CCCCCCCCCCCCCCCCCCCCCC>>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC.CCCCCCCCCCCCCCCCCCCCC' #obfuscate them
+        if ret_df:
+            return X, y, template_list, test_reactants_can, df
+        return X, y, template_list, test_reactants_can
+    if ret_df:
+        return X, y, template_list, None, df
+    return X, y, template_list, None

mhnreact/inference.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+File contains inference-related functionality (currently only imports ModelConfig, MHN and torch)
+"""
+# Cell
+from .model import ModelConfig, MHN
+import torch

mhnreact/inspect.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+File contains functions that help inspect models: drawing reactions as SVG and listing and loading saved models
+"""
+from . import model
+import torch
+import os
+MODEL_PATH = 'data/model/'
def smarts2svg(smarts, useSmiles=True, highlightByReactant=True, save_to=''):
    """Draw a reaction given as SMARTS/SMILES to an SVG and display it.

    Adapted from https://www.kesci.com/mw/project/5c7685191ce0af002b556cc5

    Args:
        smarts: reaction string passed to ``AllChem.ReactionFromSmarts``.
        useSmiles: forwarded to ``ReactionFromSmarts``.
        highlightByReactant: forwarded to ``DrawReaction``.
        save_to: optional file path the SVG is additionally written to.

    Returns:
        The SVG markup (with the ``svg:`` namespace prefix removed), or
        ``None`` if drawing failed. Previously a drawing failure ended in an
        ``UnboundLocalError`` at the return statement.
    """
    # only the imports that are actually used; rdkit/IPython stay local so
    # that importing this module does not require them
    from rdkit.Chem import Draw, AllChem
    from IPython.display import SVG, display

    rxn = AllChem.ReactionFromSmarts(smarts, useSmiles=useSmiles)
    d = Draw.MolDraw2DSVG(900, 100)
    svg2 = None
    try:
        d.DrawReaction(rxn, highlightByReactant=highlightByReactant)
        d.FinishDrawing()
        svg2 = d.GetDrawingText().replace('svg:', '')
        svg3 = SVG(svg2)
        display(svg3)
        if save_to != '':
            with open(save_to, 'w') as f_handle:
                f_handle.write(svg3.data)
    except Exception:
        # best effort: report and hand back whatever was produced so far,
        # without swallowing KeyboardInterrupt/SystemExit like a bare except
        print('Error drawing')
    return svg2
def list_models(model_path=MODEL_PATH):
    """Return a dict ``position -> file name`` of the ``.pt`` files in ``model_path``.

    Positions follow the order in which ``os.listdir`` reports the files.
    """
    checkpoints = [name for name in os.listdir(model_path) if str(name)[-3:] == '.pt']
    return {position: name for position, name in enumerate(checkpoints)}
def load_clf(model_fn='', model_path=MODEL_PATH, device='cpu', model_type='mhn'):
    """Build a model from its saved config and load the weights in ``model_fn``.

    The config file name is derived from the last two '_'-separated parts of
    ``model_fn`` (without the ``.pt`` suffix) plus ``_config.json``.

    Args:
        model_fn: file name of the saved parameters inside ``model_path``.
        model_path: directory holding the parameter and config files.
        device: device written into the model config.
        model_type: one of ``'staticQK'``, ``'mhn'``, ``'segler'``, ``'fortunato'``.

    Returns:
        The model with the stored parameters loaded (non-strict).

    Raises:
        NotImplementedError: for an unknown ``model_type``.
    """
    import json
    config_fn = '_'.join(model_fn.split('_')[-2:]).split('.pt')[0]
    # read the config once and close the file handle deterministically
    with open(os.path.join(model_path, f"{config_fn}_config.json")) as config_file:
        conf_dict = json.load(config_file)

    # specify the config the saved model had
    conf = model.ModelConfig(**conf_dict)
    conf.device = device
    print(conf.__dict__)

    # class names are resolved lazily so only the requested class must exist
    class_names = {
        'staticQK': 'StaticQK',
        'mhn': 'MHN',
        'segler': 'SeglerBaseline',
        'fortunato': 'SeglerBaseline',
    }
    if model_type not in class_names:
        raise NotImplementedError(f'model_type {model_type} not found')
    clf = getattr(model, class_names[model_type])(conf)

    # load the model; parameters are always deserialized to CPU first and
    # copied into the already constructed model by load_state_dict
    PATH = os.path.join(model_path, model_fn)
    params = torch.load(PATH, map_location=torch.device('cpu'))
    clf.load_state_dict(params, strict=False)
    if 'templates+noise' in params:
        print('loading templates+noise')
        clf.templates = params['templates+noise']
    return clf

mhnreact/model.py ADDED Viewed

	@@ -0,0 +1,660 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+Model related functionality
+"""
+from .utils import top_k_accuracy
+from .plotutils import plot_loss, plot_topk, plot_nte
+from .molutils import convert_smiles_to_fp
+import os
+import numpy as np
+import torch
+import torch.nn as nn
+from collections import defaultdict
+from scipy import sparse
+import logging
+from tqdm import tqdm
+import wandb
+log = logging.getLogger(__name__)
class ChemRXNDataset(torch.utils.data.Dataset):
    """Torch Dataset for ChemRXN.

    Args:
        Xs: the inputs; a numpy array of fingerprints, or a sequence of
            SMILES entries when ``is_smiles`` is set.
        target: the target molecules in the same representation, or ``None``.
        ys: the labels; may be a ``scipy.sparse.csr_matrix``.
        is_smiles: convert inputs/targets to fingerprints on access.
        fp_size: fingerprint size used for the on-access conversion.
        fingerprint_type: fingerprint type used for the on-access conversion.

    Each item is the tuple ``(mol_fp, target, label)``; ``target`` is ``None``
    when the dataset was built without targets.
    """
    def __init__(self, Xs, target, ys, is_smiles=False, fp_size=2048, fingerprint_type='morgan'):
        self.is_smiles = is_smiles
        if is_smiles:
            self.Xs = Xs
            self.target = target
            self.fp_size = fp_size
            self.fingerprint_type = fingerprint_type
        else:
            self.Xs = Xs.astype(np.float32)
            # target is optional; None has no .astype
            self.target = None if target is None else target.astype(np.float32)
        self.ys = ys
        # public class path; scipy.sparse.csr is a deprecated private namespace
        self.ys_is_sparse = isinstance(self.ys, sparse.csr_matrix)

    def __getitem__(self, k):
        mol_fp = self.Xs[k]
        if self.is_smiles:
            mol_fp = convert_smiles_to_fp(mol_fp, fp_size=self.fp_size, which=self.fingerprint_type).astype(np.float32)

        target = None
        if self.target is not None:
            # explicit None check: the truth value of an array is ambiguous
            target = self.target[k]
            if self.is_smiles:
                target = convert_smiles_to_fp(target, fp_size=self.fp_size, which=self.fingerprint_type).astype(np.float32)

        label = self.ys[k]
        if self.ys_is_sparse:
            # densify the single selected row
            label = label.toarray()[0]
        return (mol_fp, target, label)

    def __len__(self):
        return len(self.Xs)
class ModelConfig(object):
    """Plain container for model hyperparameters.

    Known hyperparameters are taken from ``kwargs`` with the defaults listed
    below; every remaining keyword argument is stored as an attribute too.

    ``mol_encoder_layers``, ``temp_encoder_layers`` and ``encoder_af`` are now
    always set, because the encoders read them unconditionally and a default
    ``ModelConfig()`` was therefore unusable. ``hopf_n_layers`` stays optional
    on purpose: ``MHN`` uses its presence to decide whether stacked layers
    are requested.
    """
    def __init__(self, **kwargs):
        self.fingerprint_type = kwargs.pop("fingerprint_type", 'morgan')
        self.template_fp_type = kwargs.pop("template_fp_type", 'rdk')
        self.num_templates = kwargs.pop("num_templates", 401)
        self.fp_size = kwargs.pop("fp_size", 2048)
        self.fp_radius = kwargs.pop("fp_radius", 4)

        self.device = kwargs.pop("device", 'cuda' if torch.cuda.is_available() else 'cpu')
        self.batch_size = kwargs.pop("batch_size", 32)
        self.pooling_operation_state_embedding = kwargs.pop('pooling_operation_state_embedding', 'mean')
        self.pooling_operation_head = kwargs.pop('pooling_operation_head', 'max')

        self.dropout = kwargs.pop('dropout', 0.0)

        self.lr = kwargs.pop('lr', 1e-4)
        self.optimizer = kwargs.pop("optimizer", "Adam")

        self.activation_function = kwargs.pop('activation_function', 'ReLU')
        # set to True for debugging or additional warnings / information
        self.verbose = kwargs.pop("verbose", False)

        self.hopf_input_size = kwargs.pop('hopf_input_size', 2048)
        self.hopf_output_size = kwargs.pop("hopf_output_size", 768)
        self.hopf_num_heads = kwargs.pop("hopf_num_heads", 1)
        self.hopf_asso_dim = kwargs.pop("hopf_asso_dim", 768)
        self.hopf_association_activation = kwargs.pop("hopf_association_activation", None)
        self.hopf_beta = kwargs.pop("hopf_beta", 0.125)  # 1/(self.hopf_asso_dim**(1/2)), i.e. 1/sqrt(d_k)
        self.norm_input = kwargs.pop("norm_input", False)
        self.norm_asso = kwargs.pop("norm_asso", False)

        # optional: only present when explicitly requested (see class docstring)
        if 'hopf_n_layers' in kwargs:
            self.hopf_n_layers = kwargs.pop('hopf_n_layers')

        # encoder hyperparameters, always available with their defaults
        self.mol_encoder_layers = kwargs.pop('mol_encoder_layers', 1)
        self.temp_encoder_layers = kwargs.pop('temp_encoder_layers', 1)
        self.encoder_af = kwargs.pop('encoder_af', 'ReLU')

        # additional kwargs
        for key, value in kwargs.items():
            try:
                setattr(self, key, value)
            except AttributeError as err:
                log.error(f"Can't set {key} with value {value} for {self}")
                raise err
class Encoder(nn.Module):
    """Simple feed-forward network.

    Args:
        input_size: size of the input features.
        output_size: size of every layer's output.
        num_layers: number of linear layers (0 gives a projection-free encoder).
        dropout: dropout probability applied after every layer.
        af_name: name of an activation class in ``torch.nn``; ``None`` or
            ``'None'`` means no activation.
        norm_in: apply a non-affine LayerNorm to the input.
        norm_out: apply a non-affine LayerNorm to the output.

    The pass-through cases use ``nn.Identity`` instead of lambdas so that the
    module can be pickled. Parameter names (``W_<n>.weight`` / ``W_<n>.bias``)
    are unchanged.
    """
    def __init__(self, input_size: int = 2048, output_size: int = 1024,
                    num_layers: int = 1, dropout: float = 0.3, af_name: str ='None',
                    norm_in: bool = False, norm_out: bool = False):
        super().__init__()
        self.ws = []
        self.setup_af(af_name)
        self.norm_in = torch.nn.LayerNorm(input_size, elementwise_affine=False) if norm_in else nn.Identity()
        self.norm_out = torch.nn.LayerNorm(output_size, elementwise_affine=False) if norm_out else nn.Identity()
        self.setup_ff(input_size, output_size, num_layers)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x: torch.Tensor):
        x = self.norm_in(x)
        last = len(self.ws) - 1
        for i, w in enumerate(self.ws):
            if i == last:
                # the last layer gets no activation function
                x = self.dropout(w(x))
            else:
                x = self.dropout(self.af(w(x)))
        x = self.norm_out(x)
        return x

    def setup_ff(self, input_size:int, output_size:int, num_layers=1):
        """setup feed-forward NN with n-layers"""
        for n in range(0, num_layers):
            w = nn.Linear(input_size if n==0 else output_size, output_size)
            torch.nn.init.kaiming_normal_(w.weight, mode='fan_in', nonlinearity='linear') # equiv. to LeCun init
            # registered as attribute W_<n> so the state-dict keys stay stable
            setattr(self, f'W_{n}', w)
            self.ws.append(w)

    def setup_af(self, af_name : str):
        """set activation function"""
        if af_name is None or (af_name == 'None'):
            self.af = nn.Identity()
        else:
            try:
                self.af = getattr(nn, af_name)()
            except AttributeError as err:
                log.error(f"Can't find activation-function {af_name} in torch.nn")
                raise err
class MoleculeEncoder(Encoder):
    """
    Base class for molecule encoders: any class that maps SMILES to a vector
    (preferably in a differentiable way).

    NOTE(review): the constructor only stores the config and does not run the
    parent initialisation -- confirm this class is meant as a pure interface.
    """
    def __init__(self, config):
        # keep the configuration for subclasses
        self.config = config
class FPMolEncoder(Encoder):
    """
    Fingerprint-based molecular encoder.

    A feed-forward ``Encoder`` configured from ``config``:
    input size ``hopf_input_size * hopf_num_heads``, output size
    ``hopf_asso_dim * hopf_num_heads``, ``mol_encoder_layers`` layers,
    activation ``encoder_af``.
    """
    def __init__(self, config):
        super().__init__(input_size = config.hopf_input_size*config.hopf_num_heads,
                   output_size = config.hopf_asso_dim*config.hopf_num_heads,
                   num_layers = config.mol_encoder_layers,
                   dropout = config.dropout,
                   af_name = config.encoder_af,
                   norm_in = config.norm_input,
                   norm_out = config.norm_asso,
                  )
        self.config = config

    def forward_smiles(self, list_of_smiles: list):
        """Encode a list of SMILES entries: fingerprint them, then run the network."""
        fp_tensor = self.convert_smiles_to_tensor(list_of_smiles)
        return self.forward(fp_tensor)

    def convert_smiles_to_tensor(self, list_of_smiles):
        """Return the fingerprints of ``list_of_smiles`` as a float tensor on ``config.device``."""
        fps = convert_smiles_to_fp(list_of_smiles, fp_size=self.config.fp_size,
                                   which=self.config.fingerprint_type, radius=self.config.fp_radius)
        # np.float was removed in NumPy 1.24; np.float64 is the type it aliased
        fps_tensor = torch.from_numpy(fps.astype(np.float64)).to(dtype=torch.float).to(self.config.device)
        return fps_tensor
class TemplateEncoder(Encoder):
    """
    Template encoder: maps a reaction-template representation to a vector
    (preferably in a differentiable way).

    A feed-forward ``Encoder`` configured from ``config`` with
    ``temp_encoder_layers`` layers. With zero layers the templates are used
    as static keys, which requires ``hopf_asso_dim == fp_size``.
    """
    def __init__(self, config):
        n_heads = config.hopf_num_heads
        super().__init__(
            input_size=config.hopf_input_size * n_heads,
            output_size=config.hopf_asso_dim * n_heads,
            num_layers=config.temp_encoder_layers,
            dropout=config.dropout,
            af_name=config.encoder_af,
            norm_in=config.norm_input,
            norm_out=config.norm_asso,
        )
        self.config = config
        if config.temp_encoder_layers == 0:
            print('No Key-Projection = Static Key/Templates')
            assert self.config.hopf_asso_dim==self.config.fp_size
            self.wks = []
+class MHN(nn.Module):
+    """
+    MHN - modern Hopfield Network -- for Template relevance prediction
+    """
+    def __init__(self, config=None, layer2weight=0.05, use_template_encoder=True):
+        super().__init__()
+        if config:
+            self.config = config
+        else:
+            self.config = ModelConfig()
+        self.beta = self.config.hopf_beta
+        # hopf_num_heads
+        self.mol_encoder = FPMolEncoder(self.config)
+        if use_template_encoder:
+            self.template_encoder = TemplateEncoder(self.config)
+        self.W_v = None
+        self.layer2weight = layer2weight
+        # more MHN layers -- added recursively
+        if hasattr(self.config, 'hopf_n_layers'):
+            di = self.config.__dict__
+            di['hopf_n_layers'] -= 1
+            if di['hopf_n_layers']>0:
+                conf_wo_hopf_nlayers = ModelConfig(**di)
+                self.layer = MHN(conf_wo_hopf_nlayers)
+                if di['hopf_n_layers']!=0:
+                    self.W_v = nn.Linear(self.config.hopf_asso_dim, self.config.hopf_input_size)
+                    torch.nn.init.kaiming_normal_(self.W_v.weight, mode='fan_in', nonlinearity='linear') # eqiv to LeCun init
+        self.softmax = torch.nn.Softmax(dim=1)
+        self.lossfunction = nn.CrossEntropyLoss(reduction='none')#, weight=class_weights)
+        self.pretrain_lossfunction = nn.BCEWithLogitsLoss(reduction='none')#, weight=class_weights)
+        self.lr = self.config.lr
+        if self.config.hopf_association_activation is None or (self.config.hopf_association_activation.lower()=='none'):
+            self.af = lambda k: k
+        else:
+            self.af = getattr(nn, self.config.hopf_association_activation)()
+        self.pooling_operation_head = getattr(torch, self.config.pooling_operation_head)
+        self.X = None # templates projected to Hopfield Layer
+        self.optimizer = getattr(torch.optim, self.config.optimizer)(self.parameters(), lr=self.lr)
+        self.steps = 0
+        self.hist = defaultdict(list)
+        self.to(self.config.device)
+    def set_templates(self, template_list, which='rdk', fp_size=None, radius=2, learnable=False, njobs=1, only_templates_in_batch=False):
+        self.template_list = template_list.copy()
+        if fp_size is None:
+            fp_size = self.config.fp_size
+        if len(template_list)>=100000:
+            import math
+            print('batch-wise template_calculation')
+            bs = 30000
+            final_temp_emb = torch.zeros((len(template_list), fp_size)).float().to(self.config.device)
+            for b in range(math.ceil(len(template_list)//bs)+1):
+                self.template_list = template_list[bs*b:min(bs*(b+1), len(template_list))]
+                templ_emb = self.update_template_embedding(which=which, fp_size=fp_size, radius=radius, learnable=learnable, njobs=njobs, only_templates_in_batch=only_templates_in_batch)
+                final_temp_emb[bs*b:min(bs*(b+1), len(template_list))] = torch.from_numpy(templ_emb)
+            self.templates = final_temp_emb
+        else:
+            self.update_template_embedding(which=which, fp_size=fp_size, radius=radius, learnable=learnable, njobs=njobs, only_templates_in_batch=only_templates_in_batch)
+        self.set_templates_recursively()
+    def set_templates_recursively(self):
+        if 'hopf_n_layers' in self.config.__dict__.keys():
+            if self.config.hopf_n_layers >0:
+                self.layer.templates = self.templates
+                self.layer.set_templates_recursively()
+    def update_template_embedding(self,fp_size=2048, radius=4, which='rdk', learnable=False, njobs=1, only_templates_in_batch=False):
+        print('updating template-embedding; (just computing the template-fingerprint and using that)')
+        bs = self.config.batch_size
+        split_template_list = [str(t).split('>')[0].split('.') for t in self.template_list]
+        templates_np = convert_smiles_to_fp(split_template_list, is_smarts=True, fp_size=fp_size, radius=radius, which=which, njobs=njobs)
+        split_template_list = [str(t).split('>')[-1].split('.') for t in self.template_list]
+        reactants_np = convert_smiles_to_fp(split_template_list, is_smarts=True, fp_size=fp_size, radius=radius, which=which, njobs=njobs)
+        template_representation = templates_np-(reactants_np*0.5)
+        if learnable:
+            self.templates = torch.nn.Parameter(torch.from_numpy(template_representation).float(), requires_grad=True).to(self.config.device)
+            self.register_parameter(name='templates', param=self.templates)
+        else:
+            if only_templates_in_batch:
+                self.templates_np = template_representation
+            else:
+                self.templates = torch.from_numpy(template_representation).float().to(self.config.device)
+        return template_representation
+    def np_fp_to_tensor(self, np_fp):
+        return torch.from_numpy(np_fp.astype(np.float64)).to(self.config.device).float()
+    def masked_loss_fun(self, loss_fun, h_out, ys_batch):
+        if loss_fun == self.BCEWithLogitsLoss:
+            mask = (ys_batch != -1).float()
+            ys_batch = ys_batch.float()
+        else:
+            mask = (ys_batch.long() != -1).long()
+        mask_sum = int(mask.sum().cpu().numpy())
+        if mask_sum == 0:
+            return 0
+        ys_batch = ys_batch * mask
+        loss = (loss_fun(h_out, ys_batch * mask) * mask.float()).sum() / mask_sum  # only mean from non -1
+        return loss
+    def compute_losses(self, out, ys_batch, head_loss_weight=None):
+        if len(ys_batch.shape)==2:
+            if ys_batch.shape[1]==self.config.num_templates: # it is in pretraining_mode
+                loss = self.pretrain_lossfunction(out, ys_batch.float()).mean()
+            else:
+                # legacy from policyNN
+                loss = self.lossfunction(out, ys_batch[:, 2]).mean()  # WARNING: HEAD4 Reaction Template is ys[:,2]
+        else:
+            loss = self.lossfunction(out, ys_batch).mean()
+        return loss
+    def forward_smiles(self, list_of_smiles, templates=None):
+        state_tensor = self.mol_encoder.convert_smiles_to_tensor(list_of_smiles)
+        return self.forward(state_tensor, templates=templates)
+    def forward(self, m, templates=None):
+        """
+        Score every template for every molecule of the batch.
+        m: molecule in the form batch x fingerprint
+        templates: None or newly given templates if not instantiated
+        returns logits ranking the templates for each molecule
+        raises Exception if templates is None and neither self.X nor self.templates is set
+        side effect: stores the softmax-weighted template embedding in self.xinew
+        """
+        #states_emb = self.fcfe(state_fp)
+        bs = m.shape[0] #batch_size
+        #templates = self.temp_emb(torch.arange(0,2000).long())
+        if (templates is None) and (self.X is None) and (self.templates is None):
+            raise Exception('Either pass in templates, or init templates by runnting clf.set_templates')
+        n_temp = len(templates) if templates is not None else len(self.templates)
+        # NOTE(review): the else-branch below is only reached when self.X is None
+        # (a non-None self.X always takes the first branch), so X would be None there.
+        # The last condition may have been meant as `self.X is None` -- confirm.
+        if self.training or (templates is None) or (self.X is not None):
+            templates = templates if templates is not None else self.templates
+            X = self.template_encoder(templates)
+        else:
+            X = self.X # precomputed from last forward run
+        Xi = self.mol_encoder(m)
+        Xi = Xi.view(bs, self.config.hopf_num_heads, self.config.hopf_asso_dim) # [bs, H, A]
+        X = X.view(1, n_temp, self.config.hopf_asso_dim, self.config.hopf_num_heads) #[1, T, A, H]
+        # NOTE(review): this contracts Xi dim 1 (size H) with X dim 0 (size 1), which looks like
+        # it only works for hopf_num_heads == 1 -- confirm before using several heads
+        XXi = torch.tensordot(Xi, X, dims=[(2,1), (2,0)]) # AxA -> [bs, T, H]
+        # pooling over heads
+        if self.config.hopf_num_heads<=1:
+            #QKt_pooled = QKt
+            XXi = XXi[:,:,0] #torch.squeeze(QKt, dim=2)
+        else:
+            XXi = self.pooling_operation_head(XXi, dim=2) # default is max pooling over H [bs, T]
+            if (self.config.pooling_operation_head =='max') or (self.config.pooling_operation_head =='min'):
+                XXi = XXi[0] #max and min also return the indices =S
+        # scale the pooled similarities by beta; these are the returned logits
+        out = self.beta*XXi # [bs, T, H] # softmax over dim=1 #pooling_operation_head
+        # retrieved pattern: softmax weights times the template embeddings
+        self.xinew = self.softmax(out)@X.view(n_temp, self.config.hopf_asso_dim) # [bs,T]@[T,emb] -> [bs,emb]
+        # only taken when self.W_v is truthy, i.e. a further layer is stacked on top
+        if self.W_v:
+            # call layers recursive
+            hopfout = self.W_v(self.xinew) # [bs,emb]@[emb,hopf_inp]  --> [bs, hopf_inp]
+            # TODO check if using x_pooled or if not going through mol_encoder again
+            hopfout = hopfout + m # skip-connection
+            # give it to the next layer
+            out2 = self.layer.forward(hopfout) #templates=self.W_v(self.K)
+            # blend the logits of this layer and of the next one with self.layer2weight
+            out = out*(1-self.layer2weight)+out2*self.layer2weight
+        return out
+    def train_from_np(self, Xs, targets, ys, is_smiles=False, epochs=2, lr=0.001, bs=32,
+                      permute_batches=False, shuffle=True, optimizer=None,
+                      use_dataloader=True, verbose=False,
+                      wandb=None, scheduler=None, only_templates_in_batch=False):
+        """
+        Xs in the form sample x states
+        targets
+        ys in the form sample x [y_h1, y_h2, y_h3, y_h4]
+        """
+        self.train()
+        if optimizer is None:
+            try:
+                self.optimizer = getattr(torch.optim, self.config.optimizer)(self.parameters(), lr=self.lr if lr is None else lr)
+            except AttributeError as err:
+                log.error(f"Can't find optimizer {config.optimizer} in torch.optim")
+                raise err
+            optimizer = self.optimizer
+        dataset = ChemRXNDataset(Xs, targets, ys, is_smiles=is_smiles,
+                                 fp_size=self.config.fp_size, fingerprint_type=self.config.fingerprint_type)
+        dataloader = torch.utils.data.DataLoader(dataset, batch_size=bs, shuffle=shuffle, sampler=None,
+                   batch_sampler=None, num_workers=0, collate_fn=None,
+                   pin_memory=False, drop_last=False, timeout=0,
+                   worker_init_fn=None)
+        for epoch in range(epochs):  # loop over the dataset multiple times
+            running_loss = 0.0
+            running_loss_dict = defaultdict(int)
+            batch_order = range(0, len(Xs), bs)
+            if permute_batches:
+                batch_order = np.random.permutation(batch_order)
+            for step, s in tqdm(enumerate(dataloader),mininterval=2):
+                batch = [b.to(self.config.device, non_blocking=True) for b in s]
+                Xs_batch, target_batch, ys_batch = batch
+                # zero the parameter gradients
+                optimizer.zero_grad()
+                # forward + backward + optimize
+                out = self.forward(Xs_batch)
+                total_loss = self.compute_losses(out, ys_batch)
+                loss_dict = {'CE_loss': total_loss}
+                total_loss.backward()
+                optimizer.step()
+                if scheduler:
+                    scheduler.step()
+                self.steps += 1
+                # print statistics
+                for k in loss_dict:
+                    running_loss_dict[k] += loss_dict[k].item()
+                try:
+                    running_loss += total_loss.item()
+                except:
+                    running_loss += 0
+                rs = min(100,len(Xs)//bs) # reporting/logging steps
+                if step % rs == (rs-1):  # print every 2000 mini-batches
+                    if verbose: print('[%d, %5d] loss: %.3f' %
+                          (epoch + 1, step + 1, running_loss / rs))
+                    self.hist['step'].append(self.steps)
+                    self.hist['loss'].append(running_loss/rs)
+                    self.hist['trianing_running_loss'].append(running_loss/rs)
+                    [self.hist[k].append(running_loss_dict[k]/rs) for k in running_loss_dict]
+                    if wandb:
+                        wandb.log({'trianing_running_loss': running_loss / rs})
+                    running_loss = 0.0
+                    running_loss_dict = defaultdict(int)
+        if verbose: print('Finished Training')
+        return optimizer
+    def evaluate(self, Xs, targets, ys, split='test', is_smiles=False, bs = 32, shuffle=False, wandb=None, only_loss=False):
+        self.eval()
+        y_preds = np.zeros( (ys.shape[0], self.config.num_templates), dtype=np.float16)
+        loss_metrics = defaultdict(int)
+        new_hist = defaultdict(float)
+        with torch.no_grad():
+            dataset = ChemRXNDataset(Xs, targets, ys, is_smiles=is_smiles,
+                                     fp_size=self.config.fp_size, fingerprint_type=self.config.fingerprint_type)
+            dataloader = torch.utils.data.DataLoader(dataset, batch_size=bs, shuffle=shuffle, sampler=None,
+                       batch_sampler=None, num_workers=0, collate_fn=None,
+                       pin_memory=False, drop_last=False, timeout=0,
+                       worker_init_fn=None)
+            #for step, s in eoutputs = self.forward(batch[0], batchnumerate(range(0, len(Xs), bs)):
+            for step, batch in enumerate(dataloader):#
+                batch = [b.to(self.config.device, non_blocking=True) for b in batch]
+                ys_batch = batch[2]
+                if hasattr(self, 'templates_np'):
+                    outputs = []
+                    for ii in range(10):
+                        tlen = len(self.templates_np)
+                        i_tlen = tlen//10
+                        templates = torch.from_numpy(self.templates_np[(i_tlen*ii):min(i_tlen*(ii+1), tlen)]).float().to(self.config.device)
+                        outputs.append( self.forward(batch[0], templates = templates ) )
+                    outputs = torch.cat(outputs, dim=0)
+                else:
+                    outputs = self.forward(batch[0])
+                loss = self.compute_losses(outputs, ys_batch, None)
+                # not quite right because in every batch there might be different number of valid samples
+                weight = 1/len(batch[0])#len(Xs[s:min(s + bs, len(Xs))]) / len(Xs)
+                loss_metrics['loss'] += (loss.item())
+                if len(ys.shape)>1:
+                    outputs = self.softmax(outputs) if not (ys.shape[1]==self.config.num_templates) else torch.sigmoid(outputs)
+                else:
+                    outputs = self.softmax(outputs)
+                outputs_np = [None if o is None else o.to('cpu').numpy().astype(np.float16) for o in outputs]
+                if not only_loss:
+                    ks = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+                    topkacc, mrocc = top_k_accuracy(ys_batch, outputs, k=ks, ret_arocc=True, ret_mrocc=False)
+                    # mrocc -- median rank of correct choice
+                    for k, tkacc in zip(ks, topkacc):
+                        #iterative average update
+                        new_hist[f't{k}_acc_{split}'] += (tkacc-new_hist[f't{k}_acc_{split}']) / (step+1)
+                        # todo weight by batch-size
+                    new_hist[f'meanrank_{split}'] = mrocc
+                y_preds[step*bs : min((step+1)*bs,len(y_preds))] = outputs_np
+        new_hist[f'steps_{split}'] = (self.steps)
+        new_hist[f'loss_{split}'] = (loss_metrics['loss'] / (step+1))
+        for k in new_hist:
+            self.hist[k].append(new_hist[k])
+        if wandb:
+            wandb.log(new_hist)
+        self.hist[f'loss_{split}'].append(loss_metrics[f'loss'] / (step+1))
+        return y_preds
+    def save_hist(self, prefix='', postfix=''):
+        HIST_PATH = 'data/hist/'
+        if not os.path.exists(HIST_PATH):
+            os.mkdir(HIST_PATH)
+        fn_hist = HIST_PATH+prefix+postfix+'.csv'
+        with open(fn_hist, 'w') as fh:
+            print(dict(self.hist), file=fh)
+        return fn_hist
+    def save_model(self, prefix='', postfix='', name_as_conf=False):
+        MODEL_PATH = 'data/model/'
+        if not os.path.exists(MODEL_PATH):
+            os.mkdir(MODEL_PATH)
+        if name_as_conf:
+            confi_str = str(self.config.__dict__.values()).replace("'","").replace(': ','_').replace(', ',';')
+        else:
+            confi_str = ''
+        model_name = prefix+confi_str+postfix+'.pt'
+        torch.save(self.state_dict(), MODEL_PATH+model_name)
+        return MODEL_PATH+model_name
+    def plot_loss(self):
+        plot_loss(self.hist)
+    def plot_topk(self, sets=['train', 'valid', 'test'], with_last = 2):
+        plot_topk(self.hist, sets=sets, with_last = with_last)
+    def plot_nte(self, last_cpt=1, dataset='Sm', include_bar=True):
+        plot_nte(self.hist, dataset=dataset, last_cpt=last_cpt, include_bar=include_bar)
+class SeglerBaseline(MHN):
+    """FFNN - only the Molecule Encoder + an output projection"""
+    def __init__(self, config=None):
+        config.template_fp_type = 'none'
+        config.temp_encoder_layers = 0
+        super().__init__(config, use_template_encoder=False)
+        self.W_out = torch.nn.Linear(config.hopf_asso_dim, config.num_templates)
+        self.optimizer = getattr(torch.optim, self.config.optimizer)(self.parameters(), lr=self.lr)
+        self.steps = 0
+        self.hist = defaultdict(list)
+        self.to(self.config.device)
+    def forward(self, m, templates=None):
+        """
+        m: molecule in the form batch x fingerprint
+        templates: won't be used in this case
+        returns logits ranking the templates for each molecule
+        """
+        bs = m.shape[0] #batch_size
+        Xi = self.mol_encoder(m)
+        Xi = self.mol_encoder.af(Xi) # is not applied in encoder for last layer
+        out = self.W_out(Xi) # [bs, T] # softmax over dim=1
+        return out
+class StaticQK(MHN):
+    """ Static QK baseline - beware to have the same fingerprint for mol_encoder as for the template_encoder (fp2048 r4 rdk by default)"""
+    def __init__(self, config=None):
+        if config:
+            self.config = config
+        else:
+            self.config = ModelConfig()
+        super().__init__(config)
+        self.fp_size = 2048
+        self.fingerprint_type = 'rdk'
+        self.beta = 1
+    def update_template_embedding(self, which='rdk', fp_size=2048, radius=4, learnable=False):
+        bs = self.config.batch_size
+        split_template_list = [t.split('>>')[0].split('.') for t in self.template_list]
+        self.templates = torch.from_numpy(convert_smiles_to_fp(split_template_list,
+                                                               is_smarts=True, fp_size=fp_size,
+                                                               radius=radius, which=which).max(1)).float().to(self.config.device)
+    def forward(self, m, templates=None):
+        """
+        """
+        #states_emb = self.fcfe(state_fp)
+        bs = m.shape[0] #batch_size
+        Xi = m #[bs, emb]
+        X = self.templates #[T, emb])
+        XXi = [email protected] # [bs, T]
+        # normalize
+        t_sum = templates.sum(1) #[T]
+        t_sum = t_sum.view(1,-1).expand(bs, -1) #[bs, T]
+        XXi = XXi / t_sum
+        # not neccecaire because it is not trained
+        out = self.beta*XXi # [bs, T] # softmax over dim=1
+        return out
+class Retrosim(StaticQK):
+    """ Retrosim-like baseline only for template relevance prediction """
+    def fit_with_train(self, X_fp_train, y_train):
+        self.templates = torch.from_numpy(X_fp_train).float().to(self.config.device)
+        # train_samples, num_templates
+        self.sample2acttemplate = torch.nn.functional.one_hot(torch.from_numpy(y_train), self.config.num_templates).float()
+        tmpnorm = self.sample2acttemplate.sum(0)
+        tmpnorm[tmpnorm==0] = 1
+        self.sample2acttemplate = (self.sample2acttemplate / tmpnorm).to(self.config.device) # results in an average after dot product
+    def forward(self, m, templates=None):
+        """
+        """
+        out = super().forward(m, templates=templates)
+        # bs, train_samples
+        # map out to actual templates
+        out = out @ self.sample2acttemplate
+        return out

mhnreact/molutils.py ADDED Viewed

	@@ -0,0 +1,772 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl, Philipp Renz
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+Molutils contains functions that aid in handling molecules or templates
+"""
+import logging
+import re
+import warnings
+from itertools import product, permutations
+from multiprocessing import Pool
+from tqdm.contrib.concurrent import process_map
+from tqdm.notebook import tqdm
+import swifter
+import rdkit.RDLogger as rkl
+from rdkit import Chem
+from rdkit.Chem import AllChem
+from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
+from rdkit.Chem.rdmolops import FastFindRings
+from rdkit.Chem.rdMHFPFingerprint import MHFPEncoder
+from scipy import sparse
+from sklearn.feature_extraction import DictVectorizer
+import warnings
+import rdkit.RDLogger as rkl
+import numpy as np
+# module-level logger of this file
+log = logging.getLogger(__name__)
+# handle to RDKit's logger; used for the fingerprint warnings and in disable_rdkit_logging
+logger = rkl.logger()
+def remove_attom_mapping(smiles):
+    """ removes a number after a ':' """
+    return re.sub(r':\d+', '', str(smiles))
+def canonicalize_smi(smi, is_smarts=False, remove_atom_mapping=True):
+    r"""
+    Canonicalize SMARTS from https://github.com/rxn4chemistry/rxnfp/blob/master/rxnfp/tokenization.py#L249
+    """
+    mol = Chem.MolFromSmarts(smi)
+    if not mol:
+        raise ValueError("Molecule not canonicalizable")
+    if remove_atom_mapping:
+        for atom in mol.GetAtoms():
+            if atom.HasProp("molAtomMapNumber"):
+                atom.ClearProp("molAtomMapNumber")
+    return Chem.MolToSmiles(mol)
+def canonicalize_template(smarts):
+    smarts = str(smarts)
+    # remove attom-mapping
+    #smarts = remove_attom_mapping(smarts)
+    # order the list of smiles + canonicalize it
+    results = []
+    for part in smarts.split('>>'):
+        a = part.split('.')
+        a = [canonicalize_smi(x, is_smarts=True, remove_atom_mapping=True) for x in a]
+        #a = [remove_attom_mapping(x) for x in a]
+        a.sort()
+        results.append( '.'.join(a) )
+    return '>>'.join(results)
+def ebv2np(ebv):
+    """Explicit bit vector returned by rdkit to numpy array. """
+    return np.frombuffer(bytes(ebv.ToBitString(), 'utf-8'), 'u1') - ord('0')
+def smiles2morgan(smiles, radius=2):
+    """ computes ecfp from smiles """
+    return GetMorganFingerprint(smiles, radius)
+def getFingerprint(smiles, fp_size=4096, radius=2, is_smarts=False, which='morgan', sanitize=True):
+    """maccs+morganc+topologicaltorsion+erg+atompair+pattern+rdkc"""
+    if isinstance(smiles, list):
+        return np.array([getFingerprint(smi, fp_size, radius, is_smarts, which) for smi in smiles]).max(0) # max pooling if it's list of lists
+    if is_smarts:
+        mol = Chem.MolFromSmarts(str(smiles), mergeHs=False)
+        #mol.UpdatePropertyCache() #Correcting valence info
+        #FastFindRings(mol) #Providing ring info
+    else:
+        mol = Chem.MolFromSmiles(str(smiles), sanitize=False)
+    if mol is None:
+        msg = f"{smiles} couldn't be converted to a fingerprint using 0's instead"
+        logger.warning(msg)
+        #warnings.warn(msg)
+        return np.zeros(fp_size).astype(np.bool)
+    if sanitize:
+        faild_op = Chem.SanitizeMol(mol, catchErrors=True)
+        FastFindRings(mol) #Providing ring info
+    mol.UpdatePropertyCache(strict=False) #Correcting valence info # important operation
+    def mol2np(mol, which, fp_size):
+        is_dict = False
+        if which=='morgan':
+            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_size, useFeatures=False, useChirality=True)
+        elif which=='rdk':
+            fp = Chem.RDKFingerprint(mol, fpSize=fp_size, maxPath=6)
+        elif which=='rdkc':
+            # https://greglandrum.github.io/rdkit-blog/similarity/reference/2021/05/26/similarity-threshold-observations1.html
+            # -- maxPath 6 found to be better for retrieval in databases
+            fp = AllChem.UnfoldedRDKFingerprintCountBased(mol, maxPath=6).GetNonzeroElements()
+            is_dict = True
+        elif which=='morganc':
+            fp = AllChem.GetMorganFingerprint(mol, radius, useChirality=True, useBondTypes=True, useFeatures=True,  useCounts=True).GetNonzeroElements()
+            is_dict = True
+        elif which=='topologicaltorsion':
+            fp = AllChem.GetTopologicalTorsionFingerprint(mol).GetNonzeroElements()
+            is_dict = True
+        elif which=='maccs':
+            fp = AllChem.GetMACCSKeysFingerprint(mol)
+        elif which=='erg':
+            v = AllChem.GetErGFingerprint(mol)
+            fp = {idx:v[idx] for idx in np.nonzero(v)[0]}
+            is_dict = True
+        elif which=='atompair':
+            fp = AllChem.GetAtomPairFingerprint(mol).GetNonzeroElements()
+            is_dict = True
+        elif which=='pattern':
+            fp = Chem.PatternFingerprint(mol, fpSize=fp_size)
+        elif which=='ecfp4':
+            # roughly equivalent to ECFP4
+            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=fp_size, useFeatures=False, useChirality=True)
+        elif which=='layered':
+            fp = AllChem.LayeredFingerprint(mol, fpSize=fp_size, maxPath=7)
+        elif which=='mhfp':
+            #TODO check if one can avoid instantiating the MHFP encoder
+            fp = MHFPEncoder().EncodeMol(mol, radius=radius, rings=True, isomeric=False, kekulize=False, min_radius=1)
+            fp = {f:1 for f in fp}
+            is_dict = True
+        elif not (type(which)==str):
+            fp = which(mol)
+        if is_dict:
+            nd = np.zeros(fp_size)
+            for k in fp:
+                nk = k%fp_size #remainder
+                #print(nk, k, fp_size)
+                #3160 36322170 3730
+                #print(nd[nk], fp[k])
+                if nd[nk]!=0:
+                    #print('c',end='')
+                    nd[nk] = nd[nk]+fp[k] #pooling colisions
+                nd[nk] = fp[k]
+            return nd #np.log(1+nd) # discussion with segler
+        return ebv2np(fp)
+    """ + for folding * for concat """
+    cc_symb = '*'
+    if ('+' in which) or (cc_symb in which):
+        concat = False
+        split_sym = '+'
+        if cc_symb in which:
+            concat=True
+            split_sym = '*'
+        np_fp = np.zeros(fp_size)
+        remaining_fps = (which.count(split_sym)+1)
+        fp_length_remain = fp_size
+        for fp_type in which.split(split_sym):
+            if concat:
+                fpp = mol2np(mol, fp_type, fp_length_remain//remaining_fps)
+                np_fp[(fp_size-fp_length_remain):(fp_size-fp_length_remain+len(fpp))] += fpp
+                fp_length_remain -= len(fpp)
+                remaining_fps -=1
+            else:
+                try:
+                  fpp = mol2np(mol, fp_type, fp_size)
+                  np_fp[:len(fpp)] += fpp
+                except:
+                  pass
+                  #print(fp_type,end='')
+        return np.log(1 + np_fp)
+    else:
+        return mol2np(mol, which, fp_size)
+def _getFingerprint(inp):
+  return getFingerprint(inp[0], inp[1], inp[2], inp[3], inp[4])
+def disable_rdkit_logging():
+    """
+    Disables RDKit whiny logging.
+    """
+    import rdkit.rdBase as rkrb
+    import rdkit.RDLogger as rkl
+    logger.setLevel(rkl.ERROR)
+    rkrb.DisableLog('rdApp.error')
+def convert_smiles_to_fp(list_of_smiles, fp_size=2048, is_smarts=False, which='morgan', radius=2, njobs=1, verbose=False):
+    """
+    list of smiles can be list of lists, than the resulting array will pe badded to the max list len
+    which: morgan, rdk, ecfp4, or object
+    NOTE: morgan or ecfp4 throws error for is_smarts
+    """
+    inp = [(smi, fp_size, radius, is_smarts, which) for smi in list_of_smiles]
+    #print(inp)
+    if verbose: print(f'starting pool with {njobs} workers')
+    if njobs>1:
+    #with Pool(njobs) as pool:
+    #    fps = pool.map(_getFingerprint, inp)
+        fps = process_map(_getFingerprint, inp, max_workers=njobs, chunksize=1, mininterval=0)
+    else:
+        fps = [getFingerprint(smi, fp_size=fp_size, radius=radius, is_smarts=is_smarts, which=which) for smi in list_of_smiles]
+    return np.array(fps)
+def convert_smartes_to_fp(list_of_smarts, fp_size=2048):
+    if isinstance(list_of_smarts, np.ndarray):
+        list_of_smarts = list_of_smarts.tolist()
+    if isinstance(list_of_smarts, list):
+        if isinstance(list_of_smarts[0], list):
+            pad = len(max(list_of_smarts, key=len))
+            fps = [[getTemplateFingerprint(smarts, fp_size=fp_size) for smarts in sample]
+                   + [np.zeros(fp_size, dtype=np.bool)] * (pad - len(sample))  # zero padding
+                   for sample in list_of_smarts]
+        else:
+            fps = [[getTemplateFingerprint(smarts, fp_size=fp_size) for smarts in list_of_smarts]]
+    return np.asarray(fps)
+def get_reactants_from_smarts(smarts):
+    """
+        from a (forward-)reaction given as a smart, only returns the reactants (not e.g. solvents or reagents)
+        returns list of smiles or empty list
+    """
+    from rdkit.Chem import RDConfig
+    import sys
+    sys.path.append(RDConfig.RDContribDir)
+    from RxnRoleAssignment import identifyReactants
+    try:
+        rdk_reaction = AllChem.ReactionFromSmarts(smarts)
+        rx_idx = identifyReactants.identifyReactants(rdk_reaction)[0][0]
+    except ValueError:
+        return []
+    # TODO what if a product is recognized as a reactanat.. is that possible??
+    return [Chem.MolToSmiles(rdk_reaction.GetReactants()[i]) for i in rx_idx]
+def smarts2rdkfp(smart, fp_size=2048):
+    mol = Chem.MolFromSmarts(str(smart))
+    if mol is None: return np.zeros(fp_size).astype(np.bool)
+    return AllChem.RDKFingerprint(mol)
+    # fp = np.asarray(fp).astype(np.bool) # takes ages =/
+def smiles2rdkfp(smiles, fp_size=2048):
+    mol = Chem.MolFromSmiles(str(smiles))
+    if mol is None: return np.zeros(fp_size).astype(np.bool)
+    return AllChem.RDKFingerprint(mol)
+def mol2morganfp(mol, radius=2, fp_size=2048):
+    try:
+        Chem.SanitizeMol(mol)  # due to error --> see https://sourceforge.net/p/rdkit/mailman/message/34828604/
+    except:
+        pass
+        # print(mol)
+        # return np.zeros(fp_size).astype(np.bool)
+        # TODO
+    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=fp_size)
+def smarts2morganfp(smart, fp_size=2048, radius=2):
+    mol = Chem.MolFromSmarts(str(smart))
+    if mol is None: return np.zeros(fp_size).astype(np.bool)
+    return mol2morganfp(mol)
+def smiles2morganfp(smiles, fp_size=2048, radius=2):
+    mol = Chem.MolFromSmiles(str(smiles))
+    if mol is None: return np.zeros(fp_size).astype(np.bool)
+    return mol2morganfp(mol)
+def smarts2fp(smart, which='morgan', fp_size=2048, radius=2):
+    if which == 'rdk':
+        return smarts2rdkfp(smart, fp_size=fp_size)
+    else:
+        return smarts2morganfp(smart, fp_size=fp_size, radius=radius)
+def smiles2fp(smiles, which='morgan', fp_size=2048, radius=2):
+    if which == 'rdk':
+        return smiles2rdkfp(smiles, fp_size=fp_size)
+    else:
+        return smiles2morganfp(smiles, fp_size=fp_size, radius=radius)
+class FP_featurizer():
+    "FP_featurizer: Fingerprint featurizer"
+    def __init__(self,
+                 fp_types = ['MACCS','Morgan2CBF', 'Morgan6CBF', 'ErG','AtomPair','TopologicalTorsion','RDK','ECFP6'],
+                 max_features = 4096, counts=True, log_scale=True, folding=None, collision_pooling='max'):
+        self.v = DictVectorizer(sparse=True, dtype=np.uint16)
+        self.max_features = max_features
+        self.idx_col = None
+        self.counts = counts
+        self.fp_types = [fp_types] if isinstance(fp_types, str) else fp_types
+        self.log_scale = log_scale # from discussion with segler
+        self.folding = None
+        self.colision_pooling = collision_pooling
+    def compute_fp_list(self, smiles_list, is_smarts=False):
+        fp_list = []
+        for smiles in smiles_list:
+            try:
+                if isinstance(smiles, list):
+                    smiles = smiles[0]
+                if is_smarts:
+                    mol = Chem.MolFromSmarts(smiles)
+                else:
+                    mol = Chem.MolFromSmiles(smiles) #TODO small hack only applicable here!!!
+                fp_dict = {}
+                for fp_type in self.fp_types:
+                    fp_dict.update( fingerprintTypes[fp_type](mol) ) #returns a dict
+                fp_list.append(fp_dict)
+            except:
+                fp_list.append({})
+        return fp_list
+    def fit(self, x_train, is_smarts=False):
+        fp_list = self.compute_fp_list(x_train, is_smarts=is_smarts)
+        Xraw = self.v.fit_transform(fp_list)
+        # compute variance of a csr_matrix E[x**2] - E[x]**2
+        axis = 0
+        Xraw_sqrd = Xraw.copy()
+        Xraw_sqrd.data **= 2
+        var_col = Xraw_sqrd.mean(axis) - np.square(Xraw.mean(axis))
+        #idx_col = (-np.array((Xraw>0).var(axis=0)).argpartition(self.max_features))
+        #idx_col = np.array((Xraw>0).sum(axis=0)>=self.min_fragm_occur).flatten()
+        self.idx_col = (-np.array(var_col)).flatten().argpartition(min(self.max_features, Xraw.shape[1]-1))[:min(self.max_features, Xraw.shape[1])]
+        print(f'from {var_col.shape[1]} to {len(self.idx_col)}')
+        return self.scale(Xraw[:,self.idx_col].toarray())
+    def transform(self, x_test, is_smarts=False):
+        fp_list = self.compute_fp_list(x_test, is_smarts=is_smarts)
+        X_raw = self.v.transform(fp_list)
+        return self.scale(X_raw[:,self.idx_col].toarray())
+    def scale(self, X):
+        if self.log_scale:
+            return np.log(1 + X)
+        return X
+    def save(self, path='data/fpfeat.pkl'):
+        import pickle
+        with open(path, 'wb') as output:
+            pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
+    def load(self, path='data/fpfeat.pkl'):
+        import pickle
+        with open(path, 'rb') as input:
+            self = pickle.load(input)
+def getTemplateFingerprintOnBits(smarts, fp_size=2048):
+    rxn = AllChem.ReactionFromSmarts(str(smarts))
+    #construct a structural fingerprint for a ChemicalReaction by concatenating the reactant fingerprint and the product fingerprint
+    return (AllChem.CreateStructuralFingerprintForReaction(rxn)).GetOnBits()
+def calc_template_fingerprint_group_mapping(template_list, fp_size, save_path=''):
+    """
+    calculate the mapping from old idx to new idx for the templates
+    returns a set with a numpy array with the mapping and the indices to take
+    """
+    templ_df = pd.DataFrame()
+    templ_df['smarts'] = template_list
+    templ_df['templ_emb'] = templ_df['smarts'].swifter.apply(lambda smarts: str(list(getTemplateFingerprintOnBits(smarts, fp_size))))
+    templ_df['idx_orig'] = [ii for ii in range(len(templ_df))]
+    grouped_templ = templ_df.groupby('templ_emb').apply(lambda x: x.index.tolist())
+    grouped_templ = templ_df.groupby('templ_emb')
+    grouped_templ = grouped_templ.min().sort_values('idx_orig')
+    grouped_templ['new_idx'] = range(len(grouped_templ))
+    new_templ_df = templ_df.join(grouped_templ, on='templ_emb',how='right', lsuffix='_l', rsuffix='_r').sort_values('idx_orig_l')
+    map_orig2new = new_templ_df['new_idx'].values
+    take_those_indices_from_orig = grouped_templ.idx_orig.values
+    if save_path!='':
+        suffix_maporig2new = '_maporig2new.npy'
+        suffix_takethose = '_tfp_take_idxs.npy'
+        np.save(f'{save_path}{suffix_maporig2new}', map_orig2new,allow_pickle=False)
+        np.save(f'{save_path}{suffix_takethose}', take_those_indices_from_orig,allow_pickle=False)
+    return (map_orig2new, take_those_indices_from_orig)
+class ECFC_featurizer():
+    def __init__(self, radius=6, min_fragm_occur=50, useChirality=True, useFeatures=False):
+        self.v = DictVectorizer(sparse=True, dtype=np.uint16)
+        self.min_fragm_occur=min_fragm_occur
+        self.idx_col = None
+        self.radius=radius
+        self.useChirality = useChirality
+        self.useFeatures = useFeatures
+    def compute_fp_list(self, smiles_list):
+        fp_list = []
+        for smiles in smiles_list:
+            try:
+                if isinstance(smiles, list):
+                    smiles = smiles[0]
+                mol = Chem.MolFromSmiles(smiles) #TODO small hack only applicable here!!!
+                fp_list.append( AllChem.GetMorganFingerprint(mol, self.radius, useChirality=self.useChirality,
+                                                             useFeatures=self.useFeatures).GetNonzeroElements() ) #returns a dict
+            except:
+                fp_list.append({})
+        return fp_list
+    def fit(self, x_train):
+        fp_list = self.compute_fp_list(x_train)
+        Xraw = self.v.fit_transform(fp_list)
+        idx_col = np.array((Xraw>0).sum(axis=0)>=self.min_fragm_occur).flatten()
+        self.idx_col = idx_col
+        return Xraw[:,self.idx_col].toarray()
+    def transform(self, x_test):
+        fp_list = self.compute_fp_list(x_test)
+        X_raw = self.v.transform(fp_list)
+        return X_raw[:,self.idx_col].toarray()
+def ecfp2dict(mol, radius=3):
+    #SECFP (SMILES Extended Connectifity Fingerprint)
+    # from mhfp.encoder import MHFPEncoder
+    from mhfp.encoder import MHFPEncoder
+    v = MHFPEncoder.secfp_from_mol(mol, length=4068, radius=radius, rings=True, kekulize=True, min_radius=1)
+    return {f'ECFP{radius*2}_'+str(idx):1 for idx in np.nonzero(v)[0]}
+def erg2dict(mol):
+    v = AllChem.GetErGFingerprint(mol)
+    return {'erg'+str(idx):v[idx] for idx in np.nonzero(v)[0]}
+def morgan2dict(mol, radius=2, useChirality=True, useBondTypes=True, useFeatures=True, useConts=True):
+    mdic = AllChem.GetMorganFingerprint(mol, radius=radius, useChirality=useChirality, useBondTypes=True,
+                                    useFeatures=True, useCounts=True).GetNonzeroElements()
+    return {f'm{radius}{useChirality}{useBondTypes}{useFeatures}'+str(kk):mdic[kk]for kk in mdic}
+def atompair2dict(mol):
+    mdic = AllChem.GetAtomPairFingerprint(mol).GetNonzeroElements()
+    return {f'ap'+str(kk):mdic[kk]for kk in mdic}
+def tt2dict(mol):
+    mdic = AllChem.GetTopologicalTorsionFingerprint(mol).GetNonzeroElements()
+    return {f'tt'+str(kk):mdic[kk]for kk in mdic}
+def rdk2dict(mol):
+    mdic = AllChem.UnfoldedRDKFingerprintCountBased(mol).GetNonzeroElements()
+    return {f'rdk'+str(kk):mdic[kk]for kk in mdic}
+def pattern2dict(mol):
+    mdic = AllChem.PatternFingerprint(mol, fpSize=16384).GetOnBits()
+    return {'pt'+str(kk):1 for kk in mdic}
+fingerprintTypes = {
+    'MACCS' : lambda k: {'MCCS'+str(ob):1 for ob in AllChem.GetMACCSKeysFingerprint(k).GetOnBits()},
+    'Morgan2CBF' : lambda mol: morgan2dict(mol, 2, True, True, True, True),
+    'Morgan4CBF' : lambda mol: morgan2dict(mol, 4, True, True, True, True),
+    'Morgan6CBF' : lambda mol: morgan2dict(mol, 6, True, True, True, True),
+    'ErG' :  erg2dict,
+    'AtomPair' : atompair2dict,
+    'TopologicalTorsion' : tt2dict,
+    #'RDK' : lambda k: {'MCCS'+str(ob):1 for ob in AllChem.RDKFingerprint(k).GetOnBits()},
+    'RDK' : rdk2dict,
+    'ECFP6' : lambda mol: ecfp2dict(mol, radius=3),
+    'Pattern': pattern2dict,
+}
+def smarts2appl(product_smarts, template_product_smarts, fpsize=2048, v=False, use_tqdm=False, njobs=1, nsplits=1):
+    """This takes in a list of product smiles (misnamed in code) and a list of product sides
+    of templates and calculates which templates are applicable to which product.
+    This is basically a substructure search. Maybe there are faster versions but I wrote this one.
+    Args:
+        product_smarts: List of smiles of molecules to check.
+        template_product_smarts: List of substructures to check
+        fpsize: fingerprint size to use in screening
+        v: if v then information will be printed
+        use_tdqm: if True then a progressbar will be displayed but slows down the computation.
+        njobs: how many parallel jobs to run in parallel.
+        nsplits: how many splits should be made along the product_smarts list. Useful to avoid memory
+            explosion.
+    Returns: list of tuples (i,j) that indicates the product i has substructure j.
+    """
+    if v: print("Calculating template molecules")
+    template_mols = [Chem.MolFromSmarts(s) for s in template_product_smarts]
+    if v: print("Calculating template fingerprints")
+    template_ebvs = [Chem.PatternFingerprint(m, fpSize=fpsize) for m in template_mols]
+    if v: print(f'Building template ints: [{len(template_mols)}, {fpsize}]')
+    template_ints = [int(e.ToBitString(), base=2) for e in template_ebvs]
+    del template_ebvs
+    if njobs == 1 and nsplits == 1:
+        return _smarts2appl(product_smarts, template_product_smarts, template_ints, fpsize, v, use_tqdm)
+    elif nsplits == 1:
+        nsplits = njobs
+    # split products into batches
+    product_splits = np.array_split(np.array(product_smarts), nsplits)
+    ioffsets = [0] + list(np.cumsum([p.shape[0] for p in product_splits[:-1]]))
+    inps = [(ps, template_product_smarts, template_ints, fpsize, v, use_tqdm, ioff, 0) for ps, ioff in zip(product_splits, ioffsets)]
+    if v: print("Creating workers")
+    #results = process_map(__smarts2appl, inps, max_workers=njobs, chunksize=1)
+    with Pool(njobs) as pool:
+        results = pool.starmap(_smarts2appl, inps)
+    imatch = np.concatenate([r[0] for r in results])
+    jmatch = np.concatenate([r[1] for r in results])
+    return imatch, jmatch
+def __smarts2appl(inp):
+    return _smarts2appl(*inp)
+def _smarts2appl(product_smarts, template_product_smarts, template_ints, fpsize=2048, v=False, use_tqdm=True, ioffset=0, joffset=0):
+    """See smarts2appl for a description"""
+    if v: print("Calculating product molecules")
+    product_mols = [Chem.MolFromSmiles(s) for s in product_smarts]
+    if v: print("Calculating product fingerprints")
+    product_ebvs = [Chem.PatternFingerprint(m, fpSize=fpsize) for m in product_mols]
+    if v: print(f'Building product ints: [{len(product_mols)}, {fpsize}]')
+    # This loads each fingerprint into a python integer on which we can use bitwise operations.
+    product_ints = [int(e.ToBitString(), base=2) for e in product_ebvs]
+    del product_ebvs
+    # product_mols = {i: m for i,m in enumerate(product_mols)}
+    if v: print('Checking symbolically')
+    # buffer for template molecules. This are handed over as smarts as they are slow to pickle
+    template_mols = {}
+    # create iterator and add progressbar if use_tqdm is True
+    iterator = product(enumerate(product_ints), enumerate(template_ints))
+    if use_tqdm:
+        nelem = len(product_ints) * len(template_ints)
+        iterator = tqdm(iterator, total=nelem, miniters=1_000_000)
+    imatch = []
+    jmatch = []
+    for (i, p_int), (j, t_int) in iterator:
+        if (p_int & t_int) == t_int:        # fingerprint based screen
+            p = product_mols[i]
+            t = template_mols.get(j, False)
+            if not t:
+                t = Chem.MolFromSmarts(template_product_smarts[j])
+                template_mols[j] = t
+            if p.HasSubstructMatch(t):
+                imatch.append(i)
+                jmatch.append(j)
+    if v: print("Finished loop")
+    return np.array(imatch)+ioffset, np.array(jmatch)+joffset
+def extract_from_reaction(reaction, radius=1, verbose=False):
+    """Extract a retro-synthesis template from an atom-mapped reaction (adapted from rdchiral package).
+    Args:
+        reaction: mapping with the keys 'reactants' and 'products' ('.'-separated SMILES)
+            and '_id' (echoed back as 'reaction_id').
+        radius: passed to get_fragments_for_changed_atoms for the reactants; the products use radius-1.
+        verbose: if True, some diagnostic messages are printed.
+    Returns:
+        - a template dict ('products', 'reactants', 'reaction_smarts', 'intra_only', 'dimer_only',
+          'reaction_id', 'necessary_reagent', 'num_errors', 'num_warnings') on success,
+        - ``{'reaction_id': ...}`` only, if molecules could not be parsed/sanitized, no atoms
+          changed, or fragment extraction raised a ValueError,
+        - ``None`` if a product has too many unmapped atoms or get_changed_atoms reports an error.
+    Relies on FastFindRings, MAXIMUM_NUMBER_UNMAPPED_PRODUCT_ATOMS and USE_STEREOCHEMISTRY,
+    which are not defined in this function (expected at module level).
+    """
+    from rdchiral.template_extractor import mols_from_smiles_list, replace_deuterated, get_fragments_for_changed_atoms, expand_changed_atom_tags, canonicalize_transform, get_changed_atoms
+    reactants = mols_from_smiles_list(replace_deuterated(reaction['reactants']).split('.'))
+    products = mols_from_smiles_list(replace_deuterated(reaction['products']).split('.'))
+    # if rdkit cant understand molecule, return
+    if None in reactants: return {'reaction_id': reaction['_id']}
+    if None in products: return {'reaction_id': reaction['_id']}
+    # try to sanitize molecules
+    try:
+        #for i in range(len(reactants)):
+        #    reactants[i] = AllChem.RemoveHs(reactants[i]) # *might* not be safe
+        #for i in range(len(products)):
+        #    products[i] = AllChem.RemoveHs(products[i]) # *might* not be safe
+        #[Chem.SanitizeMol(mol) for mol in reactants + products] # redundant w/ RemoveHs
+        for mol in reactants + products:
+            Chem.SanitizeMol(mol, catchErrors=True)
+            FastFindRings(mol) #Providing ring info
+            mol.UpdatePropertyCache(strict=False) #Correcting valence info # important operation
+        #changed
+        #[Chem.SanitizeMol(mol, catchErrors=True) for mol in reactants + products] # redundant w/ RemoveHs
+        #[mol.UpdatePropertyCache() for mol in reactants + products]
+    except Exception as e:
+        # can't sanitize -> skip
+        print(e)
+        print('Could not load SMILES or sanitize')
+        print('ID: {}'.format(reaction['_id']))
+        return {'reaction_id': reaction['_id']}
+    # Detect products that contain atoms without an atom-map number
+    are_unmapped_product_atoms = False
+    extra_reactant_fragment = ''
+    for product in products:
+        prod_atoms = product.GetAtoms()
+        if sum([a.HasProp('molAtomMapNumber') for a in prod_atoms]) < len(prod_atoms):
+            if verbose: print('Not all product atoms have atom mapping')
+            if verbose: print('ID: {}'.format(reaction['_id']))
+            are_unmapped_product_atoms = True
+    if are_unmapped_product_atoms: # add fragment to template
+        for product in products:
+            prod_atoms = product.GetAtoms()
+            # Get unmapped atoms
+            unmapped_ids = [
+                a.GetIdx() for a in prod_atoms if not a.HasProp('molAtomMapNumber')
+            ]
+            if len(unmapped_ids) > MAXIMUM_NUMBER_UNMAPPED_PRODUCT_ATOMS:
+                # Skip this example - too many unmapped product atoms!
+                # NOTE(review): returns None here, unlike the {'reaction_id': ...} returned by other failure paths
+                return
+            # Define new atom symbols for fragment with atom maps, generalizing fully
+            atom_symbols = ['[{}]'.format(a.GetSymbol()) for a in prod_atoms]
+            # And bond symbols...
+            bond_symbols = ['~' for b in product.GetBonds()]
+            if unmapped_ids:
+                extra_reactant_fragment += AllChem.MolFragmentToSmiles(
+                    product, unmapped_ids,
+                    allHsExplicit = False, isomericSmiles = USE_STEREOCHEMISTRY,
+                    atomSymbols = atom_symbols, bondSymbols = bond_symbols
+                ) + '.'
+        if extra_reactant_fragment:
+            # drop the trailing '.' appended after the last fragment
+            extra_reactant_fragment = extra_reactant_fragment[:-1]
+            if verbose: print('    extra reactant fragment: {}'.format(extra_reactant_fragment))
+        # Consolidate repeated fragments (stoichometry)
+        extra_reactant_fragment = '.'.join(sorted(list(set(extra_reactant_fragment.split('.')))))
+    # NOTE(review): appears redundant with the two None checks at the top of the function
+    if None in reactants + products:
+        print('Could not parse all molecules in reaction, skipping')
+        print('ID: {}'.format(reaction['_id']))
+        return {'reaction_id': reaction['_id']}
+    # Calculate changed atoms
+    changed_atoms, changed_atom_tags, err = get_changed_atoms(reactants, products)
+    if err:
+        if verbose:
+            print('Could not get changed atoms')
+            print('ID: {}'.format(reaction['_id']))
+        # NOTE(review): second path that returns None instead of a dict
+        return
+    if not changed_atom_tags:
+        if verbose:
+            print('No atoms changed?')
+            print('ID: {}'.format(reaction['_id']))
+        # print('Reaction SMILES: {}'.format(example_doc['RXN_SMILES']))
+        return {'reaction_id': reaction['_id']}
+    try:
+        # Get fragments for reactants
+        reactant_fragments, intra_only, dimer_only = get_fragments_for_changed_atoms(reactants, changed_atom_tags,
+            radius = radius, expansion = [], category = 'reactants')
+        # Get fragments for products
+        # (WITHOUT matching groups but WITH the addition of reactant fragments)
+        product_fragments, _, _  = get_fragments_for_changed_atoms(products, changed_atom_tags,
+            radius = radius-1, expansion = expand_changed_atom_tags(changed_atom_tags, reactant_fragments),
+            category = 'products')
+    except ValueError as e:
+        if verbose:
+            print(e)
+            print(reaction['_id'])
+        return {'reaction_id': reaction['_id']}
+    # Put together and canonicalize (as best as possible)
+    rxn_string = '{}>>{}'.format(reactant_fragments, product_fragments)
+    rxn_canonical = canonicalize_transform(rxn_string)
+    # Change from inter-molecular to intra-molecular
+    # (strips the first and last character of each side and replaces ').(' by '.')
+    rxn_canonical_split = rxn_canonical.split('>>')
+    rxn_canonical = rxn_canonical_split[0][1:-1].replace(').(', '.') + \
+        '>>' + rxn_canonical_split[1][1:-1].replace(').(', '.')
+    reactants_string = rxn_canonical.split('>>')[0]
+    products_string  = rxn_canonical.split('>>')[1]
+    # the retro direction puts the product side first
+    retro_canonical = products_string + '>>' + reactants_string
+    # Load into RDKit
+    rxn = AllChem.ReactionFromSmarts(retro_canonical)
+    # edited
+    #if rxn.Validate()[1] != 0:
+    #    print('Could not validate reaction successfully')
+    #    print('ID: {}'.format(reaction['_id']))
+    #    print('retro_canonical: {}'.format(retro_canonical))
+    #    if VERBOSE: raw_input('Pausing...')
+    #    return {'reaction_id': reaction['_id']}
+    n_warning, n_errors = rxn.Validate()
+    if n_errors:
+      # resolves some errors
+      rxn = AllChem.ReactionFromSmarts(AllChem.ReactionToSmiles(rxn))
+      n_warning, n_errors = rxn.Validate()
+    # Validation problems do not abort the extraction; the counts are reported in the template
+    template = {
+        'products': products_string,
+        'reactants': reactants_string,
+        'reaction_smarts': retro_canonical,
+        'intra_only': intra_only,
+        'dimer_only': dimer_only,
+        'reaction_id': reaction['_id'],
+        'necessary_reagent': extra_reactant_fragment,
+        'num_errors': n_errors,
+        'num_warnings': n_warning,
+    }
+    return template
+def extract_template(rxn_smi, radius=1):
+    if isinstance(rxn_smi, str):
+        reaction = {
+            'reactants': rxn_smi.split('>')[0],
+            'products': rxn_smi.split('>')[-1],
+            'id': rxn_smi,
+            '_id': rxn_smi
+        }
+    else:
+        reaction = rxn_smi
+    try:
+        res = extract_from_reaction(reaction, radius=radius)
+        return res['reaction_smarts'] # returns a retro-template
+    except:
+        msg = f'failed to extract template from "{rxn_smi}"'
+        log.warning(msg)
+        return None
+def getTemplateFingerprint(smarts, fp_size=4096):
+    """ CreateStructuralFingerprintForReaction """
+    if isinstance(smarts, (list,)):
+        return np.vstack([getTemplateFingerprint(sm) for sm in smarts])
+    rxn = AllChem.ReactionFromSmarts(str(smarts))
+    if rxn is None:
+        msg = f"{smarts} couldn't be converted to a fingerprint using 0's instead"
+        log.warning(msg)
+        #warnings.warn(msg)
+        return np.zeros(fp_size).astype(np.bool)
+    return np.array(list(AllChem.CreateStructuralFingerprintForReaction(rxn, )), dtype=np.bool)

mhnreact/plotutils.py ADDED Viewed

	@@ -0,0 +1,158 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+Plot utils
+"""
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib import pyplot as plt
+plt.style.use('default')
+def normal_approx_interval(p_hat, n, z=1.96):
+    """ approximating the distribution of error about a binomially-distributed observation, {\hat {p)), with a normal distribution
+    z = 1.96 --> alpha =0.05
+    z = 1 --> std
+    https://www.wikiwand.com/en/Binomial_proportion_confidence_interval"""
+    return z*((p_hat*(1-p_hat))/n)**(1/2)
+our_colors = {
+    "lightblue": (  0/255, 132/255, 187/255),
+    "red":       (217/255,  92/255,  76/255),
+    "blue":      (  0/255, 132/255, 187/255),
+    "green":     ( 91/255, 167/255,  85/255),
+    "yellow":    (241/255, 188/255,  63/255),
+    "cyan":      ( 79/255, 176/255, 191/255),
+    "grey":      (125/255, 130/255, 140/255),
+    "lightgreen":(191/255, 206/255,  82/255),
+    "violett":   (174/255,  97/255, 157/255),
+}
+def plot_std(p_hats, n_samples,z=1.96, color=our_colors['red'], alpha=0.2, xs=None):
+    p_hats = np.array(p_hats)
+    stds = np.array([normal_approx_interval(p_hats[ii], n_samples[ii], z=z) for ii in range(len(p_hats))])
+    xs = range(len(p_hats)) if xs is None else xs
+    plt.fill_between(xs, p_hats-(stds), p_hats+stds, color=color, alpha=alpha)
+    #plt.errorbar(range(13), asdf, [normal_approx_interval(asdf[ii], n_samples[ii], z=z) for ii in range(len(asdf))],
+    #             c=our_colors['red'], linestyle='None', marker='.', ecolor=our_colors['red'])
+def plot_loss(hist):
+    plt.plot(hist['step'], hist['loss'] )
+    plt.plot(hist['steps_valid'], np.array(hist['loss_valid']))
+    plt.legend(['train','validation'])
+    plt.xlabel('update-step')
+    plt.ylabel('loss (categorical-crossentropy-loss)')
+def plot_topk(hist, sets=['train', 'valid', 'test'], with_last = 2):
+    ks = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+    baseline_val_res = {1:0.4061, 10:0.6827, 50: 0.7883, 100:0.8400}
+    plt.plot(list(baseline_val_res.keys()), list(baseline_val_res.values()), 'k.--')
+    for i in range(1,with_last):
+        for s in sets:
+            plt.plot(ks, [hist[f't{k}_acc_{s}'][-i] for k in ks],'.--', alpha=1/i)
+    plt.xlabel('top-k')
+    plt.ylabel('Accuracy')
+    plt.legend(sets)
+    plt.title('Hopfield-NN')
+    plt.ylim([-0.02,1])
+def plot_nte(hist, dataset='Sm', last_cpt=1, include_bar=True, model_legend='MHN (ours)',
+             draw_std=True, z=1.96, n_samples=None, group_by_template_fp=False, schwaller_hist=None, fortunato_hist=None): #1.96 for 95%CI
+    markers = ['.']*4#['1','2','3','4']#['8','P','p','*']
+    lw = 2
+    ms = 8
+    k = 100
+    ntes = range(13)
+    if dataset=='Sm':
+        basel_values = [0.        , 0.38424785, 0.66807858, 0.7916149 , 0.9051132 ,
+       0.92531258, 0.87295875, 0.94865587, 0.91830721, 0.95993717,
+       0.97215858, 0.9896713 , 0.99917817] #old basel_values = [0.0, 0.3882, 0.674, 0.7925, 0.9023, 0.9272, 0.874, 0.947, 0.9185, 0.959, 0.9717, 0.9927, 1.0]
+        pretr_values = [0.08439423, 0.70743412, 0.85555528, 0.95200267, 0.96513376,
+       0.96976397, 0.98373613, 0.99960286, 0.98683919, 0.96684724,
+       0.95907246, 0.9839079 , 0.98683919]# old [0.094, 0.711, 0.8584, 0.952, 0.9683, 0.9717, 0.988, 1.0, 1.0, 0.984, 0.9717, 1.0, 1.0]
+        staticQK = [0.2096, 0.1992, 0.2291, 0.1787, 0.2301, 0.1753, 0.2142, 0.2693, 0.2651, 0.1786, 0.2834, 0.5366, 0.6636]
+        if group_by_template_fp:
+            staticQK = [0.2651, 0.2617, 0.261 , 0.2181, 0.2622, 0.2393, 0.2157, 0.2184, 0.2   , 0.225 , 0.2039, 0.4568, 0.5293]
+    if dataset=='Lg':
+        pretr_values = [0.03410448, 0.65397054, 0.7254572 , 0.78969294, 0.81329924,
+       0.8651173 , 0.86775655, 0.8593128 , 0.88184124, 0.87764794,
+       0.89734215, 0.93328846, 0.99531597]
+        basel_values = [0.        , 0.62478044, 0.68784314, 0.75089511, 0.77044644,
+       0.81229423, 0.82968149, 0.82965544, 0.83778338, 0.83049176,
+       0.8662873 , 0.92308414, 1.00042408]
+        #staticQK = [0.03638, 0.0339 , 0.03732, 0.03506, 0.03717, 0.0331 , 0.03003, 0.03613, 0.0304 , 0.02109, 0.0297 , 0.02632, 0.02217] # on 90k templates
+        staticQK = [0.006416,0.00686, 0.00616, 0.00825, 0.005085,0.006718,0.01041, 0.0015335,0.006668,0.004673,0.001706,0.02551,0.04074]
+    if dataset=='Golden':
+        staticQK = [0]*13
+        pretr_values = [0]*13
+        basel_values = [0]*13
+    if schwaller_hist:
+        midx = np.argmin(schwaller_hist['loss_valid'])
+        basel_values = ([schwaller_hist[f't100_acc_nte_{k}'][midx] for k in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, '>10', '>49']])
+    if fortunato_hist:
+        midx = np.argmin(fortunato_hist['loss_valid'])
+        pretr_values = ([fortunato_hist[f't100_acc_nte_{k}'][midx] for k in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, '>10', '>49']])
+    #hand_val = [0.0 , 0.4, 0.68, 0.79, 0.89, 0.91, 0.86, 0.9,0.88, 0.9, 0.93]
+    if include_bar:
+        if dataset=='Sm':
+            if n_samples is None:
+                n_samples = [610, 1699, 287, 180, 143, 105, 70, 48, 124, 86, 68, 2539, 1648]
+                if group_by_template_fp:
+                    n_samples = [460, 993, 433, 243, 183, 117, 102, 87, 110, 80, 103, 3048, 2203]
+        if dataset=='Lg':
+            if n_samples is None:
+                n_samples = [18861, 32226, 4220, 2546, 1573, 1191, 865, 652, 1350, 642, 586, 11638, 4958] #new
+                if group_by_template_fp:
+                    n_samples = [13923, 17709, 7637, 4322, 2936, 2137, 1586, 1260, 1272, 1044, 829, 21695, 10559]
+                        #[5169, 15904, 2814, 1853, 1238, 966, 766, 609, 1316, 664, 640, 30699, 21471]
+                        #[13424,17246, 7681, 4332, 2844,2129,1698,1269, 1336,1067, 833, 22491, 11202] #grouped fp
+        plt.bar(range(11+2), np.array(n_samples)/sum(n_samples[:-1]), alpha=0.4, color=our_colors['grey'])
+    xti = [*[str(i) for i in range(11)], '>10', '>49']
+    asdf = []
+    for nte in xti:
+        try:
+            asdf.append( hist[f't{k}_acc_nte_{nte}'][-last_cpt])
+        except:
+            asdf.append(None)
+    plt.plot(range(13), asdf,f'{markers[3]}--', markersize=ms,c=our_colors['red'], linewidth=lw,alpha=1)
+    plt.plot(ntes, pretr_values,f'{markers[1]}--', c=our_colors['green'],
+             linewidth=lw, alpha=1,markersize=ms) #old [0.08, 0.7, 0.85, 0.9, 0.91, 0.95, 0.98, 0.97,0.98, 1, 1]
+    plt.plot(ntes, basel_values,f'{markers[0]}--',linewidth=lw,
+             c=our_colors['blue'], markersize=ms,alpha=1)
+    plt.plot(range(len(staticQK)), staticQK, f'{markers[2]}--',markersize=ms,c=our_colors['yellow'],linewidth=lw, alpha=1)
+    plt.title(f'USPTO-{dataset}')
+    plt.xlabel('number of training examples')
+    plt.ylabel('top-100 test-accuracy')
+    plt.legend([model_legend, 'Fortunato et al.','FNN baseline',"FPM baseline", #static${\\xi X}: \\dfrac{|{\\xi} \\cap {X}|}{|{X}|}$
+                'test sample proportion'])
+    if draw_std:
+        alpha=0.2
+        plot_std(asdf, n_samples, z=z, color=our_colors['red'], alpha=alpha)
+        plot_std(pretr_values, n_samples, z=z, color=our_colors['green'], alpha=alpha)
+        plot_std(basel_values, n_samples, z=z, color=our_colors['blue'], alpha=alpha)
+        plot_std(staticQK, n_samples, z=z, color=our_colors['yellow'], alpha=alpha)
+    plt.xticks(range(13),xti);
+    plt.yticks(np.arange(0,1.05,0.1))
+    plt.grid('on', alpha=0.3)

mhnreact/retroeval.py ADDED Viewed

	@@ -0,0 +1,240 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl, Philipp Renz
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+Evaluation functions for single-step-retrosynthesis
+"""
+import sys
+import rdchiral
+from rdchiral.main import rdchiralRun, rdchiralReaction, rdchiralReactants
+import hashlib
+from rdkit import Chem
+import torch
+import numpy as np
+import pandas as pd
+from collections import defaultdict
+from copy import deepcopy
+from glob import glob
+import os
+import pickle
+from multiprocessing import Pool
+import hashlib
+import pickle
+import logging
+#import timeout_decorator
+def _cont_hash(fn):
+    with open(fn, 'rb') as f:
+        return hashlib.md5(f.read()).hexdigest()
+def load_templates_only(path, cache_dir='/tmp'):
+    arg_hash_base = 'load_templates_only' + path
+    arg_hash = hashlib.md5(arg_hash_base.encode()).hexdigest()
+    matches = glob(os.path.join(cache_dir, arg_hash+'*'))
+    if len(matches) > 1:
+        raise RuntimeError('Too many matches')
+    elif len(matches) == 1:
+        fn = matches[0]
+        content_hash = _cont_hash(path)
+        content_hash_file = os.path.basename(fn).split('_')[1].split('.')[0]
+        if content_hash_file == content_hash:
+            with open(fn, 'rb') as f:
+                return pickle.load(f)
+    df = pd.read_json(path)
+    template_dict = {}
+    for row in range(len(df)):
+        template_dict[df.iloc[row]['index']] = df.iloc[row].reaction_smarts
+    # cache the file
+    content_hash = _cont_hash(path)
+    fn = os.path.join(cache_dir, f"{arg_hash}_{content_hash}.p")
+    with open(fn, 'wb') as f:
+        pickle.dump(template_dict, f)
+def load_templates_v2(path, get_complete_df=False):
+    if get_complete_df:
+        df = pd.read_json(path)
+        return df
+    return load_templates_only(path)
+def canonicalize_reactants(smiles, can_steps=2):
+    if can_steps==0:
+        return smiles
+    mol = Chem.MolFromSmiles(smiles)
+    for a in mol.GetAtoms():
+        a.ClearProp('molAtomMapNumber')
+    smiles = Chem.MolToSmiles(mol, True)
+    if can_steps==1:
+        return smiles
+    smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles), True)
+    if can_steps==2:
+        return smiles
+    raise ValueError("Invalid can_steps")
+def load_test_set(fn):
+    df = pd.read_csv(fn, index_col=0)
+    test = df[df.dataset=='test']
+    test_product_smarts = list(test.prod_smiles) # we make predictions for these
+    for s in test_product_smarts:
+        assert len(s.split('.')) == 1
+        assert '>' not in s
+    test_reactants = [] # we want to predict these
+    for rs in list(test.rxn_smiles):
+        rs = rs.split('>>')
+        assert len(rs) == 2
+        reactants_ori, products = rs
+        reactants = reactants_ori.split('.')
+        products = products.split('.')
+        assert len(reactants) >= 1
+        assert len(products) == 1
+        test_reactants.append(reactants_ori)
+    return test_product_smarts, test_reactants
+#@timeout_decorator.timeout(1, use_signals=False)
+def time_out_rdchiralRun(temp, prod_rct, combine_enantiomers=False):
+    rxn = rdchiralReaction(temp)
+    return rdchiralRun(rxn, prod_rct, combine_enantiomers=combine_enantiomers)
+def _run_templates_rdchiral(prod_appl):
+    prod, applicable_templates = prod_appl
+    prod_rct = rdchiralReactants(prod) # preprocess reactants with rdchiral
+    results = {}
+    for idx, temp in applicable_templates:
+        temp = str(temp)
+        try:
+            results[(idx, temp)] = time_out_rdchiralRun(temp, prod_rct, combine_enantiomers=False)
+        except:
+            pass
+    return results
+def _run_templates_rdchiral_original(prod_appl):
+    prod, applicable_templates = prod_appl
+    prod_rct = rdchiralReactants(prod) # preprocess reactants with rdchiral
+    results = {}
+    rxn_cache = {}
+    for idx, temp in applicable_templates:
+        temp = str(temp)
+        if temp in rxn_cache:
+            rxn = rxn_cache[(temp)]
+        else:
+            try:
+              rxn = rdchiralReaction(temp)
+              rxn_cache[temp] = rxn
+            except:
+              rxn_cache[temp] = None
+              msg = temp+' error converting to rdchiralReaction'
+              logging.debug(msg)
+        try:
+            res = rdchiralRun(rxn, prod_rct, combine_enantiomers=False)
+            results[(idx, temp)] =  res
+        except:
+            pass
+    return results
+def run_templates(test_product_smarts, templates, appl, njobs=32, cache_dir='/tmp'):
+    appl_dict = defaultdict(list)
+    for i,j in zip(*appl):
+        appl_dict[i].append(j)
+    prod_appl_list = []
+    for prod_idx, prod in enumerate(test_product_smarts):
+        applicable_templates = [(idx, templates[idx]) for idx in appl_dict[prod_idx]]
+        prod_appl_list.append((prod, applicable_templates))
+    arg_hash = hashlib.md5(pickle.dumps(prod_appl_list)).hexdigest()
+    cache_file = os.path.join(cache_dir, arg_hash+'.p')
+    if os.path.isfile(cache_file):
+        with open(cache_file, 'rb') as f:
+            print('loading results from file',f)
+            all_results = pickle.load(f)
+    #find /tmp -type f \( ! -user root \) -atime +3 -delete
+    # to delete the tmp files that havent been accessed 3 days
+    else:
+        #with Pool(njobs) as pool:
+        #    all_results = pool.map(_run_templates_rdchiral, prod_appl_list)
+        from tqdm.contrib.concurrent import process_map
+        all_results = process_map(_run_templates_rdchiral, prod_appl_list, max_workers=njobs, chunksize=1, mininterval=2)
+        #with open(cache_file, 'wb') as f:
+        #    print('saving applicable_templates to cache', cache_file)
+        #    pickle.dump(all_results, f)
+    prod_idx_reactants = []
+    prod_temp_reactants = []
+    for prod, idx_temp_reactants in zip(test_product_smarts, all_results):
+        prod_idx_reactants.append({idx_temp[0]: r for idx_temp, r  in idx_temp_reactants.items()})
+        prod_temp_reactants.append({idx_temp[1]: r for idx_temp, r  in idx_temp_reactants.items()})
+    return prod_idx_reactants, prod_temp_reactants
+def sort_by_template(template_scores, prod_idx_reactants):
+    sorted_results = []
+    for i, predictions in enumerate(prod_idx_reactants):
+        score_row = template_scores[i]
+        appl_idxs = np.array(list(predictions.keys()))
+        if len(appl_idxs) == 0:
+            sorted_results.append([])
+            continue
+        scores = score_row[appl_idxs]
+        sorted_idxs = appl_idxs[np.argsort(scores)][::-1]
+        sorted_reactants = [predictions[idx] for idx in sorted_idxs]
+        sorted_results.append(sorted_reactants)
+    return sorted_results
+def no_dup_same_order(l):
+    return list({r: 0 for r in l}.keys())
+def flatten_per_product(sorted_results, remove_duplicates=True):
+    flat_results = [sum((r for r in row), []) for row in sorted_results]
+    if remove_duplicates:
+        flat_results =  [no_dup_same_order(row) for row in flat_results]
+    return flat_results
+def topkaccuracy(test_reactants, predicted_reactants, ks=[1], ret_ranks=False):
+    ks = [k if k is not None else 1e10 for k in ks]
+    ranks = []
+    for true, pred in zip(test_reactants, predicted_reactants):
+        try:
+            rank = pred.index(true) + 1
+        except ValueError:
+            rank = 1e15
+        ranks.append(rank)
+    ranks = np.array(ranks)
+    if ret_ranks:
+        return ranks
+    return [np.mean([ranks <= k]) for k in ks]

mhnreact/train.py ADDED Viewed

	@@ -0,0 +1,804 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+Training
+"""
+from .utils import str2bool, lgamma, multinom_gk, top_k_accuracy
+from .data import load_templates, load_dataset_from_csv, load_USPTO
+from .model import ModelConfig, MHN, StaticQK, SeglerBaseline, Retrosim
+from .molutils import convert_smiles_to_fp, FP_featurizer, smarts2appl, getTemplateFingerprint, disable_rdkit_logging
+from collections import defaultdict
+import argparse
+import os
+import numpy as np
+import pandas as pd
+import datetime
+import sys
+from time import time
+import matplotlib.pyplot as plt
+import torch
+import multiprocessing
+import warnings
+from joblib import Memory
+# Location of the joblib on-disk cache (a relative path, resolved against the working directory).
+cachedir = 'data/cache/'
+# NOTE(review): passing `bytes_limit` to the Memory constructor is reported as deprecated in
+# newer joblib releases - confirm against the joblib version this project pins.
+memory = Memory(cachedir, verbose=0, bytes_limit=80e9)
+def parse_args():
+    parser = argparse.ArgumentParser(description="Train MHNreact.",
+                                     epilog="--", prog="Train")
+    parser.add_argument('-f', type=str)
+    parser.add_argument('--model_type', type=str, default='mhn',
+                        help="Model-type: choose from 'segler', 'fortunato', 'mhn' or 'staticQK', default:'mhn'")
+    parser.add_argument("--exp_name", type=str, default='', help="experiment name, (added as postfix to the file-names)")
+    parser.add_argument("-d", "--dataset_type", type=str, default='sm',
+                        help="Input Dataset 'sm' for Scheider-USPTO-50k 'lg' for USPTO large or 'golden' or use keyword '--csv_path to specify an input file', default: 'sm'")
+    parser.add_argument("--csv_path", default=None, type=str, help="path to preprocessed trainings file + split columns, default: None")
+    parser.add_argument("--split_col", default='split', type=str, help="split column of csv, default: 'split'")
+    parser.add_argument("--input_col", default='prod_smiles', type=str, help="input column of csv, default: 'pro_smiles'")
+    parser.add_argument("--reactants_col", default='reactants_can', type=str, help="reactant colum of csv, default: 'reactants_can'")
+    parser.add_argument("--fp_type", type=str, default='morganc',
+                        help="Fingerprint type for the input only!: default: 'morgan', other options: 'rdk', 'ECFP', 'ECFC', 'MxFP', 'Morgan2CBF' or a combination of fingerprints with '+'' for max-pooling and '&' for concatination e.g. maccs+morganc+topologicaltorsion+erg+atompair+pattern+rdkc+layered+mhfp, default: 'morganc'")
+    parser.add_argument("--template_fp_type", type=str, default='rdk',
+                        help="Fingerprint type for the template fingerprint, default: 'rdk'")
+    parser.add_argument("--device", type=str, default='best',
+                        help="Device to run the model on, preferably 'cuda:0', default: 'best' (takes the gpu with most RAM)")
+    parser.add_argument("--fp_size", type=int, default=4096,
+                        help="fingerprint-size used for templates as well as for inputs, default: 4096")
+    parser.add_argument("--fp_radius", type=int, default=2, help="fingerprint-radius (if applicable to the fingerprint-type), default: 2")
+    parser.add_argument("--epochs", type=int, default=10, help='number of epochs, default: 10')
+    parser.add_argument("--pretrain_epochs", type=int, default=0,
+                        help="applicability-matrix pretraining epochs if applicable (e.g. fortunato model_type), default: 0")
+    parser.add_argument("--save_model", type=str2bool, default=False, help="save the model, default: False")
+    parser.add_argument("--dropout", type=float, default=0.2, help="dropout rate for encoders, default: 0.2")
+    parser.add_argument("--lr", type=float, default=5e-4, help="learning-rate, dfeault: 5e-4")
+    parser.add_argument("--hopf_beta", type=float, default=0.05, help="hopfield beta parameter, default: 0.125")
+    parser.add_argument("--hopf_asso_dim", type=int, default=512, help="association dimension, default: 512")
+    parser.add_argument("--hopf_num_heads", type=int, default=1, help="hopfield number of heads, default: 1")
+    parser.add_argument("--hopf_association_activation", type=str, default='None',
+                        help="hopfield association activation function recommended:'Tanh' or 'None', other: 'ReLU', 'SeLU', 'GeLU', or 'None' for more, see torch.nn, default: 'None'")
+    parser.add_argument("--norm_input", default=True, type=str2bool,
+                        help="input-normalization, default: True")
+    parser.add_argument("--norm_asso", default=True, type=str2bool,
+                        help="association-normalization, default: True")
+    # additional experimental hyperparams
+    parser.add_argument("--hopf_n_layers", default=1, type=int, help="Number of hopfield-layers, default: 1")
+    parser.add_argument("--mol_encoder_layers", default=1, type=int, help="Number of molecule-encoder layers, default: 1")
+    parser.add_argument("--temp_encoder_layers", default=1, type=int, help="Number of template-encoder layers, default: 1")
+    parser.add_argument("--encoder_af", default='ReLU', type=str,
+                        help="Encoder-NN intermediate activation function (before association_activation function), default: 'ReLU'")
+    parser.add_argument("--hopf_pooling_operation_head", default='mean', type=str, help="Pooling operation over heads default=max, (max, min, mean, ...), default: 'mean'")
+    parser.add_argument("--splitting_scheme", default=None, type=str, help="Splitting_scheme for non-csv-input, default: None, other options: 'class-freq', 'random'")
+    parser.add_argument("--concat_rand_template_thresh", default=-1, type=int, help="Concatinates a random vector to the tempalte-fingerprint at all templates with num_training samples > this threshold; -1 (default) means deactivated")
+    parser.add_argument("--repl_quotient", default=10, type=float, help="Only if --concat_rand_template_thresh >= 0 - Quotient of how much should be replaced by random in template-embedding, (default: 10)")
+    parser.add_argument("--verbose", default=False, type=str2bool, help="If verbose, will print out more stuff, default: False")
+    parser.add_argument("--batch_size", default=128, type=int, help="Training batch-size, default: 128")
+    parser.add_argument("--eval_every_n_epochs", default=1, type=int, help="Evaluate every _ epochs (Evaluation is costly for USPTO-Lg), default: 1")
+    parser.add_argument("--save_preds", default=False, type=str2bool, help="Save predictions for test split at the end of training, default: False")
+    parser.add_argument("--wandb", default=False, type=str2bool, help="Save to wandb; login required, default: False")
+    parser.add_argument("--seed", default=None, type=int, help="Seed your run to make it reproducible, defualt: None")
+    parser.add_argument("--template_fp_type2", default=None, type=str, help="experimental template_fp_type for layer 2, default: None")
+    parser.add_argument("--layer2weight",default=0.2, type=float, help="hopf-layer2 weight of p, default: 0.2")
+    parser.add_argument("--reactant_pooling", default='max', type=str, help="reactant pooling operation over template-fingerprint, default: 'max', options: 'min','mean','lgamma'")
+    parser.add_argument("--ssretroeval", default=False, type=str2bool, help="single-step retro-synthesis eval, default: False")
+    parser.add_argument("--addval2train", default=False, type=str2bool, help="adds the validation set to the training set, default: False")
+    parser.add_argument("--njobs",default=-1, type=int, help="Number of jobs, default: -1 -> uses all available")
+    parser.add_argument("--eval_only_loss", default=False, type=str2bool, help="if only loss should be evaluated (if top-k acc may be time consuming), default: False")
+    parser.add_argument("--only_templates_in_batch", default=False, type=str2bool, help="while training only forwards templates that are in the batch, default: False")
+    parser.add_argument("--plot_res", default=False, type=str2bool, help="Plotting results for USPTO-sm/lg, default: False")
+    args = parser.parse_args()
+    if args.njobs ==-1:
+        args.njobs = int(multiprocessing.cpu_count())
+    if args.device=='best':
+        from .utils import get_best_gpu
+        try:
+            args.device = get_best_gpu()
+        except:
+            print('couldnt get the best gpu, using cpu instead')
+            args.device = 'cpu'
+    # some save checks on model type
+    if (args.model_type == 'segler') & (args.pretrain_epochs>=1):
+        print('changing model type to fortunato because of pretraining_epochs>0')
+        args.model_type = 'fortunato'
+    if ((args.model_type == 'staticQK') or (args.model_type == 'retrosim')) & (args.epochs>1):
+        print('changing epochs to 1 (StaticQK is not lernable ;)')
+        args.epochs=1
+        if args.template_fp_type != args.fp_type:
+            print('fp_type must be the same as template_fp_type --> setting template_fp_type to fp_type')
+            args.template_fp_type = args.fp_type
+    if args.save_model & (args.fp_type=='MxFP'):
+        warnings.warn('Currently MxFP is not recommended for saving the model paprameter (fragment dict for others would need to be saved or compued again, currently not implemented)')
+    return args
+@memory.cache(ignore=['njobs'])
+def featurize_smiles(X, fp_type='morgan', fp_size=4096, fp_radius=2, njobs=1, verbose=False):
+    X_fp = {}
+    if fp_type in ['MxFP','MACCS','Morgan2CBF','Morgan4CBF', 'Morgan6CBF', 'ErG','AtomPair','TopologicalTorsion','RDK']:
+        print('computing', fp_type)
+        if fp_type == 'MxFP':
+            fp_types = ['MACCS','Morgan2CBF','Morgan4CBF', 'Morgan6CBF', 'ErG','AtomPair','TopologicalTorsion','RDK']
+        else:
+            fp_types = [fp_type]
+        remaining = int(fp_size)
+        for fp_type in fp_types:
+            print(fp_type,end=' ')
+            feat = FP_featurizer(fp_types=fp_type,
+                                 max_features= (fp_size//len(fp_types)) if (fp_type != fp_types[-1]) else remaining )
+            X_fp[f'train_{fp_type}'] = feat.fit(X['train'])
+            X_fp[f'valid_{fp_type}'] = feat.transform(X['valid'])
+            X_fp[f'test_{fp_type}'] = feat.transform(X['test'])
+            remaining -= X_fp[f'train_{fp_type}'].shape[1]
+            #X_fp['train'].shape, X_fp['test'].shape
+        X_fp['train'] = np.hstack([ X_fp[f'train_{fp_type}'] for fp_type in fp_types])
+        X_fp['valid'] = np.hstack([ X_fp[f'valid_{fp_type}'] for fp_type in fp_types])
+        X_fp['test'] = np.hstack([ X_fp[f'test_{fp_type}'] for fp_type in fp_types])
+    else: #fp_type in ['rdk','morgan','ecfp4','pattern','morganc','rdkc']:
+        if verbose: print('computing', fp_type, 'folded')
+        for split in X.keys():
+            X_fp[split] = convert_smiles_to_fp(X[split], fp_size=fp_size, which=fp_type, radius=fp_radius, njobs=njobs, verbose=verbose)
+    return X_fp
+def compute_template_fp(fp_len=2048, reactant_pooling='max', do_log=True):
+    """Pre-Compute the template-fingerprint"""
+    # combine them to one fingerprint
+    comb_template_fp = np.zeros((max(template_list.keys())+1,fp_len if reactant_pooling!='concat' else fp_len*6))
+    for i in template_list:
+        tpl = template_list[i]
+        try:
+            pr, rea = str(tpl).split('>>')
+            idxx = temp_part_to_fp[pr]
+            prod_fp = templates_fp['fp'][idxx]
+        except:
+            print('err', pr, end='\r')
+            prod_fp = np.zeros(fp_len)
+        rea_fp = templates_fp['fp'][[temp_part_to_fp[r] for r in str(rea).split('.')]] # max-pooling
+        if reactant_pooling=='only_product':
+            rea_fp = np.zeros(fp_len)
+        if reactant_pooling=='max':
+            rea_fp = np.log(1 + rea_fp.max(0))
+        elif reactant_pooling=='mean':
+            rea_fp = np.log(1 + rea_fp.mean(0))
+        elif reactant_pooling=='sum':
+            rea_fp = np.log(1 + rea_fp.mean(0))
+        elif reactant_pooling=='lgamma':
+            rea_fp = multinom_gk(rea_fp, axis=0)
+        elif reactant_pooling=='concat':
+            rs = str(rea).split('.')
+            rs.sort()
+            for ii, r in enumerate(rs):
+                idx = temp_part_to_fp[r]
+                rea_fp = templates_fp['fp'][idx]
+                comb_template_fp[i, (fp_len*(ii+1)):(fp_len*(ii+2))] = np.log(1 + rea_fp)
+        comb_template_fp[i,:prod_fp.shape[0]] = np.log(1 + prod_fp) #- rea_fp*0.5
+        if reactant_pooling!='concat':
+            #comb_template_fp[i] = multinom_gk(np.stack([np.log(1+prod_fp), rea_fp]))
+            #comb_template_fp[i,fp_len:] = rea_fp
+            comb_template_fp[i,:rea_fp.shape[0]] = comb_template_fp[i, :rea_fp.shape[0]] - rea_fp*0.5
+    return comb_template_fp
+def set_up_model(args, template_list=None):
+    hpn_config = ModelConfig(num_templates = int(max(template_list.keys()))+1,
+                             #len(template_list.values()),  #env.num_templates, #
+                             dropout=args.dropout,
+                             fingerprint_type=args.fp_type,
+                             template_fp_type = args.template_fp_type,
+                             fp_size = args.fp_size,
+                             fp_radius= args.fp_radius,
+                             device=args.device,
+                             lr=args.lr,
+                             hopf_beta=args.hopf_beta,  #1/(128**0.5),#1/(2048**0.5),
+                             hopf_input_size=args.fp_size,
+                             hopf_output_size=None,
+                             hopf_num_heads=args.hopf_num_heads,
+                             hopf_asso_dim=args.hopf_asso_dim,
+                             hopf_association_activation = args.hopf_association_activation,  #or ReLU, Tanh works better, SELU, GELU
+                             norm_input = args.norm_input,
+                             norm_asso = args.norm_asso,
+                             hopf_n_layers= args.hopf_n_layers,
+                             mol_encoder_layers=args.mol_encoder_layers,
+                             temp_encoder_layers=args.temp_encoder_layers,
+                             encoder_af=args.encoder_af,
+                             hopf_pooling_operation_head = args.hopf_pooling_operation_head,
+                             batch_size=args.batch_size,
+                             )
+    print(hpn_config.__dict__)
+    if args.model_type=='segler': # baseline
+        clf = SeglerBaseline(hpn_config)
+    elif args.model_type=='mhn':
+        clf = MHN(hpn_config, layer2weight=args.layer2weight)
+    elif args.model_type=='fortunato': # pretraining with applicability-matrix
+        clf = SeglerBaseline(hpn_config)
+    elif args.model_type=='staticQK': # staticQK
+        clf = StaticQK(hpn_config)
+    elif args.model_type=='retrosim': # staticQK
+        clf = Retrosim(hpn_config)
+    else:
+        raise NotImplementedError
+    return clf, hpn_config
+def set_up_template_encoder(args, clf, label_to_n_train_samples=None, template_list=None):
+    if isinstance(clf, SeglerBaseline):
+        clf.templates = []
+    elif args.model_type=='staticQK':
+        clf.template_list = list(template_list.values())
+        clf.update_template_embedding(which=args.template_fp_type, fp_size=args.fp_size, radius=args.fp_radius, njobs=args.njobs)
+    elif args.model_type=='retrosim':
+        #clf.template_list = list(X['train'].values())
+        clf.fit_with_train(X_fp['train'], y['train'])
+    else:
+        import hashlib
+        PATH = './data/cache/'
+        if not os.path.exists(PATH):
+            os.mkdir(PATH)
+        fn_templ_emb = f'{PATH}templ_emb_{args.fp_size}_{args.template_fp_type}{args.fp_radius}_{len(template_list)}_{int(hashlib.sha512((str(template_list)).encode()).hexdigest(), 16)}.npy'
+        if (os.path.exists(fn_templ_emb)): # load the template embedding
+            print(f'loading tfp from file {fn_templ_emb}')
+            templ_emb = np.load(fn_templ_emb)
+            # !!! beware of different fingerprint types
+            clf.template_list = list(template_list.values())
+            if args.only_templates_in_batch:
+                clf.templates_np = templ_emb
+                clf.templates = None
+            else:
+                clf.templates = torch.from_numpy(templ_emb).float().to(clf.config.device)
+        else:
+            if args.template_fp_type=='MxFP':
+                clf.template_list = list(template_list.values())
+                clf.templates = torch.from_numpy(comb_template_fp).float().to(clf.config.device)
+                clf.set_templates_recursively()
+            elif args.template_fp_type=='Tfidf':
+                clf.template_list = list(template_list.values())
+                clf.templates = torch.from_numpy(tfidf_template_fp).float().to(clf.config.device)
+                clf.set_templates_recursively()
+            elif args.template_fp_type=='random':
+                clf.template_list = list(template_list.values())
+                clf.templates = torch.from_numpy(np.random.rand(len(template_list),args.fp_size)).float().to(clf.config.device)
+                clf.set_templates_recursively()
+            else:
+                clf.set_templates(list(template_list.values()), which=args.template_fp_type, fp_size=args.fp_size,
+                                  radius=args.fp_radius, learnable=False, njobs=args.njobs, only_templates_in_batch=args.only_templates_in_batch)
+                #if len(template_list)<100000:
+                np.save(fn_templ_emb, clf.templates_np if args.only_templates_in_batch else clf.templates.detach().cpu().numpy().astype(np.float16))
+        # concatinate the current fingerprint with a random fingerprint if the threshold is above
+        if (args.concat_rand_template_thresh != -1) & (args.repl_quotient>0):
+            REPLACE_FACTOR = int(args.repl_quotient) # default was 8
+            # fold the original fingerprint
+            pre_comp_templates = clf.templates_np if args.only_templates_in_batch else clf.templates.detach().cpu().numpy()
+            # mask of labels with mor than 49 training samples
+            l_mask = np.array([label_to_n_train_samples[k]>=args.concat_rand_template_thresh for k in template_list])
+            print(f'Num of templates with added rand-vect of size {pre_comp_templates.shape[1]//REPLACE_FACTOR} due to >=thresh ({args.concat_rand_template_thresh}):',l_mask.sum())
+            # remove the bits with the lowest variance
+            v = pre_comp_templates.var(0)
+            idx_lowest_var_half = v.argsort()[:(pre_comp_templates.shape[1]//REPLACE_FACTOR)]
+            # the new zero-init-vectors
+            pre = np.zeros([pre_comp_templates.shape[0], pre_comp_templates.shape[1]//REPLACE_FACTOR]).astype(np.float)
+            print(pre.shape, l_mask.shape, l_mask.sum()) #(616, 1700) (11790,) 519
+            print(pre_comp_templates.shape, len(template_list)) #(616, 17000) 616
+            # only the ones with >thresh will receive a random vect
+            pre[l_mask] = np.random.rand(l_mask.sum(), pre.shape[1])
+            pre_comp_templates[:,idx_lowest_var_half] = pre
+            #clf.templates = torch.from_numpy(pre_comp_templates).float().to(clf.config.device)
+            if pre_comp_templates.shape[0]<100000:
+                print('adding template_matrix to params')
+                param = torch.nn.Parameter(torch.from_numpy(pre_comp_templates).float(), requires_grad=False)
+                clf.register_parameter(name='templates+noise', param=param)
+                clf.templates = param.to(clf.config.device)
+                clf.set_templates_recursively()
+            else: #otherwise might cause memory issues
+                print('more than 100k templates')
+                if args.only_templates_in_batch:
+                    clf.templates = None
+                    clf.templates_np = pre_comp_templates
+                else:
+                    clf.templates = torch.from_numpy(pre_comp_templates).float()
+                    clf.set_templates_recursively()
+    # set's this for the first layer!!
+    if args.template_fp_type2=='MxFP':
+        print('first_layer template_fingerprint is set to MxFP')
+        clf.templates = torch.from_numpy(comb_template_fp).float().to(clf.config.device)
+    elif args.template_fp_type2=='Tfidf':
+        print('first_layer template_fingerprint is set to Tfidf')
+        clf.templates = torch.from_numpy(tfidf_template_fp).float().to(clf.config.device)
+    elif args.template_fp_type2=='random':
+        print('first_layer template_fingerprint is set to random')
+        clf.templates = torch.from_numpy(np.random.rand(len(template_list),args.fp_size)).float().to(clf.config.device)
+    elif args.template_fp_type2=='stfp':
+        print('first_layer template_fingerprint is set to stfp ! only works with 4096 fp_size')
+        tfp = getTemplateFingerprint(list(template_list.values()))
+        clf.templates = torch.from_numpy(tfp).float().to(clf.config.device)
+    return clf
+if __name__ == '__main__':
+    args = parse_args()
+    run_id = str(time()).split('.')[0]
+    fn_postfix = str(args.exp_name) + '_' + run_id
+    if args.wandb:
+        import wandb
+        wandb.init(project='mhn-react', entity='phseidl', name=args.dataset_type+'_'+args.model_type+'_'+fn_postfix, config=args.__dict__)
+    else:
+        wandb=None
+    if not args.verbose:
+        disable_rdkit_logging()
+    if args.seed is not None:
+        from .utils import seed_everything
+        seed_everything(args.seed)
+        print('seeded with',args.seed)
+    # load csv or data
+    if args.csv_path is None:
+        X, y = load_USPTO(which=args.dataset_type)
+        template_list = load_templates(which=args.dataset_type)
+    else:
+        X, y, template_list, test_reactants_can = load_dataset_from_csv(**vars(args))
+    if args.addval2train:
+        print('adding val to train')
+        X['train'] = [*X['train'],*X['valid']]
+        y['train'] = np.concatenate([y['train'],y['valid']])
+    splits = ['train', 'valid', 'test']
+    #TODO split up in seperate class
+    if args.splitting_scheme == 'class-freq':
+        X_all = np.concatenate([X[split] for split in splits], axis=0)
+        y_all = np.concatenate([y[split] for split in splits])
+        # sort class by frequency / assumes class-index is ordered (wich is mildely violated)
+        res = y_all.argsort()
+        # use same split proportions
+        cum_split_lens = np.cumsum([len(y[split]) for split in splits]) #cumulative split length
+        X['train'] = X_all[res[0:cum_split_lens[0]]]
+        y['train'] = y_all[res[0:cum_split_lens[0]]]
+        X['valid'] = X_all[res[cum_split_lens[0]:cum_split_lens[1]]]
+        y['valid'] = y_all[res[cum_split_lens[0]:cum_split_lens[1]]]
+        X['test'] = X_all[res[cum_split_lens[1]:]]
+        y['test'] = y_all[res[cum_split_lens[1]:]]
+        for split in splits:
+            print(split, y[split].shape[0], 'samples (', y[split].max(),'max label)')
+    if args.splitting_scheme == 'remove_once_in_train_and_not_in_test':
+        print('remove_once_in_train')
+        from collections import Counter
+        cc = Counter()
+        cc.update(y['train'])
+        classes_set_only_once_in_train = set(np.array(list(cc.keys()))[ (np.array(list(cc.values())))==1])
+        not_in_test = set(y['train']).union(y['valid']) - (set(y['test']))
+        classes_set_only_once_in_train = (classes_set_only_once_in_train.intersection(not_in_test))
+        remove_those_mask = np.array([yii in classes_set_only_once_in_train for yii in y['train']])
+        X['train'] = np.array(X['train'])[~remove_those_mask]
+        y['train'] = np.array(y['train'])[~remove_those_mask]
+        print(remove_those_mask.mean(),'%', remove_those_mask.sum(), 'samples removed')
+    if args.splitting_scheme == 'random':
+        print('random-splitting-scheme:8-1-1')
+        if args.ssretroeval:
+            print('ssretroeval not available')
+            raise NotImplementedError
+        import numpy as np
+        from sklearn.model_selection import train_test_split
+        def _unpack(lod):
+            r = []
+            for k,v in lod.items():
+                [r.append(i) for i in v]
+            return r
+        X_all = _unpack(X)
+        y_all = np.array( _unpack(y) )
+        X['train'], X['test'], y['train'], y['test'] = train_test_split(X_all, y_all, test_size=0.2, random_state=70135)
+        X['test'], X['valid'], y['test'], y['valid'] = train_test_split(X['test'], y['test'], test_size=0.5, random_state=70135)
+        zero_shot = set(y['test']).difference( set(y['train']).union(set(y['valid'])) )
+        zero_shot_mask = np.array([yi in zero_shot for yi in y['test']])
+        print(sum(zero_shot_mask))
+        #y['test'][zero_shot_mask] = list(zero_shot)[0] #not right but quick
+    if args.model_type=='staticQK' or args.model_type=='retrosim':
+        print('staticQK model: caution: use pattern, or rdk -fingerprint-embedding')
+    fp_size = args.fp_size
+    radius = args.fp_radius #quite important ;)
+    fp_embedding = args.fp_type
+    X_fp = featurize_smiles(X, fp_type=args.fp_type, fp_size=args.fp_size, fp_radius=args.fp_radius, njobs=args.njobs)
+    if args.template_fp_type=='MxFP' or (args.template_fp_type2=='MxFP'):
+        temp_part_to_fp = {}
+        for i in template_list:
+            tpl = template_list[i]
+            for part in str(tpl).split('>>'):
+                for p in str(part).split('.'):
+                    temp_part_to_fp[p]=None
+        for i, k in enumerate(temp_part_to_fp):
+            temp_part_to_fp[k] = i
+        fp_types = ['Morgan2CBF','Morgan4CBF', 'Morgan6CBF','AtomPair','TopologicalTorsion', 'Pattern', 'RDK']
+        #MACCS ErG don't work --> errors with explicit / inplicit valence
+        templates_fp = {}
+        remaining = args.fp_size
+        for fp_type in fp_types:
+            #print(fp_type, end='\t')
+            # if it's that last use up the remaining fps
+            te_feat = FP_featurizer(fp_types=fp_type,
+                                    max_features=(args.fp_size//len(fp_types)) if (fp_type != fp_types[-1]) else remaining,
+                                    log_scale=False
+                                    )
+            templates_fp[fp_type] = te_feat.fit(list(temp_part_to_fp.keys())[:], is_smarts=True)
+            #print(np.unique(templates_fp[fp_type]), end='\r')
+            remaining -= templates_fp[fp_type].shape[1]
+        templates_fp['fp'] = np.hstack([ templates_fp[f'{fp_type}'] for fp_type in fp_types])
+    if args.template_fp_type=='MxFP' or (args.template_fp_type2=='MxFP'):
+        comb_template_fp = compute_template_fp(fp_len= args.fp_size, reactant_pooling=args.reactant_pooling)
+    if args.template_fp_type=='Tfidf' or (args.template_fp_type2 == 'Tfidf'):
+        print('using tfidf template-fingerprint')
+        from sklearn.feature_extraction.text import TfidfVectorizer
+        corpus = (list(template_list.values()))
+        vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1,12), max_features=args.fp_size)
+        tfidf_template_fp = vectorizer.fit_transform(corpus).toarray()
+        tfidf_template_fp.shape
+    acutal_fp_size = X_fp['train'].shape[1]
+    if acutal_fp_size != args.fp_size:
+        args.fp_size = int(X_fp['train'].shape[1])
+        print('Warning: fp-size has changed to', acutal_fp_size)
+    label_to_n_train_samples = {}
+    n_train_samples_to_label = defaultdict(list)
+    n_templates = max(template_list.keys())+1 #max(max(y['train']), max(y['test']), max(y['valid']))
+    for i in range(n_templates):
+        n_train_samples = (y['train']==i).sum()
+        label_to_n_train_samples[i] = n_train_samples
+        n_train_samples_to_label[n_train_samples].append(i)
+    up_to = 11
+    n_samples = []
+    masks = []
+    ntes = range(up_to)
+    mask_dict = {}
+    for nte in ntes: # Number of training examples
+        split = f'nte_{nte}'
+        #print(split)
+        mask = np.zeros(y['test'].shape)
+        if isinstance(nte, int):
+            for label_with_nte in n_train_samples_to_label[nte]:
+                mask += (y['test'] == label_with_nte)
+        mask = mask>=1
+        masks.append(mask)
+        mask_dict[str(nte)] = mask
+        n_samples.append(mask.sum())
+    # for greater than 10 # >10
+    n_samples.append((np.array(masks).max(0)==0).sum())
+    mask_dict['>10'] = (np.array(masks).max(0)==0)
+    sum(n_samples), mask.shape
+    ntes = range(50) #to 49
+    for nte in ntes: # Number of training examples
+        split = f'nte_{nte}'
+        #print(split)
+        mask = np.zeros(y['test'].shape)
+        for label_with_nte in n_train_samples_to_label[nte]:
+            mask += (y['test'] == label_with_nte)
+        mask = mask>=1
+        masks.append(mask)
+    # for greater than 10 # >49
+    n_samples.append((np.array(masks).max(0)==0).sum())
+    mask_dict['>49'] = np.array(masks).max(0)==0
+    print(n_samples)
+    clf, hpn_config = set_up_model(args, template_list=template_list)
+    clf = set_up_template_encoder(args, clf, label_to_n_train_samples=label_to_n_train_samples, template_list=template_list)
+    if args.verbose:
+        print(clf.config.__dict__)
+        print(clf)
+    wda = torch.optim.AdamW(clf.parameters(), lr=args.lr, weight_decay=1e-2)
+    if args.wandb:
+        wandb.watch(clf)
+    # pretraining with applicablity matrix, if applicable
+    if args.model_type == 'fortunato' or args.pretrain_epochs>1:
+        print('pretraining on applicability-matrix -- loading the matrix')
+        _, y_appl = load_USPTO(args.dataset_type, is_appl_matrix=True)
+        if args.splitting_scheme == 'remove_once_in_train_and_not_in_test':
+            y_appl['train'] = y_appl['train'][~remove_those_mask]
+        # check random if the applicability is true for y
+        splt = 'train'
+        for i in range(500):
+            i = np.random.randint(len(y[splt]))
+            #assert ( y_appl[splt][i].indices == y[splt][i] ).sum()==1
+        print('pre-training (BCE-loss)')
+        for epoch in range(args.pretrain_epochs):
+            clf.train_from_np(X_fp['train'], X_fp['train'], y_appl['train'], use_dataloader=True, is_smiles=False,
+                          epochs=1, wandb=wandb, verbose=args.verbose, bs=args.batch_size,
+                          permute_batches=True, shuffle=True, optimizer=wda,
+                          only_templates_in_batch=args.only_templates_in_batch)
+            y_pred = clf.evaluate(X_fp['valid'], X_fp['valid'], y_appl['valid'],
+                                  split='pretrain_valid', is_smiles=False, only_loss=True,
+                                  bs=args.batch_size,wandb=wandb)
+            appl_acc = ((y_appl['valid'].toarray()) == (y_pred>0.5)).mean()
+            print(f'{epoch:2.0f} -- train_loss: {clf.hist["loss"][-1]:1.3f}, loss_valid: {clf.hist["loss_pretrain_valid"][-1]:1.3f}, train_acc: {appl_acc:1.5f}')
+    fn_hist = None
+    y_preds = None
+    for epoch in range(round(args.epochs / args.eval_every_n_epochs)):
+        if not isinstance(clf, StaticQK):
+            now = time()
+            clf.train_from_np(X_fp['train'], X_fp['train'], y['train'], use_dataloader=True, is_smiles=False,
+                          epochs=args.eval_every_n_epochs, wandb=wandb, verbose=args.verbose, bs=args.batch_size,
+                              permute_batches=True, shuffle=True, optimizer=wda, only_templates_in_batch=args.only_templates_in_batch)
+            if args.verbose: print(f'training took {(time()-now)/60:3.1f} min for {args.eval_every_n_epochs} epochs')
+        for split in ['valid', 'test']:
+            print(split, 'evaluating', end='\r')
+            now = time()
+            #only_loss = ((epoch%5)==4) if args.dataset_type=='lg' else True
+            y_preds = clf.evaluate(X_fp[split], X_fp[split], y[split], is_smiles=False, split=split, bs=args.batch_size, only_loss=args.eval_only_loss, wandb=wandb);
+            if args.verbose: print(f'eval {split} took',(time()-now)/60,'min')
+        if not isinstance(clf, StaticQK):
+            try:
+                print(f'{epoch:2.0f} -- train_loss: {clf.hist["loss"][-1]:1.3f}, loss_valid: {clf.hist["loss_valid"][-1]:1.3f}, val_t1acc: {clf.hist["t1_acc_valid"][-1]:1.3f}, val_t100acc: {clf.hist["t100_acc_valid"][-1]:1.3f}')
+            except:
+                pass
+        now = time()
+        ks = [1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100]
+        for nte in mask_dict: # Number of training examples
+            split = f'nte_{nte}'
+            #print(split)
+            mask = mask_dict[nte]
+            topkacc = top_k_accuracy(np.array(y['test'])[mask], y_preds[mask, :], k=ks, ret_arocc=False)
+            new_hist = {}
+            for k, tkacc in zip(ks, topkacc):
+                new_hist[f't{k}_acc_{split}'] = tkacc
+            #new_hist[(f'arocc_{split}')] = (arocc)
+            new_hist[f'steps_{split}'] = (clf.steps)
+            for k in new_hist:
+                clf.hist[k].append(new_hist[k])
+        if args.verbose: print(f'eval nte-test took',(time()-now)/60,'min')
+        fn_hist = clf.save_hist(prefix=f'USTPO_{args.dataset_type}_{args.model_type}_', postfix=fn_postfix)
+    if args.save_preds:
+        PATH = './data/preds/'
+        if not os.path.exists(PATH):
+            os.mkdir(PATH)
+        pred_fn = f'{PATH}USPTO_{args.dataset_type}_test_{args.model_type}_{fn_postfix}.npy'
+        print('saving predictions to',pred_fn)
+        np.save(pred_fn,y_preds)
+        args.save_preds = pred_fn
+    if args.save_model:
+        model_save_path = clf.save_model(prefix=f'USPTO_{args.dataset_type}_{args.model_type}_valloss{clf.hist.get("loss_valid",[-1])[-1]:1.3f}_',name_as_conf=False, postfix=fn_postfix)
+        # Serialize data into file:
+        import json
+        json.dump( args.__dict__, open( f"data/model/{fn_postfix}_args.json", 'w' ) )
+        json.dump( hpn_config.__dict__,
+                  open( f"data/model/{fn_postfix}_config.json", 'w' ) )
+        print('model saved to', model_save_path)
+    print(min(clf.hist.get('loss_valid',[-1])))
+    if args.plot_res:
+        from plotutils import plot_topk, plot_nte
+        plt.figure()
+        clf.plot_loss()
+        plt.draw()
+        plt.figure()
+        plot_topk(clf.hist, sets=['valid'])
+        if args.dataset_type=='sm':
+            baseline_val_res = {1:0.4061, 10:0.6827, 50: 0.7883, 100:0.8400}
+            plt.plot(list(baseline_val_res.keys()), list(baseline_val_res.values()), 'k.--')
+        plt.draw()
+        plt.figure()
+        best_cpt = np.array(clf.hist['loss_valid'])[::-1].argmin()+1
+        print(best_cpt)
+        try:
+            best_cpt = np.array(clf.hist['t10_acc_valid'])[::-1].argmax()+1
+            print(best_cpt)
+        except:
+            print('err with t10_acc_valid')
+        plot_nte(clf.hist, dataset=args.dataset_type.capitalize(), last_cpt=best_cpt, include_bar=True, model_legend=args.exp_name,
+                 n_samples=n_samples, z=1.96)
+        if os.path.exists('data/figs/'):
+            try:
+                os.mkdir(f'data/figs/{args.exp_name}/')
+            except:
+                pass
+            plt.savefig(f'data/figs/{args.exp_name}/training_examples_vs_top100_acc_{args.dataset_type}_{hash(str(args))}.pdf')
+        plt.draw()
+        fn_hist = clf.save_hist(prefix=f'USTPO_{args.dataset_type}_{args.model_type}_', postfix=fn_postfix)
+    if args.ssretroeval:
+        print('testing on the real test set ;)')
+        from .data import load_templates
+        from .retroeval import run_templates, topkaccuracy
+        from .utils import sort_by_template_and_flatten
+        a = list(template_list.keys())
+        #assert list(range(len(a))) == a
+        templates = list(template_list.values())
+        #templates = [*templates, *expert_templates]
+        template_product_smarts = [str(s).split('>')[0] for s in templates]
+        #execute all template
+        print('execute all templates')
+        test_product_smarts = [xi[0] for xi in X['test']] #added later
+        smarts2appl = memory.cache(smarts2appl, ignore=['njobs','nsplits', 'use_tqdm'])
+        appl = smarts2appl(test_product_smarts, template_product_smarts, njobs=args.njobs)
+        n_pairs = len(test_product_smarts) * len(template_product_smarts)
+        n_appl = len(appl[0])
+        print(n_pairs, n_appl, n_appl/n_pairs)
+        #forward
+        split = 'test'
+        print('len(X_fp[test]):',len(X_fp[split]))
+        y[split] = np.zeros(len(X[split])).astype(np.int)
+        clf.eval()
+        if y_preds is None:
+            y_preds = clf.evaluate(X_fp[split], X_fp[split], y[split], is_smiles=False,
+                               split='ttest', bs=args.batch_size, only_loss=True, wandb=None);
+        template_scores = y_preds #this should allready be test
+        ####
+        if y_preds.shape[1]>100000:
+            kth = 200
+            print(f'only evaluating top {kth} applicable predicted templates')
+            # only take top kth and multiply by applicability matrix
+            appl_mtrx = np.zeros_like(y_preds, dtype=bool)
+            appl_mtrx[appl[0], appl[1]] = 1
+            appl_and_topkth = ([], [])
+            for row in range(len(y_preds)):
+                argpreds = (np.argpartition(-(y_preds[row]*appl_mtrx[row]), kth, axis=0)[:kth])
+                # if there are less than kth applicable
+                mask = appl_mtrx[row][argpreds]
+                argpreds = argpreds[mask]
+                #if len(argpreds)!=kth:
+                #    print('changed to ', len(argpreds))
+                appl_and_topkth[0].extend([row for _ in range(len(argpreds))])
+                appl_and_topkth[1].extend(list(argpreds))
+            appl = appl_and_topkth
+        ####
+        print('running the templates')
+        run_templates = run_templates #memory.cache( ) ... allready cached to tmp
+        prod_idx_reactants, prod_temp_reactants =  run_templates(test_product_smarts, templates, appl, njobs=args.njobs)
+        #sorted_results = sort_by_template(template_scores, prod_idx_reactants)
+        #flat_results = flatten_per_product(sorted_results, remove_duplicates=True)
+        #now aglomerates over same outcome
+        flat_results = sort_by_template_and_flatten(y_preds, prod_idx_reactants, agglo_fun=sum)
+        accs = topkaccuracy(test_reactants_can, flat_results, [*list(range(1,101)), 100000])
+        mtrcs2 = {f't{k}acc_ttest':accs[k-1] for k in [1,2,3,5,10,20,50,100,101]}
+        if wandb:
+            wandb.log(mtrcs2)
+        print('Single-step retrosynthesis-evaluation, results on ttest:')
+        #print([k[:-6]+'|' for k in mtrcs2.keys()])
+        [print(k[:-6],end='\t') for k in mtrcs2.keys()]
+        print()
+        for k,v in mtrcs2.items():
+            print(f'{v*100:2.2f}',end='\t')
+    # save the history of this experiment
+    EXP_DIR = 'data/experiments/'
+    df = pd.DataFrame([args.__dict__])
+    df['min_loss_valid'] = min(clf.hist.get('loss_valid', [-1]))
+    df['min_loss_train'] = 0 if ((args.model_type=='staticQK') or (args.model_type=='retrosim')) else min(clf.hist.get('loss',[-1]))
+    try:
+        df['max_t1_acc_valid'] = max(clf.hist.get('t1_acc_valid', [0]))
+        df['max_t100_acc_valid'] = max(clf.hist.get('t100_acc_valid', [0]))
+    except:
+        pass
+    df['hist'] = [clf.hist]
+    df['n_samples'] = [n_samples]
+    df['fn_hist'] = fn_hist if fn_hist else None
+    df['fn_model'] = '' if not args.save_model else model_save_path
+    df['date'] = str(datetime.datetime.fromtimestamp(time()))
+    df['cmd'] = ' '.join(sys.argv[:])
+    if not os.path.exists(EXP_DIR):
+        os.mkdir(EXP_DIR)
+    df.to_csv(f'{EXP_DIR}{run_id}.tsv', sep='\t')
+    df

mhnreact/utils.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+General utility functions
+"""
+import argparse
+from collections import defaultdict
+import numpy as np
+import pandas as pd
+import math
+import torch
+# used and fastest version
def top_k_accuracy(y_true, y_pred, k=5, ret_arocc=False, ret_mrocc=False, verbose=False, count_equal_as_correct=False, eps_noise=0):
    """Compute top-k accuracy for a 2-D score matrix (numpy array or torch tensor).

    Partly from http://stephantul.github.io/python/pytorch/2020/09/18/fast_topk/

    For every row, the number of scores that are greater than (or, by default,
    greater than or equal to) the score of the true class is counted. A row is
    a top-k hit when that count is small enough.

    Args:
        y_true: class index per row; used as the column index into ``y_pred``.
        y_pred: scores indexed as ``y_pred[row, class]``.
        k: a single integer or an iterable of integers.
        ret_arocc: if True, additionally return the mean of the per-row counts
            plus one ("average rank of the correct choice").
        ret_mrocc: if True (and ``ret_arocc`` is False), additionally return
            the median of the per-row counts plus one. For tensors this is
            ``Tensor.median`` (lower middle value for an even number of rows);
            for arrays it is ``np.median``.
        verbose: print every top-k accuracy.
        count_equal_as_correct: count only strictly greater scores, so ties
            with the true class are hits (e.g. all scores 0 --> top-1 acc = 1).
            When False, ties count against the true class; the true entry
            itself is part of the count, which is why the threshold is ``k``
            instead of ``k - 1``.
        eps_noise: if > 0, uniform noise in [0, eps_noise) is added to
            ``y_pred`` before ranking (recommended e.g. 1e-10).

    Returns:
        The accuracy for a single ``k`` or a list of accuracies for several,
        optionally paired in a tuple with the average / median rank.
    """
    is_torch = torch.is_tensor(y_pred)

    if eps_noise > 0:
        if is_torch:
            # create the noise on the same device as the scores (CPU noise
            # cannot be added to a CUDA tensor)
            noise = torch.rand(y_pred.shape, device=y_pred.device)
        else:
            noise = np.random.rand(*y_pred.shape)
        y_pred = y_pred + noise * eps_noise

    # score of the true class per row, as a column for broadcasting
    true_scores = y_pred[range(len(y_pred)), y_true][:, None]
    if count_equal_as_correct:
        greater = (y_pred > true_scores).sum(1)  # how many are bigger
    else:
        greater = (y_pred >= true_scores).sum(1)  # how many are bigger or equal
    if is_torch:
        greater = greater.long()

    if isinstance(k, (int, np.integer)):
        k = [k]  # pack it into a list

    tkaccs = []
    for ki in k:
        threshold = ki - 1 if count_equal_as_correct else ki
        hits = greater <= threshold
        if is_torch:
            tkacc = hits.float().mean().detach().cpu().numpy()
        else:
            tkacc = hits.mean()
        tkaccs.append(tkacc)
        if verbose:
            print('Top', ki, 'acc:\t', str(tkacc)[:6])

    accs = tkaccs[0] if len(tkaccs) == 1 else tkaccs

    if ret_arocc:
        if is_torch:
            arocc = (greater.float().mean() + 1).detach().cpu().numpy()
        else:
            # numpy arrays have no .float(); the plain mean is already a float
            arocc = greater.mean() + 1
        return (accs, arocc)
    if ret_mrocc:
        if is_torch:
            mrocc = (greater.median() + 1).float().detach().cpu().numpy()
        else:
            # numpy arrays have no .median() method
            mrocc = np.median(greater) + 1
        return (accs, mrocc)
    return accs
def seed_everything(seed=70135):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and
    switches cuDNN to deterministic mode.
    From https://gist.github.com/KirillVladimirov/005ec7f762293d2321385580d3dbe335

    Args:
        seed: value handed to every seeding function.
    """
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # the remaining generators all take the seed as their single argument
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
def get_best_gpu():
    '''Return the device string ("cuda:<index>") of the GPU with the most total memory.

    From P. Neves.

    Raises:
        ValueError: if ``torch.cuda.is_available()`` is False.
    '''
    import torch

    if not torch.cuda.is_available():
        raise ValueError("No gpus were detected in this machine.")

    # total memory per visible device, in units of 1e9 bytes
    total_mem = [
        torch.cuda.get_device_properties(device_id).total_memory / 1e9
        for device_id in range(torch.cuda.device_count())
    ]
    best_id = total_mem.index(max(total_mem))
    return f"cuda:{best_id}"
def sort_by_template_and_flatten(template_scores, prod_idx_reactants, agglo_fun=sum):
    """Rank the outcomes of every product by their aggregated template scores.

    Args:
        template_scores: scores indexed as ``template_scores[product, template]``.
        prod_idx_reactants: per product index, a mapping from template index to
            an iterable of outcomes produced by that template.
        agglo_fun: reduces the list of scores collected for one outcome (all
            templates that produced it) to a single value; ``sum`` by default.

    Returns:
        One list per product with its outcomes ordered by descending aggregated
        score; an empty list for products without any outcome.
    """
    ranked_per_product = []
    for prod_i in range(len(template_scores)):
        # collect, per outcome, the scores of all templates that yield it
        pooled = {}
        for template_i, outcomes in prod_idx_reactants[prod_i].items():
            for outcome in outcomes:
                pooled.setdefault(outcome, []).append(template_scores[prod_i, template_i])

        if not pooled:
            ranked_per_product.append([])
            continue

        aggregated = {outcome: agglo_fun(scores) for outcome, scores in pooled.items()}
        table = pd.DataFrame.from_dict(aggregated, orient='index')
        ordered = table.sort_values(0, ascending=False)
        ranked_per_product.append(ordered.index.values.tolist())
    return ranked_per_product
def str2bool(v):
    """Parse a command-line style boolean.

    Adapted from https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse

    Args:
        v: a bool (returned unchanged) or a string, compared case-insensitively.
            The empty string and a single space count as True.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: if the string is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    truthy = {'yes', 'true', 't', 'y', '1', '', ' '}
    falsy = {'no', 'false', 'f', 'n', '0'}

    lowered = v.lower()
    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def lgamma(x):
    """Element-wise natural log of the absolute value of the Gamma function (``math.lgamma``)."""
    return math.lgamma(x)


# Same wrapping as decorating with ``@np.vectorize``, but with the output type
# fixed to float: without ``otypes`` np.vectorize raises on size-0 inputs and
# needs an extra call on the first element to infer the output type.
lgamma = np.vectorize(lgamma, otypes=[float])
def multinom_gk(array, axis=0):
    """Multinomial lgamma pooling over a given axis.

    Computes ``lgamma(sum(array) + 2) - sum(lgamma(array + 1))`` with both sums
    taken along ``axis``.
    NOTE(review): the ``+ 2`` in the first term differs from the ``+ 1`` of the
    usual log multinomial coefficient - confirm this is intended.
    """
    pooled_total = lgamma(np.sum(array, axis=axis) + 2)
    per_entry_total = np.sum(lgamma(array + 1), axis=axis)
    return pooled_total - per_entry_total

mhnreact/view.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# -*- coding: utf-8 -*-
+"""
+Author: Philipp Seidl
+        ELLIS Unit Linz, LIT AI Lab, Institute for Machine Learning
+        Johannes Kepler University Linz
+Contact: [email protected]
+Loading log-files from training
+"""
+from pathlib import Path
+import os
+import datetime
+import pandas as pd
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
def load_experiments(EXP_DIR = Path('data/experiments/')):
    """Load all experiment tables from a directory into one DataFrame.

    Every file in ``EXP_DIR`` whose last dot-separated name component is
    ``tsv`` is read as a tab-separated table (first column as index). The path
    stored in the first ``fn_hist`` entry is then opened and the first line of
    that file is evaluated to obtain the training history, which is stored in
    the ``hist`` column; the table's file name is stored in ``fn``. If loading
    the history fails, ``err`` is printed and the table is kept as it was at
    that point.

    Args:
        EXP_DIR: directory containing the ``.tsv`` files (``Path`` or ``str``).

    Returns:
        The concatenation of all tables with a fresh integer index, in sorted
        file-name order; an empty DataFrame if no table was found.
    """
    exp_dir = Path(EXP_DIR)  # also accept plain strings
    dfs = []
    # sorted: os.listdir returns entries in arbitrary order
    for fn in sorted(os.listdir(exp_dir)):
        print(fn, end='\r')
        if fn.split('.')[-1] != 'tsv':
            continue
        df = pd.read_csv(exp_dir / fn, sep='\t', index_col=0)
        try:
            with open(df['fn_hist'][0]) as f:
                # SECURITY NOTE(review): eval executes whatever the history
                # file contains - only load experiment folders you trust.
                hist = eval(f.readline())
            df['hist'] = [hist]
            df['fn'] = fn
        except Exception:
            # best effort: keep the table even when its history is unavailable
            # (KeyboardInterrupt / SystemExit are no longer swallowed)
            print('err')
        dfs.append(df)
    if not dfs:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)
def get_x(k, kw, operation='max', index=None):
    """Read one summary value from a history mapping, falling back to 0.

    Args:
        k: mapping from metric name to a sequence of values.
        kw: metric name to look up in ``k``.
        operation: name of the numpy function (looked up with
            ``getattr(np, operation)``) used to reduce the sequence when no
            ``index`` is given, e.g. 'max', 'min', 'argmin'.
        index: if not None, return ``k[kw][index]`` instead of reducing.

    Returns:
        The selected / reduced value, or 0 if the lookup or the reduction
        raises (missing key, ``k`` not subscriptable, index out of range, ...).

    Raises:
        AttributeError: if ``operation`` is not an attribute of numpy.
    """
    reducer = getattr(np, operation)
    try:
        values = k[kw]
        if index is not None:
            return values[index]
        return reducer(values)
    except Exception:
        # deliberate best effort for incomplete histories; unlike a bare
        # ``except`` this lets KeyboardInterrupt / SystemExit propagate
        return 0
def get_min_val_loss_idx(k):
    """Return the position of the smallest 'loss_valid' entry of history ``k``.

    Delegates to ``get_x`` with numpy's ``argmin`` (changed from argmax to
    argmin!!).
    """
    return get_x(k, kw='loss_valid', operation='argmin')
def get_tauc(hist):
    """Average the top-100 accuracy over the 't100_acc_nte_*' history entries.

    For each suffix 0..10 and '>10', the value ``get_x`` returns for
    ``t100_acc_nte_<suffix>`` with its default 'max' reduction is taken, and
    the mean of these twelve values is returned.

    Args:
        hist: history mapping as accepted by ``get_x``.
    """
    nte_bins = [*range(11), '>10']
    # takes max per bin
    # TODO: take the value at the checkpoint with the lowest validation loss
    #       (see get_min_val_loss_idx) instead; the index was computed here
    #       before but never used, so that dead call was dropped.
    return np.mean([get_x(hist, f't100_acc_nte_{nt}') for nt in nte_bins])
def get_stats_from_hist(df):
    """Add summary columns derived from the 'hist' column and return ``df``.

    The frame is modified in place. For every row, ``get_x`` is applied to
    the history with the given key and numpy reduction:

        0shot_acc       <- max of 't100_acc_nte_0'
        1shot_acc       <- max of 't100_acc_nte_1'
        >49shot_acc     <- max of 't100_acc_nte_>49'
        min_loss_valid  <- min of 'loss_valid'
    """
    # (new column, history key, numpy reduction), in creation order
    column_specs = (
        ('0shot_acc', 't100_acc_nte_0', 'max'),
        ('1shot_acc', 't100_acc_nte_1', 'max'),
        ('>49shot_acc', 't100_acc_nte_>49', 'max'),
        ('min_loss_valid', 'loss_valid', 'min'),
    )
    for column, key, reduction in column_specs:
        df[column] = df['hist'].apply(
            lambda h, key=key, reduction=reduction: get_x(h, key, reduction)
        )
    return df