import subprocess
import os

## Hugging Face Spaces: install runtime deps at startup. Merge with os.environ
## so pip stays on PATH; FLASH_ATTENTION_SKIP_CUDA_BUILD skips compiling the
## CUDA extension at install time.
subprocess.run('pip install flash-attn --no-build-isolation',
               env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': 'TRUE'}, shell=True)
subprocess.run('pip install -U timm', shell=True)

import spaces
import torch
import argparse
import warnings
from rdkit import Chem
from rdkit.Chem import CanonSmiles
from rdkit.Chem import MolFromSmiles, MolToSmiles
from data_provider.pretrain_dm import PretrainDM
from data_provider.tune_dm import *  # star import; Collater and smiles_handler used below are expected to come from here
from model.opt_flash_attention import replace_opt_attn_with_flash_attn
from model.blip2_model import Blip2Model
from data_provider.data_utils import json_read, json_write
from data_provider.data_utils import smiles2data, reformat_smiles
import gradio as gr
from datetime import datetime

## disable tokenizers parallelism to avoid deadlocks when dataloader workers fork
os.environ["TOKENIZERS_PARALLELISM"] = "false"
## silence PyTorch Geometric's TypedStorage deprecation warning
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
## trade matmul precision for speed on Ampere GPUs (e.g. A5000)
torch.set_float32_matmul_precision('medium') # can be medium (bfloat16), high (tensorfloat32), highest (float32)

def smiles_split(string, separator='.'):
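    """Split a '.'-separated SMILES string into canonical components.

    Invalid fragments are skipped. Charged fragments are grouped until their
    formal charges cancel, so salt pairs such as '[Na+].[Cl-]' are emitted as
    a single component.
    """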
    string = str(string)
    mols = []
    for smi in string.split(separator):
        mol = MolFromSmiles(smi)
        if mol is None:
            continue  # Skip invalid SMILES strings
        mols.append(mol)

    parts = []
    current_part = []   # charged fragments awaiting their counter-ions
    charge_count = 0    # running net charge of current_part

    for mol in mols:
        charge = Chem.GetFormalCharge(mol)
        if charge == 0:
            # A neutral molecule flushes any pending charged group, then is
            # emitted on its own.
            if current_part:
                smiles = CanonSmiles('.'.join(MolToSmiles(m) for m in current_part))
                parts.append(smiles)
                current_part = []
                charge_count = 0
            parts.append(MolToSmiles(mol))
        else:
            # Accumulate charged fragments until their charges cancel, then
            # emit them together as one component.
            charge_count += charge
            current_part.append(mol)
            if charge_count == 0:
                smiles = CanonSmiles('.'.join(MolToSmiles(m) for m in current_part))
                parts.append(smiles)
                current_part = []
    # Flush leftovers whose charges never cancelled.
    if current_part:
        smiles = CanonSmiles('.'.join(MolToSmiles(m) for m in current_part))
        parts.append(smiles)

    return parts

def get_args():
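    """Parse CLI arguments; model- and data-specific flags are contributed by
    Blip2Model and PretrainDM."""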
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', type=str, default="main")
    parser.add_argument('--seed', type=int, default=42, help='random seed')
    # MM settings
    parser.add_argument('--mode', type=str, default='pretrain', choices=['pretrain', 'ft', 'eval', 'pretrain_eval'])
    parser.add_argument('--strategy_name', type=str, default='mydeepspeed')
    parser.add_argument('--iupac_prediction', action='store_true', default=False)
    parser.add_argument('--ckpt_path', type=str, default=None)
    # parser = Trainer.add_argparse_args(parser)
    parser = Blip2Model.add_model_specific_args(parser)  # add model args
    parser = PretrainDM.add_model_specific_args(parser)
    parser.add_argument('--accelerator', type=str, default='gpu')
    parser.add_argument('--devices', type=str, default='0,1,2,3')
    parser.add_argument('--precision', type=str, default='bf16-mixed')
    parser.add_argument('--downstream_task', type=str, default='action', choices=['action', 'synthesis', 'caption', 'chebi'])
    parser.add_argument('--max_epochs', type=int, default=10)
    parser.add_argument('--enable_flash', action='store_true', default=False)
    parser.add_argument('--disable_graph_cache', action='store_true', default=False)
    parser.add_argument('--generate_restrict_tokens', action='store_true', default=False)
    parser.add_argument('--train_restrict_tokens', action='store_true', default=False)
    parser.add_argument('--smiles_type', type=str, default='default', choices=['default', 'canonical', 'restricted', 'unrestricted', 'r_smiles'])
    parser.add_argument('--accumulate_grad_batches', type=int, default=1)
    parser.add_argument('--tqdm_interval', type=int, default=50)
    parser.add_argument('--check_val_every_n_epoch', type=int, default=1)
    args = parser.parse_args()

    if args.enable_flash:
        replace_opt_attn_with_flash_attn()
    return args
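
## Demo configuration: these values override the CLI defaults at launch
## (see the vars(args).update(app_config) call in __main__).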

app_config = {
    "init_checkpoint": "all_checkpoints/ckpt_tune_hybridFeb11_May31/last_converted.ckpt",
    "filename": "app",
    "opt_model": "facebook/galactica-1.3b",
    "num_workers": 4,
    "rxn_max_len": 512,
    "text_max_len": 512,
    "precision": "bf16-mixed",
    "max_inference_len": 512,
}

class InferenceRunner:
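    """Wraps a Blip2Model for single-reaction inference in the Gradio demo:
    prompt construction, molecule-graph collation, tokenization, generation,
    and caching of predictions to disk."""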
    def __init__(self, model, tokenizer, rxn_max_len, smi_max_len,
                 smiles_type='default', device='cuda', args=None):
        self.model = model
        self.rxn_max_len = rxn_max_len
        self.smi_max_len = smi_max_len
        self.tokenizer = tokenizer
        self.collater = Collater([], [])
        self.mol_ph = '<mol>' * args.num_query_token
        self.mol_token_id = tokenizer.mol_token_id
        self.is_gal = args.opt_model.find('galactica') >= 0
        self.device = device
        self.smiles_type = smiles_type
        self.args = args
        time_stamp = datetime.now().strftime("%Y.%m.%d-%H:%M")
        self.cache_dir = f'results/{self.args.filename}/{time_stamp}'
        os.makedirs(self.cache_dir, exist_ok=True)
    
    def make_query_dict(self, rxn_string):
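        """Parse a 'Reactants>Reagents>Product' string into a query dict.

        Each molecule gets a '$k$' placeholder ('$-1$' for the product) so the
        generated text can be mapped back to SMILES afterwards.
        """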
        try:
            reactant, solvent, product = rxn_string.split('>')
            reactant = smiles_split(reactant)
            product = smiles_split(product)
            solvent = smiles_split(solvent) if solvent else []
            assert reactant and product
        except Exception:
            raise gr.Error('Please input a valid reaction string of the form Reactants>Reagents>Product')

        extracted_molecules = {product[0]: "$-1$"}
        for mol in reactant+solvent:
            extracted_molecules[mol] = f"${len(extracted_molecules)}$"

        result_dict = {}
        result_dict['time_stamp'] = datetime.now().strftime("%Y.%m.%d %H:%M:%S.%f")[:-3]
        result_dict['reaction_string'] = rxn_string
        result_dict['REACTANT'] = reactant
        result_dict['SOLVENT'] = solvent
        result_dict['CATALYST'] = []
        result_dict['PRODUCT'] = product
        result_dict['extracted_molecules'] = extracted_molecules
        return result_dict
    
    def save_prediction(self, result_dict):
        os.makedirs(self.cache_dir, exist_ok=True)
        result_id = result_dict['time_stamp']
        result_path = os.path.join(self.cache_dir, f'{result_id}.json')
        json_write(result_path, result_dict)

    def make_prompt(self, param_dict, smi_max_len=128):
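        """Assemble the prompt, wrapping each molecule in
        [START_SMILES]...[END_SMILES] tags, and collect the SMILES strings
        that need graph features."""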
        smiles_list = []
        prompt = ''
        prompt += 'Reactants: '
        smiles_wrapper = lambda x: reformat_smiles(x, smiles_type=self.smiles_type)[:smi_max_len]
        for smi in param_dict['REACTANT']:
            prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
            smiles_list.append(smi)
            
        prompt += 'Product: '
        for smi in param_dict['PRODUCT']:
            prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
            smiles_list.append(smi)

        if param_dict['CATALYST']:
            prompt += 'Catalysts: '
            for smi in param_dict['CATALYST']:
                if smi in param_dict["extracted_molecules"]:
                    prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
                else:
                    prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
                smiles_list.append(smi)

        if param_dict['SOLVENT']:
            prompt += 'Solvents: '
            for smi in param_dict['SOLVENT']:
                if smi in param_dict["extracted_molecules"]:
                    prompt += f'{param_dict["extracted_molecules"][smi]}: [START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
                else:
                    prompt += f'[START_SMILES]{smiles_wrapper(smi)}[END_SMILES] '
                smiles_list.append(smi)

        # NOTE: 'Squence' (sic) is presumably the exact string used in the
        # training prompts; correcting the spelling here could degrade generation.
        prompt += 'Action Squence: '
        return prompt, smiles_list

    def get_action_elements(self, rxn_dict):
        input_text, smiles_list = self.make_prompt(rxn_dict, self.smi_max_len)

        graph_list = [smiles2data(smiles) for smiles in smiles_list]
        return graph_list, input_text

    @spaces.GPU
    @torch.no_grad()
    def predict(self, rxn_dict, temperature=1):
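        """Generate an action sequence for one reaction.

        @spaces.GPU attaches a GPU for the duration of the call (HF ZeroGPU),
        which is why the model is moved to CUDA here rather than at startup.
        """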
        graphs, prompt_tokens = self.tokenize(rxn_dict)
        self.model.blip2opt = self.model.blip2opt.to('cuda')
        result_dict = rxn_dict
        samples = {'graphs': graphs, 'prompt_tokens': prompt_tokens}
        prediction = self.model.blip2opt.generate(
            samples,
            do_sample=self.args.do_sample,
            num_beams=self.args.num_beams,
            max_length=self.args.max_inference_len,
            min_length=self.args.min_inference_len,
            num_captions=self.args.num_generate_captions,
            temperature=temperature,
            use_graph=True
        )[0]
        for k, v in result_dict['extracted_molecules'].items():
            prediction = prediction.replace(v, k)
        result_dict['prediction'] = prediction
        return result_dict
    
    def tokenize(self, rxn_dict):
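        """Collate molecule graphs and left-pad-tokenize the prompt, marking
        the <mol> placeholder positions via an `is_mol_token` mask."""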
        graph_list, input_text = self.get_action_elements(rxn_dict)
        # Guard against an empty graph list so `graphs` is always defined.
        graphs = self.collater(graph_list).to(self.device) if graph_list else None
        input_prompt = smiles_handler(input_text, self.mol_ph, self.is_gal)[0]

        ## deal with prompt
        self.tokenizer.padding_side = 'left'
        input_prompt_tokens = self.tokenizer(input_prompt, 
                                              truncation=True, 
                                              padding='max_length', 
                                              add_special_tokens=True,
                                              max_length=self.rxn_max_len, 
                                              return_tensors='pt', 
                                              return_attention_mask=True).to(self.device)
        is_mol_token = input_prompt_tokens.input_ids == self.mol_token_id
        input_prompt_tokens['is_mol_token'] = is_mol_token
        return graphs, input_prompt_tokens

def main(args):
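    """Build the model, pick the matching tokenizer, and launch the Gradio demo."""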
    device = torch.device('cuda')
    # model
    model = Blip2Model(args).to(device)
    if args.init_checkpoint:
        ckpt = torch.load(args.init_checkpoint, map_location='cpu')
        model.load_state_dict(ckpt['state_dict'], strict=False)
        print(f"loaded model from {args.init_checkpoint}")
    model.eval()

    print('total params:', sum(p.numel() for p in model.parameters()))

    if 'galactica' in args.opt_model or 't5' in args.opt_model:
        tokenizer = model.blip2opt.opt_tokenizer
    elif 'llama' in args.opt_model or 'vicuna' in args.opt_model:
        tokenizer = model.blip2opt.llm_tokenizer
    else:
        raise NotImplementedError(f'unsupported opt_model: {args.opt_model}')

    infer_runner = InferenceRunner(
        model=model,
        tokenizer=tokenizer,
        rxn_max_len=args.rxn_max_len,
        smi_max_len=args.smi_max_len,
        device=device,
        args=args
    )
    example_inputs = json_read('demo.json')
    example_inputs = [[e] for e in example_inputs]

    def online_chat(reaction_string, temperature=1):
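        """Gradio callback: parse the reaction, run prediction, cache the
        result, and put each predicted action on its own line."""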
        data_item = infer_runner.make_query_dict(reaction_string)
        result = infer_runner.predict(data_item, temperature=temperature)
        infer_runner.save_prediction(result)
        prediction = result['prediction'].replace(' ; ', ' ;\n')
        return prediction
    
    with gr.Blocks(css="""
            .center { display: flex; justify-content: center; }
        """) as demo:
        gr.HTML(
        """
        <center><h1><b>ReactXT</b></h1></center>
        <p style="font-size:20px; font-weight:bold;">This is the demo page of our ACL 2024 paper 
        <i>ReactXT: Understanding Molecular “Reaction-ship” via Reaction-Contextualized Molecule-Text Pretraining.</i></p>
        <center><img src="/file=./figures/frameworks.jpg" alt="Framework" style="width:1000px;"></center>
        <p style="font-size:16px;"> Please input one chemical reaction below, and we will generate the predicted experimental procedure.</p>
        <p style="font-size:16px;"> The reaction should be in form of <b>Reactants>Reagents>Product</b>.</p>
        """)

        reaction_string = gr.Textbox(placeholder="Input one reaction", label='Input Reaction')
        gr.Examples(example_inputs, [reaction_string,], fn=online_chat, label='Example Reactions')
        with gr.Row():
            btn = gr.Button("Submit")
            clear_btn = gr.Button("Clear")
        temperature = gr.Slider(0.1, 1, value=1, label='Temperature')
        with gr.Row():
            out = gr.Textbox(label="ReactXT's Output", placeholder="Predicted experimental procedure")
        btn.click(fn=online_chat, inputs=[reaction_string, temperature], outputs=[out])
        clear_btn.click(fn=lambda:("", ""), inputs=[], outputs=[reaction_string, out])

    demo.launch(allowed_paths=['/home/user/app/figures/'])



if __name__ == '__main__':
    args = get_args()
    vars(args).update(app_config)
    main(args)
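
## Example input for the demo, following the Reactants>Reagents>Product format
## (an illustrative reaction, not taken from demo.json):
##   CC(=O)Cl.OCC>CCN(CC)CC>CC(=O)OCC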