File size: 10,257 Bytes
f8ea2c9
848ce1e
 
f8ea2c9
 
8bef29c
 
f8ea2c9
8bef29c
 
 
f8ea2c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bef29c
 
 
c3164df
8bef29c
 
bf92928
8bef29c
 
 
f8ea2c9
8bef29c
 
 
 
 
b898ea7
 
 
 
 
 
 
bf92928
 
 
 
 
 
 
 
 
 
 
 
8bef29c
 
f8ea2c9
8bef29c
 
b898ea7
 
 
 
bf92928
 
 
 
 
 
8132ec4
8bef29c
8132ec4
f8ea2c9
 
 
 
bdb5c5d
 
 
f8ea2c9
 
f8692f6
8132ec4
 
f8692f6
 
8bef29c
f8692f6
29fd41c
f8692f6
98a1fbe
f8692f6
98a1fbe
f8692f6
98a1fbe
29fd41c
848ce1e
 
98a1fbe
43ba66f
838b9d1
43ba66f
848ce1e
 
98a1fbe
 
 
 
 
 
 
 
848ce1e
8bef29c
 
b898ea7
 
d506e15
415a25e
f8ea2c9
 
1920c80
f8ea2c9
 
 
 
 
 
 
bdb5c5d
f8ea2c9
8bef29c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import os, yaml
import gradio as gr
import requests
import argparse

from PIL import Image

import numpy as np
import torch
from transformers import AutoModelForCausalLM

from huggingface_hub import hf_hub_download


## InstructIR Plugin ##
from insir_models import instructir
from insir_text.models import LanguageModel, LMHead

# File names of the InstructIR config and of the two checkpoints it needs.
CONFIG     = "eval5d.yml"
LM_MODEL   = "lm_instructir-7d.pt"
MODEL_NAME = "im_instructir-7d.pt"

# Download both checkpoints from the Hugging Face Hub into the working
# directory (image model first, then the language-model head), reusing the
# constants above so the names cannot drift apart.
hf_hub_download(repo_id="marcosv/InstructIR", filename=MODEL_NAME, local_dir="./")
hf_hub_download(repo_id="marcosv/InstructIR", filename=LM_MODEL, local_dir="./")

def dict2namespace(config):
    """Recursively turn a (possibly nested) dict into an ``argparse.Namespace``.

    Every key becomes an attribute of the returned namespace. A value that is
    itself a dict is converted the same way; any other value is stored as is.
    """
    ns = argparse.Namespace()
    for name, entry in config.items():
        setattr(ns, name, dict2namespace(entry) if isinstance(entry, dict) else entry)
    return ns


# Parse the YAML config file into a plain (nested) dict.
with open(os.path.join(CONFIG), "r") as f:
    config = yaml.safe_load(f)

# Wrap the dict so settings can be read as attributes (cfg.model.width, ...).
cfg = dict2namespace(config)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Build the InstructIR image model from the `model` section of the config.
ir_model = instructir.create_model(input_channels =cfg.model.in_ch, width=cfg.model.width, enc_blks = cfg.model.enc_blks, 
                            middle_blk_num = cfg.model.middle_blk_num, dec_blks = cfg.model.dec_blks, txtdim=cfg.model.textdim)
ir_model = ir_model.to(device)
print ("IMAGE MODEL CKPT:", MODEL_NAME)
# The checkpoint is deserialized on the CPU; strict=True makes a key mismatch
# between checkpoint and model an error instead of a silent partial load.
# NOTE(review): no .eval() call is visible for ir_model (or lm_head below) --
# confirm that is fine for inference with these architectures.
ir_model.load_state_dict(torch.load(MODEL_NAME, map_location="cpu"), strict=True)

# Set before the language model is created.
# Presumably this silences the tokenizers fork/parallelism warning -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Text side of InstructIR: a language model (named by cfg.llm.model) plus a
# head configured from the `llm` section of the config.
LMODEL = cfg.llm.model
language_model = LanguageModel(model=LMODEL)
lm_head = LMHead(embedding_dim=cfg.llm.model_dim, hidden_dim=cfg.llm.embd_dim, num_classes=cfg.llm.nclasses)
lm_head = lm_head.to(device)

print("LMHEAD MODEL CKPT:", LM_MODEL)
lm_head.load_state_dict(torch.load(LM_MODEL, map_location="cpu"), strict=True)

def process_img(image, prompt=None):
    """Restore ``image`` with InstructIR, guided by a text instruction.

    Args:
        image: Input picture; a PIL image or anything ``np.array`` converts to
            a height x width x channels array of 0-255 values. ``None`` (for
            example an empty image input in the UI) is accepted and yields
            ``None`` instead of raising.
        prompt: Restoration instruction. When omitted, ``chat`` is asked how
            to improve the image and its answer, followed by a fixed request
            sentence, is used as the instruction.

    Returns:
        The restored picture as a uint8 ``PIL.Image``, or ``None`` when no
        image was supplied.
    """
    if image is None:
        # Nothing to restore; bail out before chat()/np.array() choke on None.
        return None

    if prompt is None:
        prompt = chat("How to improve the quality of the image?", [], image, None, None, None)
        prompt += "Please help me improve its quality!"
        print(prompt)

    # Scale 0-255 values to [0, 1] and reorder HWC -> 1 x C x H x W.
    img = (np.array(image) / 255.).astype(np.float32)
    y = torch.tensor(img).permute(2, 0, 1).unsqueeze(0).to(device)

    lm_embd = language_model(prompt).to(device)

    with torch.no_grad():
        # The head returns two values; only the text embedding is needed here.
        text_embd, _ = lm_head(lm_embd)
        x_hat = ir_model(y, text_embd)

    # Remove only the batch axis: a bare squeeze() would also drop a height or
    # width of 1 and break the permute. Assumes the model output is
    # 1 x C x H x W like its input -- TODO confirm.
    restored_img = x_hat.squeeze(0).permute(1, 2, 0).clamp_(0, 1).cpu().detach().numpy()

    restored_img = (restored_img * 255.0).round().astype(np.uint8)  # float32 to uint8
    return Image.fromarray(restored_img)

## InstructIR Plugin ##
# Load the Co-Instruct chat model that chat() queries, in half precision.
# trust_remote_code=True runs the model code shipped in the Hub repository.
# NOTE(review): device_map pins everything to "cuda:0", whereas the InstructIR
# models use `device`, which falls back to the CPU -- confirm a GPU is always
# present where this script runs.
model = AutoModelForCausalLM.from_pretrained("q-future/co-instruct-preview", 
                                             trust_remote_code=True, 
                                             torch_dtype=torch.float16, 
                                             attn_implementation="eager",
                                             device_map={"":"cuda:0"})

# Prompt preamble (with one <|image|> placeholder per picture) for each
# supported number of images.
_IMAGE_PREAMBLES = {
    1: "USER: The input image: <|image|>",
    2: "USER: The first image: <|image|>\nThe second image: <|image|>",
    3: "USER: The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>",
    4: "USER: The first image: <|image|>\nThe second image: <|image|>\nThe third image:<|image|>\nThe fourth image:<|image|>",
}


def _build_chat_prompt(message, history, num_images):
    """Return the full prompt: image preamble, past turns, then the new question."""
    # Each finished turn is "<question> ASSISTANT:<answer></s>"; the pending
    # turn ends with an open " ASSISTANT:" for the model to complete.
    segments = [turn[0] + " ASSISTANT:" + turn[1] + "</s>" for turn in history]
    segments.append(message + " ASSISTANT:")
    # The preamble already carries the first "USER:" marker, so the marker is
    # only needed between consecutive turns.
    return _IMAGE_PREAMBLES[num_images] + "USER:".join(segments)


def chat(message, history, image_1, image_2, image_3, image_4):
    """Answer ``message`` about the uploaded image(s) with the Co-Instruct model.

    Args:
        message: The new user question.
        history: Previous turns as a sequence of ``[user_text, assistant_text]``
            pairs; may be empty or ``None``.
        image_1, image_2, image_3, image_4: Up to four images. Images are used
            in order up to (not including) the first slot that is ``None``, so
            a gap ends the list.

    Returns:
        The model's reply (the text after the last ``ASSISTANT:`` marker), or a
        short hint asking for an upload when ``image_1`` is missing.

    Every past turn appears in the prompt exactly once and in order, and each
    completed answer is terminated with ``</s>`` regardless of image count.
    """
    print(history)

    # Keep the leading run of provided images.
    images = []
    for candidate in (image_1, image_2, image_3, image_4):
        if candidate is None:
            break
        images.append(candidate)

    if not images:
        # Without this guard the model call would fail on an undefined image list.
        return "Please upload at least Image 1 before asking a question."

    message = _build_chat_prompt(message, history or [], len(images))

    print(message)

    return model.tokenizer.batch_decode(model.chat(message, images, max_new_tokens=600).clamp(0, 100000))[0].split("ASSISTANT:")[-1]

#### Image,Prompts examples
# Each row is [question, image path, image path or None].
# Presumably the values after the question fill the chat UI's image inputs in
# order -- verify against the ChatInterface wiring.
examples = [
    [
        "Which part of the image is relatively clearer, the upper part or the lower part? Please analyze in details.",
        "examples/sausage.jpg",
        None,
    ],
    [
        "Which image is noisy, and which one is with motion blur? Please analyze in details.",
        "examples/211.jpg",
        "examples/frog.png",
    ],
    [
        "What is the problem in this image, and how to fix it? Please answer my questions one by one.",
        "examples/lol_748.png",
        None,
    ],
]

#<h1 align="center"><a href="https://github.com/Q-Future/Q-Instruct"><img src="https://github.com/Q-Future/Q-Instruct/blob/main/q_instruct_logo.png?raw=true", alt="Q-Instruct (mPLUG-Owl-2)" border="0" style="margin: 0 auto; height: 85px;" /></a> </h1>


# Page title, reused for the browser tab so it is spelled in one place only.
title = "Co-Instruct-Plus🧑‍🏫🖌️"
with gr.Blocks(title=title) as demo:
    # Header shown at the top of the page: logo, credits and resource badges.
    title_markdown = ("""

<h1 align="center"><a href="https://github.com/Q-Future/Co-Instruct"><img src="https://raw.githubusercontent.com/Q-Future/Co-Instruct/main/co-instruct.png" alt="Co-Instruct" border="0" style="margin: 0 auto; height: 85px;" /></a> </h1>

<div align="center">Built upon <strong>Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models (CVPR 2024)</strong></div>

<div align="center">Built upon the Upgraded Version, Co-Instruct, supporting up to 4 images: <strong>Towards Open-ended Visual Quality Comparison (Arxiv 2024)</strong></div>

<div align="center">We also support <a href='https://huggingface.co/marcosv/InstructIR'>InstructIR</a> as PLUGIN to restore image!</div>
<h5 align="center"> Please find our more accurate visual scoring demo on <a href='https://huggingface.co/spaces/teowu/OneScorer'>[OneScorer]</a> (Q-Align)!</h5>
<div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    <strong>Q-Instruct Resources:</strong>
        <a href='https://github.com/Q-Future/Q-Instruct'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
        <a href="https://Q-Instruct.github.io/Q-Instruct/fig/Q_Instruct_v0_1_preview.pdf"><img src="https://img.shields.io/badge/Technical-Report-red"></a>
        <a href='https://github.com/Q-Future/Q-Instruct/stargazers'><img src='https://img.shields.io/github/stars/Q-Future/Q-Instruct.svg?style=social'></a>
    </div>
</div>
<div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    <strong>Co-Instruct Resources:</strong>
        <a href='https://github.com/Q-Future/Co-Instruct'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
        <a href="https://arxiv.org/pdf/2402.16641.pdf"><img src="https://img.shields.io/badge/Technical-Report-red"></a>
        <a href='https://github.com/Q-Future/Co-Instruct/stargazers'><img src='https://img.shields.io/github/stars/Q-Future/Co-Instruct.svg?style=social'></a>
    </div>
</div>
""")
    gr.Markdown(title_markdown)

    # Top row: the four optional images handed to chat() as extra inputs.
    with gr.Row():
        input_img_1 = gr.Image(type='pil', label="Image 1 (First image)")
        input_img_2 = gr.Image(type='pil', label="Image 2 (Second image)")
        input_img_3 = gr.Image(type='pil', label="Image 3 (Third image)")
        input_img_4 = gr.Image(type='pil', label="Image 4 (Fourth image)")

    with gr.Row():
        # Left (wider) column: the multi-image quality chat.
        with gr.Column(scale=2):
            gr.ChatInterface(fn = chat, additional_inputs=[input_img_1, input_img_2, input_img_3, input_img_4], theme="Soft", examples=examples)
        # Right column: one-click restoration through the InstructIR plugin.
        # Only the image is wired in, so process_img runs with prompt=None.
        with gr.Column(scale=1):
            input_image_ir = gr.Image(type="pil", label="Image for Auto Restoration")
            output_image_ir = gr.Image(type="pil", label="Output of Auto Restoration")
            gr.Interface(
                fn=process_img,
                inputs=[input_image_ir],
                outputs=[output_image_ir],
                examples=["examples/gopro.png", "examples/noise50.png", "examples/lol_748.png"],
            )
    # Launch is kept inside the Blocks context, as in the original script.
    demo.launch(share=True)