Spaces:

tsujuifu
/

ml-mgie

Running on Zero

App Files Files Community

tsujuifu committed on Feb 10

Commit

893b461

•

1 Parent(s): bdbb79e

update v2

Browse files

Files changed (7) hide show

README.md +1 -1
app.py +30 -22
conversation.py +370 -0
llava.py → mgie_llava.py +22 -19
pre-requirements.txt +4 -4
requirements.txt +4 -4
train.py +0 -831

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 👩‍🎨
 colorFrom: blue
 colorTo: gray
 sdk: gradio
-sdk_version: 3.37.0
 app_file: app.py
 license: other
 ---

 colorFrom: blue
 colorTo: gray
 sdk: gradio
+sdk_version: 4.12.0
 app_file: app.py
 license: other
 ---

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import os
-# os.system('cp -r ./_ckpt/LLaVA-7B-v1 /data/LLaVA-7B-v1'), os.system('cp -r ./_ckpt/mgie_7b /data/mgie_7b')
-os.system('ls /data'), os.system('df -h /data')
-[os.system('mv llava.py /home/user/.pyenv/versions/3.10.13/lib/python3.10/site-packages/llava/model/llava.py'),
- os.system('mv train.py /home/user/.pyenv/versions/3.10.13/lib/python3.10/site-packages/llava/train/train.py')]
 from PIL import Image
@@ -11,8 +11,8 @@ import numpy as np
 import torch as T
 import transformers, diffusers
-from llava.conversation import conv_templates
-from llava.model import *
 import gradio as gr
@@ -39,7 +39,7 @@ DEFAULT_IMAGE_TOKEN = '<image>'
 DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
 DEFAULT_IM_START_TOKEN = '<im_start>'
 DEFAULT_IM_END_TOKEN = '<im_end>'
-PATH_LLAVA = '/data/LLaVA-7B-v1'
 tokenizer = transformers.AutoTokenizer.from_pretrained(PATH_LLAVA)
 model = LlavaLlamaForCausalLM.from_pretrained(PATH_LLAVA, low_cpu_mem_usage=True, torch_dtype=T.float16, use_cache=True).cuda()
@@ -48,7 +48,7 @@ image_processor = transformers.CLIPImageProcessor.from_pretrained(model.config.m
 tokenizer.padding_side = 'left'
 tokenizer.add_tokens(['[IMG0]', '[IMG1]', '[IMG2]', '[IMG3]', '[IMG4]', '[IMG5]', '[IMG6]', '[IMG7]'], special_tokens=True)
 model.resize_token_embeddings(len(tokenizer))
-ckpt = T.load('/data/mgie_7b/mllm.pt', map_location='cpu')
 model.load_state_dict(ckpt, strict=False)
 mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
@@ -65,15 +65,17 @@ if mm_use_im_start_end: vision_config.im_start_token, vision_config.im_end_token
 image_token_len = (vision_config.image_size//vision_config.patch_size)**2
 _ = model.eval()
-EMB = ckpt['emb'].cuda()
-with T.inference_mode(): NULL = model.edit_head(T.zeros(1, 8, 4096).half().to('cuda'), EMB)
 pipe = diffusers.StableDiffusionInstructPix2PixPipeline.from_pretrained('timbrooks/instruct-pix2pix', torch_dtype=T.float16).to('cuda')
 pipe.set_progress_bar_config(disable=True)
-pipe.unet.load_state_dict(T.load('/data/mgie_7b/unet.pt', map_location='cpu'))
 print('--init MGIE--')
 def go_mgie(img, txt, seed, cfg_txt, cfg_img):
     img, seed = crop_resize(Image.fromarray(img).convert('RGB')), int(seed)
     inp = img
@@ -87,6 +89,7 @@ def go_mgie(img, txt, seed, cfg_txt, cfg_img):
     txt, mask = T.as_tensor(txt['input_ids']), T.as_tensor(txt['attention_mask'])
     with T.inference_mode():
         out = model.generate(txt.unsqueeze(dim=0).cuda(), images=img.half().unsqueeze(dim=0).cuda(), attention_mask=mask.unsqueeze(dim=0).cuda(),
                              do_sample=False, max_new_tokens=96, num_beams=1, no_repeat_ngram_size=3,
                              return_dict_in_generate=True, output_hidden_states=True)
@@ -98,6 +101,7 @@ def go_mgie(img, txt, seed, cfg_txt, cfg_img):
         hid = hid[p:p+8]
         out = remove_alter(tokenizer.decode(out))
         emb = model.edit_head(hid.unsqueeze(dim=0), EMB)
         res = pipe(image=inp, prompt_embeds=emb, negative_prompt_embeds=NULL,
                    generator=T.Generator(device='cuda').manual_seed(seed), guidance_scale=cfg_txt, image_guidance_scale=cfg_img).images[0]
@@ -105,14 +109,14 @@ def go_mgie(img, txt, seed, cfg_txt, cfg_img):
     return res, out
 def go_example(seed, cfg_txt, cfg_img):
-    txt = ['make the frame red', 'turn the day into night', 'give him a beard', 'make cottage a mansion',
            'remove yellow object from dogs paws', 'change the hair from red to blue', 'remove the text', 'increase the image contrast',
            'remove the people in the background', 'please make this photo professional looking', 'darken the image, sharpen it', 'photoshop the girl out',
            'make more brightness', 'take away the brown filter form the image', 'add more contrast to simulate more light', 'dark on rgb',
            'make the face happy', 'change view as ocean', 'replace basketball with soccer ball', 'let the floor be made of wood']
-    i = T.randint(len(txt), (1, )).item()
-    return './_input/%d.jpg'%(i), txt[i], seed, cfg_txt, cfg_img
 go_mgie(np.array(Image.open('./_input/0.jpg').convert('RGB')), 'make the frame red', 13331, 7.5, 1.5)
 print('--init GO--')
@@ -120,25 +124,29 @@ print('--init GO--')
 with gr.Blocks() as app:
     gr.Markdown(
         """
-        🔔 we will have a maintenance at 3 a.m. (PST)
         # [ICLR\'24] Guiding Instruction-based Image Editing via Multimodal Large Language Models<br>
         🔔 this demo is hosted by [Tsu-Jui Fu](https://github.com/tsujuifu/pytorch_mgie)<br>
         🔔 a black image means that the output did not pass the [safety checker](https://huggingface.co/CompVis/stable-diffusion-safety-checker)<br>
-        🔔 if the queue is full (*this app is too busy*), you can also try it [here](http://128.111.41.13:7122)<br>
         🔔 if the building process takes too long, please try refreshing the page
         """
     )
     with gr.Row(): inp, res = [gr.Image(height=384, width=384, label='Input Image', interactive=True),
-                               gr.Image(height=384, width=384, label='Goal Image', interactive=False)]
     with gr.Row(): txt, out = [gr.Textbox(label='Instruction', interactive=True),
                                gr.Textbox(label='Expressive Instruction', interactive=False)]
     with gr.Row(): seed, cfg_txt, cfg_img = [gr.Number(value=13331, label='Seed', interactive=True),
                                              gr.Number(value=7.5, label='Text CFG', interactive=True),
                                              gr.Number(value=1.5, label='Image CFG', interactive=True)]
-    with gr.Row(): btn_sub, btn_exp = [gr.Button('Submit'),
-                                       gr.Button('Example')]
-    btn_sub.click(fn=go_mgie, inputs=[inp, txt, seed, cfg_txt, cfg_img], outputs=[res, out])
     btn_exp.click(fn=go_example, inputs=[seed, cfg_txt, cfg_img], outputs=[inp, txt, seed, cfg_txt, cfg_img])
-app.queue(concurrency_count=1, max_size=75), app.launch()

 import os
+import huggingface_hub, spaces
+huggingface_hub.snapshot_download(repo_id='tsujuifu/ml-mgie', repo_type='model', local_dir='_ckpt', local_dir_use_symlinks=False)
+os.system('ls _ckpt')
 from PIL import Image
 import torch as T
 import transformers, diffusers
+from conversation import conv_templates
+from mgie_llava import *
 import gradio as gr
 DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
 DEFAULT_IM_START_TOKEN = '<im_start>'
 DEFAULT_IM_END_TOKEN = '<im_end>'
+PATH_LLAVA = '_ckpt/LLaVA-7B-v1'
 tokenizer = transformers.AutoTokenizer.from_pretrained(PATH_LLAVA)
 model = LlavaLlamaForCausalLM.from_pretrained(PATH_LLAVA, low_cpu_mem_usage=True, torch_dtype=T.float16, use_cache=True).cuda()
 tokenizer.padding_side = 'left'
 tokenizer.add_tokens(['[IMG0]', '[IMG1]', '[IMG2]', '[IMG3]', '[IMG4]', '[IMG5]', '[IMG6]', '[IMG7]'], special_tokens=True)
 model.resize_token_embeddings(len(tokenizer))
+ckpt = T.load('_ckpt/mgie_7b/mllm.pt', map_location='cpu')
 model.load_state_dict(ckpt, strict=False)
 mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
 image_token_len = (vision_config.image_size//vision_config.patch_size)**2
 _ = model.eval()
 pipe = diffusers.StableDiffusionInstructPix2PixPipeline.from_pretrained('timbrooks/instruct-pix2pix', torch_dtype=T.float16).to('cuda')
 pipe.set_progress_bar_config(disable=True)
+pipe.unet.load_state_dict(T.load('_ckpt/mgie_7b/unet.pt', map_location='cpu'))
 print('--init MGIE--')
+@spaces.GPU(enable_queue=True)
 def go_mgie(img, txt, seed, cfg_txt, cfg_img):
+    EMB = ckpt['emb'].cuda()
+    with T.inference_mode(): NULL = model.edit_head(T.zeros(1, 8, 4096).half().to('cuda'), EMB)
     img, seed = crop_resize(Image.fromarray(img).convert('RGB')), int(seed)
     inp = img
     txt, mask = T.as_tensor(txt['input_ids']), T.as_tensor(txt['attention_mask'])
     with T.inference_mode():
+        _ = model.cuda()
         out = model.generate(txt.unsqueeze(dim=0).cuda(), images=img.half().unsqueeze(dim=0).cuda(), attention_mask=mask.unsqueeze(dim=0).cuda(),
                              do_sample=False, max_new_tokens=96, num_beams=1, no_repeat_ngram_size=3,
                              return_dict_in_generate=True, output_hidden_states=True)
         hid = hid[p:p+8]
         out = remove_alter(tokenizer.decode(out))
+        _ = model.cuda()
         emb = model.edit_head(hid.unsqueeze(dim=0), EMB)
         res = pipe(image=inp, prompt_embeds=emb, negative_prompt_embeds=NULL,
                    generator=T.Generator(device='cuda').manual_seed(seed), guidance_scale=cfg_txt, image_guidance_scale=cfg_img).images[0]
     return res, out
 def go_example(seed, cfg_txt, cfg_img):
+    ins = ['make the frame red', 'turn the day into night', 'give him a beard', 'make cottage a mansion',
            'remove yellow object from dogs paws', 'change the hair from red to blue', 'remove the text', 'increase the image contrast',
            'remove the people in the background', 'please make this photo professional looking', 'darken the image, sharpen it', 'photoshop the girl out',
            'make more brightness', 'take away the brown filter form the image', 'add more contrast to simulate more light', 'dark on rgb',
            'make the face happy', 'change view as ocean', 'replace basketball with soccer ball', 'let the floor be made of wood']
+    i = T.randint(len(ins), (1, )).item()
+    return './_input/%d.jpg'%(i), ins[i], seed, cfg_txt, cfg_img
 go_mgie(np.array(Image.open('./_input/0.jpg').convert('RGB')), 'make the frame red', 13331, 7.5, 1.5)
 print('--init GO--')
 with gr.Blocks() as app:
     gr.Markdown(
         """
         # [ICLR\'24] Guiding Instruction-based Image Editing via Multimodal Large Language Models<br>
         🔔 this demo is hosted by [Tsu-Jui Fu](https://github.com/tsujuifu/pytorch_mgie)<br>
         🔔 a black image means that the output did not pass the [safety checker](https://huggingface.co/CompVis/stable-diffusion-safety-checker)<br>
+        🔔 if the queue is full (*no GPU available*), you can also try it [here](http://128.111.41.13:7122)<br>
         🔔 if the building process takes too long, please try refreshing the page
         """
     )
     with gr.Row(): inp, res = [gr.Image(height=384, width=384, label='Input Image', interactive=True),
+                               gr.Image(height=384, width=384, label='Goal Image', interactive=True)]
     with gr.Row(): txt, out = [gr.Textbox(label='Instruction', interactive=True),
                                gr.Textbox(label='Expressive Instruction', interactive=False)]
     with gr.Row(): seed, cfg_txt, cfg_img = [gr.Number(value=13331, label='Seed', interactive=True),
                                              gr.Number(value=7.5, label='Text CFG', interactive=True),
                                              gr.Number(value=1.5, label='Image CFG', interactive=True)]
+    with gr.Row(): btn_exp, btn_sub = [gr.Button('More Example'), gr.Button('Submit')]
     btn_exp.click(fn=go_example, inputs=[seed, cfg_txt, cfg_img], outputs=[inp, txt, seed, cfg_txt, cfg_img])
+    btn_sub.click(fn=go_mgie, inputs=[inp, txt, seed, cfg_txt, cfg_img], outputs=[res, out])
+    ins = ['make the frame red', 'turn the day into night', 'give him a beard', 'make cottage a mansion',
+           'remove yellow object from dogs paws', 'change the hair from red to blue', 'remove the text', 'increase the image contrast',
+           'remove the people in the background', 'please make this photo professional looking', 'darken the image, sharpen it', 'photoshop the girl out',
+           'make more brightness', 'take away the brown filter form the image', 'add more contrast to simulate more light', 'dark on rgb',
+           'make the face happy', 'change view as ocean', 'replace basketball with soccer ball', 'let the floor be made of wood']
+    gr.Examples(examples=[['./_input/%d.jpg'%(i), ins[i]] for i in [1, 5, 8, 14, 16]], inputs=[inp, txt])
+app.launch()

conversation.py ADDED Viewed

	@@ -0,0 +1,370 @@

+# modified from https://github.com/haotian-liu/LLaVA/blob/7ace501183c4bdec6052ec1a30039cdc3242a67c/llava/conversation.py
+import dataclasses
+from enum import auto, Enum
+from typing import List, Tuple
+class SeparatorStyle(Enum):
+    """Selects which separator layout Conversation.get_prompt uses."""
+    # Explicit values equal what auto() assigns: consecutive integers from 1.
+    SINGLE = 1
+    TWO = 2
+    MPT = 3
+@dataclasses.dataclass
+class Conversation:
+    """Conversation history plus the rules for rendering it as a prompt.
+
+    Each message is a ``[role, text]`` pair. ``text`` may instead be a
+    ``(text, image, image_process_mode)`` tuple; only its text part is put
+    into the prompt, the image is used by ``get_images`` and
+    ``to_gradio_chatbot``.
+    """
+    # Text placed at the very start of every prompt.
+    system: str
+    # Role names, e.g. ("USER", "ASSISTANT").
+    roles: List[str]
+    # History as [role, message] pairs; must be a list for append_message
+    # to work (copy() always produces a list).
+    messages: List[List[str]]
+    # Number of leading messages skipped by get_images/to_gradio_chatbot.
+    offset: int
+    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+    # Separator appended after each message (see get_prompt for per-style use).
+    sep: str = "###"
+    # Second separator, only read by the TWO style.
+    sep2: str = None
+    version: str = "Unknown"
+    skip_next: bool = False
+    @staticmethod
+    def _text_of(message):
+        """Return the text of a message that may be a (text, image, mode) tuple."""
+        if isinstance(message, tuple):
+            message, _, _ = message
+        return message
+    @staticmethod
+    def _pad_to_square(pil_img, background_color=(122, 116, 104)):
+        """Center ``pil_img`` on a square canvas whose side is its longer edge."""
+        width, height = pil_img.size
+        if width == height:
+            return pil_img
+        # Imported lazily so text-only use of this module does not need PIL.
+        from PIL import Image
+        side = max(width, height)
+        result = Image.new(pil_img.mode, (side, side), background_color)
+        result.paste(pil_img, ((side - width) // 2, (side - height) // 2))
+        return result
+    @staticmethod
+    def _fit_for_display(image):
+        """Resize ``image`` keeping its aspect ratio within display limits.
+
+        The shorter edge becomes at most 400 px (never larger than it already
+        is) and is reduced further if needed so the longer edge stays within
+        800 px.
+        """
+        max_hw, min_hw = max(image.size), min(image.size)
+        aspect_ratio = max_hw / min_hw
+        max_len, min_len = 800, 400
+        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+        longest_edge = int(shortest_edge * aspect_ratio)
+        W, H = image.size
+        if H > W:
+            H, W = longest_edge, shortest_edge
+        else:
+            H, W = shortest_edge, longest_edge
+        return image.resize((W, H))
+    @staticmethod
+    def _jpeg_base64(image):
+        """Encode ``image`` as JPEG and return the bytes as a base64 string."""
+        import base64
+        from io import BytesIO
+        buffered = BytesIO()
+        image.save(buffered, format="JPEG")
+        return base64.b64encode(buffered.getvalue()).decode()
+    def get_prompt(self):
+        """Render the system text and every message into one prompt string.
+
+        A message whose text is empty or None is rendered as the bare role
+        prefix with no separator, leaving the prompt open for completion.
+
+        Raises:
+            ValueError: if ``sep_style`` is not a handled SeparatorStyle.
+        """
+        if self.sep_style == SeparatorStyle.SINGLE:
+            ret = self.system + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + self._text_of(message) + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.TWO:
+            # Even-indexed messages end with sep, odd-indexed ones with sep2.
+            seps = [self.sep, self.sep2]
+            ret = self.system + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + self._text_of(message) + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.MPT:
+            # MPT roles already carry their own delimiter, so no ": " is added.
+            ret = self.system + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + self._text_of(message) + self.sep
+                else:
+                    ret += role
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+    def append_message(self, role, message):
+        """Append a ``[role, message]`` pair to the history (mutates ``messages``)."""
+        self.messages.append([role, message])
+    def get_images(self, return_pil=False):
+        """Collect the images attached to even-positioned messages after ``offset``.
+
+        Each image is processed per its ``image_process_mode`` ("Pad", "Crop"
+        or "Resize") and then resized for display.
+
+        Args:
+            return_pil: if true return the image objects, otherwise
+                base64-encoded JPEG strings.
+
+        Raises:
+            ValueError: if a message carries an unknown ``image_process_mode``.
+        """
+        images = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 != 0 or not isinstance(msg, tuple):
+                continue
+            _, image, image_process_mode = msg
+            if image_process_mode == "Pad":
+                image = self._pad_to_square(image)
+            elif image_process_mode == "Crop":
+                pass
+            elif image_process_mode == "Resize":
+                image = image.resize((224, 224))
+            else:
+                raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+            image = self._fit_for_display(image)
+            images.append(image if return_pil else self._jpeg_base64(image))
+        return images
+    def to_gradio_chatbot(self):
+        """Return the history after ``offset`` as ``[first, second]`` pairs.
+
+        Even-positioned messages open a pair and odd-positioned ones fill its
+        second slot. An attached image replaces the ``<image>`` placeholder in
+        the text with an inline ``<img>`` tag.
+        """
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset:]):
+            if i % 2 == 0:
+                if isinstance(msg, tuple):
+                    msg, image, image_process_mode = msg
+                    img_b64_str = self._jpeg_base64(self._fit_for_display(image))
+                    # The payload is JPEG, so the data URI must say image/jpeg.
+                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                    msg = msg.replace('<image>', img_str)
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+    def copy(self):
+        """Return a new Conversation whose ``messages`` is a fresh list of lists.
+
+        ``version`` is carried over so a copied template keeps its identity;
+        ``skip_next`` is left at its default.
+        """
+        return Conversation(
+            system=self.system,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            version=self.version)
+    def dict(self):
+        """Return the conversation as a plain dict.
+
+        When get_images() finds at least one image, tuple messages are
+        reduced to their text part; otherwise ``messages`` is returned as is.
+        """
+        if len(self.get_images()) > 0:
+            return {
+                "system": self.system,
+                "roles": self.roles,
+                "messages": [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages],
+                "offset": self.offset,
+                "sep": self.sep,
+                "sep2": self.sep2,
+            }
+        return {
+            "system": self.system,
+            "roles": self.roles,
+            "messages": self.messages,
+            "offset": self.offset,
+            "sep": self.sep,
+            "sep2": self.sep2,
+        }
+# SINGLE-style template seeded with one example exchange. offset=2 makes
+# get_images/to_gradio_chatbot skip the two seed messages; get_prompt still
+# renders them. messages is a tuple here, so call copy() (which rebuilds it
+# as a list) before append_message.
+conv_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Give three tips for staying healthy."),
+        ("Assistant",
+            "Sure, here are three tips for staying healthy:\n"
+            "1. Exercise regularly: Regular physical activity can help improve your overall health and wellbeing. "
+            "It can also help reduce your risk of chronic conditions such as obesity, diabetes, heart disease, "
+            "and certain cancers. Aim for at least 150 minutes of moderate-intensity aerobic exercise or "
+            "75 minutes of vigorous-intensity aerobic exercise per week, along with muscle-strengthening "
+            "activities at least two days per week.\n"
+            "2. Eat a balanced diet: Eating a balanced diet that is rich in fruits, "
+            "vegetables, whole grains, lean proteins, and healthy fats can help support "
+            "your overall health. Try to limit your intake of processed and high-sugar foods, "
+            "and aim to drink plenty of water throughout the day.\n"
+            "3. Get enough sleep: Getting enough quality sleep is essential for your physical "
+            "and mental health. Adults should aim for seven to nine hours of sleep per night. "
+            "Establish a regular sleep schedule and try to create a relaxing bedtime routine to "
+            "help improve the quality of your sleep.")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# SINGLE-style template seeded with one example exchange about energy sources.
+# This is the object bound to default_conversation and to the "default" and
+# "v1" keys of conv_templates. offset=2 hides the seed exchange from
+# get_images/to_gradio_chatbot.
+conv_v1_2 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+        ("Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# TWO-style template with no seed messages: get_prompt ends even-indexed
+# messages with " " and odd-indexed ones with "</s>". messages is an empty
+# tuple, so call copy() before append_message.
+conv_vicuna_v1_1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# MPT-style template with no seed messages. The role strings already include
+# the "<|im_start|>" marker and a trailing newline, which is why get_prompt
+# adds no ": " between role and message for this style.
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+- You are a helpful language and vision assistant.
+- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
+- You should follow the instructions carefully and explain your answers in detail.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# MPT-style template with no seed messages and a text-only system prompt;
+# roles, separator and version are the same as conv_mpt.
+conv_mpt_text = Conversation(
+    system="""<|im_start|>system
+- You are a helpful assistant chatbot trained by MosaicML.
+- You answer questions.
+- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# TWO-style template with no seed messages, using " " and "</s>" as the
+# alternating separators. version is not set, so it stays "Unknown".
+conv_bair_v1 = Conversation(
+    system="BEGINNING OF CONVERSATION:",
+    roles=("USER", "GPT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# SINGLE-style template seeded with a short greeting exchange; offset=2 hides
+# that exchange from get_images/to_gradio_chatbot.
+simple_conv = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Hi!"),
+        ("Assistant", "Hi there! How can I help you today?")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# SINGLE-style template with a LLaVA system prompt and a seeded greeting
+# exchange (hidden from get_images/to_gradio_chatbot by offset=2).
+# NOTE(review): the adjacent system literals are joined without a space, so
+# the prompt reads "...WAIV Lab.You are able...". Left untouched because it
+# is prompt text; confirm it matches what the checkpoint was trained with.
+simple_conv_multimodal = Conversation(
+    system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
+           "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "Follow the instructions carefully and explain your answers in detail.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Hi!"),
+        ("Assistant", "Hi there!  How can I help you today?\n")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# MPT-style template with a LLaVA system prompt and no seed messages; roles
+# carry their own "<|im_start|>" marker and "<|im_end|>" closes each message.
+simple_conv_mpt_multimodal = Conversation(
+    system="""<|im_start|>system
+- You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab.
+- You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.
+- You should follow the instructions carefully and explain your answers in detail.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# SINGLE-style template whose seeded first message embeds a literal
+# "### Response:" marker; offset=2 hides the seed exchange from
+# get_images/to_gradio_chatbot.
+# NOTE(review): the adjacent system literals are joined without a space
+# ("...WAIV Lab.You are designed..."). Left untouched because it is prompt
+# text; confirm against the prompt the model expects.
+simple_conv_legacy = Conversation(
+    system="You are LLaVA, a large language model trained by UW Madison WAIV Lab."
+           "You are designed to assist human with a variety of tasks using natural language."
+           "Follow the instructions carefully.",
+    roles=("Human", "Assistant"),
+    messages=(
+        ("Human", "Hi!\n\n### Response:"),
+        ("Assistant", "Hi there!  How can I help you today?\n")
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# TWO-style template with a LLaVA system prompt and no seed messages;
+# even-indexed messages end with " " and odd-indexed ones with "</s>".
+# NOTE(review): the adjacent system literals are joined without a space
+# ("...WAIV Lab.You are able..."). Left untouched because it is prompt text;
+# confirm against the prompt the model expects.
+conv_llava_v1 = Conversation(
+    system="You are LLaVA, a large language and vision assistant trained by UW Madison WAIV Lab."
+           "You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+           "Follow the instructions carefully and explain your answers in detail.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Template used when no name is given; the same object as the "default" and
+# "v1" registry entries below.
+default_conversation = conv_v1_2
+# Name -> template registry. The values are shared module-level objects whose
+# messages are tuples, so take a copy() before appending to one.
+# conv_v1 is defined above but is not registered under any key.
+conv_templates = {
+    "default": conv_v1_2,
+    "simple": simple_conv,
+    "simple_legacy": simple_conv_legacy,
+    "multimodal": simple_conv_multimodal,
+    "mpt_multimodal": simple_conv_mpt_multimodal,
+    "llava_v1": conv_llava_v1,
+    # fastchat
+    "v1": conv_v1_2,
+    "bair_v1": conv_bair_v1,
+    "vicuna_v1_1": conv_vicuna_v1_1,
+    "mpt": conv_mpt,
+    "mpt_text": conv_mpt_text,
+}
+# Running the module directly prints the rendered default prompt.
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())

llava.py → mgie_llava.py RENAMED Viewed

@@ -1,4 +1,7 @@
 # modified from https://github.com/haotian-liu/LLaVA/blob/7ace501183c4bdec6052ec1a30039cdc3242a67c/llava/model/llava.py
 from typing import List, Optional, Tuple, Union
@@ -184,19 +187,19 @@ class LlavaLlamaModel(LlamaModel):
 class EditMapper(nn.Module):
     def __init__(self):
         super().__init__()
         self.llm2hid = nn.Linear(4096, 512)
         self.query = nn.Parameter(torch.randn(1, 77, 512))
-        self.mapper = nn.Transformer(batch_first=True, norm_first=True,
-                                     d_model=512, nhead=4, num_encoder_layers=4, num_decoder_layers=4,
                                      dim_feedforward=2048, dropout=0.0)
         self.hid2feat = nn.Linear(512, 768)
     def forward(self, llm, emb):
         hid = self.llm2hid(llm+emb)
         hid = self.mapper(hid, self.query.repeat(llm.shape[0], 1, 1))
         feat = self.hid2feat(hid)
         return feat
 class LlavaLlamaForCausalLM(LlamaForCausalLM):
@@ -209,9 +212,9 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.edit_head = EditMapper()
-        '''self.scheduler, self.vae, self.unet = [diffusers.DDPMScheduler.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='scheduler'),
-                                               diffusers.AutoencoderKL.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='vae'),
                                                diffusers.UNet2DConditionModel.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='unet')]
         self.vae.requires_grad_(False)
         self.unet.register_to_config(in_channels=8)
@@ -220,7 +223,7 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
             conv.weight.zero_()
             conv.weight[:, :4, :, :].copy_(self.unet.conv_in.weight)
             self.unet.conv_in = conv'''
         # Initialize weights and apply final processing
         self.post_init()
@@ -236,7 +239,7 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
         if type(vision_tower) is list:
             vision_tower = vision_tower[0]
         return vision_tower
     def forward(
         self,
         input_ids: torch.LongTensor = None,
@@ -248,7 +251,7 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         images: Optional[torch.FloatTensor] = None,
-        return_dict: Optional[bool] = None,
         p2p_inp=None, p2p_ans=None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -297,13 +300,13 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
             hid_edit = self.edit_head(llm, self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
             B, DROP = labels.shape[0], 0.05
-            hid_null = self.edit_head(torch.zeros(B, 8, 4096, device=labels.device),
                                       self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
             with torch.no_grad():
                 lat_ans, lat_inp = self.vae.encode(p2p_ans).latent_dist.sample()*self.vae.config.scaling_factor, self.vae.encode(p2p_inp).latent_dist.mode()
-                lat_ans, lat_inp = [torch.from_numpy(lat_ans.data.cpu().float().numpy()).to(lat_ans.device),
                                     torch.from_numpy(lat_inp.data.cpu().float().numpy()).to(lat_inp.device)]
             noise = torch.randn_like(lat_ans)
@@ -317,15 +320,15 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
             lat_inp *= mask
             out = self.unet(torch.cat([lat_noise, lat_inp], dim=1), ts, hid_edit).sample
             loss_ce, loss_edit = loss, nn.functional.mse_loss(out, noise, reduction='mean')
             if int(os.environ['LOCAL_RANK'])==0: print('loss_ce:', loss_ce, '/', 'loss_edit:', loss_edit)
             loss = loss_ce+loss_edit*0.5
         if not return_dict:
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
@@ -371,7 +374,7 @@ class LlavaLlamaForCausalLM(LlamaForCausalLM):
             if num_new_tokens > 0:
                 input_embeddings = self.get_input_embeddings().weight.data
                 output_embeddings = self.get_output_embeddings().weight.data
                 input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                     dim=0, keepdim=True)
                 output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(

+#
+# For licensing see accompanying LICENSE file.
+# Copyright (C) 2024 Apple Inc. All Rights Reserved.
+#
 # modified from https://github.com/haotian-liu/LLaVA/blob/7ace501183c4bdec6052ec1a30039cdc3242a67c/llava/model/llava.py
 from typing import List, Optional, Tuple, Union
 class EditMapper(nn.Module):
     def __init__(self):
         super().__init__()
         self.llm2hid = nn.Linear(4096, 512)
         self.query = nn.Parameter(torch.randn(1, 77, 512))
+        self.mapper = nn.Transformer(batch_first=True, norm_first=True,
+                                     d_model=512, nhead=4, num_encoder_layers=4, num_decoder_layers=4,
                                      dim_feedforward=2048, dropout=0.0)
         self.hid2feat = nn.Linear(512, 768)
     def forward(self, llm, emb):
         hid = self.llm2hid(llm+emb)
         hid = self.mapper(hid, self.query.repeat(llm.shape[0], 1, 1))
         feat = self.hid2feat(hid)
         return feat
 class LlavaLlamaForCausalLM(LlamaForCausalLM):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.edit_head = EditMapper()
+        '''self.scheduler, self.vae, self.unet = [diffusers.DDPMScheduler.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='scheduler'),
+                                               diffusers.AutoencoderKL.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='vae'),
                                                diffusers.UNet2DConditionModel.from_pretrained('runwayml/stable-diffusion-v1-5', subfolder='unet')]
         self.vae.requires_grad_(False)
         self.unet.register_to_config(in_channels=8)
             conv.weight.zero_()
             conv.weight[:, :4, :, :].copy_(self.unet.conv_in.weight)
             self.unet.conv_in = conv'''
         # Initialize weights and apply final processing
         self.post_init()
         if type(vision_tower) is list:
             vision_tower = vision_tower[0]
         return vision_tower
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         images: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
         p2p_inp=None, p2p_ans=None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
             hid_edit = self.edit_head(llm, self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
             B, DROP = labels.shape[0], 0.05
+            hid_null = self.edit_head(torch.zeros(B, 8, 4096, device=labels.device),
                                       self.model.embed_tokens.weight[-8:].unsqueeze(dim=0).repeat(labels.shape[0], 1, 1))
             with torch.no_grad():
                 lat_ans, lat_inp = self.vae.encode(p2p_ans).latent_dist.sample()*self.vae.config.scaling_factor, self.vae.encode(p2p_inp).latent_dist.mode()
+                lat_ans, lat_inp = [torch.from_numpy(lat_ans.data.cpu().float().numpy()).to(lat_ans.device),
                                     torch.from_numpy(lat_inp.data.cpu().float().numpy()).to(lat_inp.device)]
             noise = torch.randn_like(lat_ans)
             lat_inp *= mask
             out = self.unet(torch.cat([lat_noise, lat_inp], dim=1), ts, hid_edit).sample
             loss_ce, loss_edit = loss, nn.functional.mse_loss(out, noise, reduction='mean')
             if int(os.environ['LOCAL_RANK'])==0: print('loss_ce:', loss_ce, '/', 'loss_edit:', loss_edit)
             loss = loss_ce+loss_edit*0.5
         if not return_dict:
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
             if num_new_tokens > 0:
                 input_embeddings = self.get_input_embeddings().weight.data
                 output_embeddings = self.get_output_embeddings().weight.data
                 input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                     dim=0, keepdim=True)
                 output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(

pre-requirements.txt CHANGED Viewed

@@ -1,9 +1,9 @@
 sentencepiece
-transformers
 diffusers
-tokenizers
 datasets
 accelerate
 evaluate
-gradio
-git+https://github.com/haotian-liu/LLaVA@7ace501

 sentencepiece
+git+https://github.com/huggingface/transformers.git@cae78c46
 diffusers
+tokenizers==0.12.1
 datasets
 accelerate
 evaluate
+gradio==4.12.0
+gradio_client==0.8.0

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
--i https://download.pytorch.org/whl/cu113
-torch==1.12.0
-torchvision==0.13.0
-torchaudio==0.12.0

+-i https://download.pytorch.org/whl/cu118
+torch==2.0
+torchvision==0.15
+torchaudio==2.0

train.py DELETED Viewed

@@ -1,831 +0,0 @@
-# modified from https://github.com/haotian-liu/LLaVA/blob/7ace501183c4bdec6052ec1a30039cdc3242a67c/llava/train/train.py
-import os
-import copy
-from dataclasses import dataclass, field
-import json
-import logging
-import pathlib
-from typing import Dict, Optional, Sequence, List
-import torch
-import transformers
-from torch.utils.data import Dataset
-from llava.train.llava_trainer import LLaVATrainer
-from llava import conversation as conversation_lib
-from llava.model import *
-from PIL import Image
-import torch.nn as nn
-# TODO: import and use code from ../data/dataset.py
-IGNORE_INDEX = -100
-DEFAULT_PAD_TOKEN = "[PAD]"
-DEFAULT_EOS_TOKEN = "</s>"
-DEFAULT_BOS_TOKEN = "<s>"
-DEFAULT_UNK_TOKEN = "<unk>"
-DEFAULT_IMAGE_TOKEN = "<image>"
-DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
-DEFAULT_IM_START_TOKEN = "<im_start>"
-DEFAULT_IM_END_TOKEN = "<im_end>"
-import io, base64, pickle, random
-from tqdm import tqdm
-import numpy as np
-def b2f(b): return Image.open(io.BytesIO(base64.b64decode(b))).convert('RGB')
-def resize(f):
-    w, h = f.size
-    if w>h:
-        p = (w-h)//2
-        f = f.crop([p, 0, p+h, h])
-    elif h>w:
-        p = (h-w)//2
-        f = f.crop([0, p, w, p+w])
-    f = f.resize([512, 512])
-    return f
-def img2npy(f): return (2.0*np.array(f)/255.0-1.0).transpose((2, 0, 1)).astype(np.float32)
-@dataclass
-class ModelArguments:
-    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
-    version: Optional[str] = field(default="v0")
-    freeze_backbone: bool = field(default=False)
-    tune_mm_mlp_adapter: bool = field(default=False)
-    vision_tower: Optional[str] = field(default=None)
-    mm_vision_select_layer: Optional[int] = field(default=-1)   # default to the last layer
-    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
-    mm_use_im_start_end: bool = field(default=False)
-@dataclass
-class DataArguments:
-    data_path: str = field(default=None,
-                           metadata={"help": "Path to the training data."})
-    lazy_preprocess: bool = False
-    is_multimodal: bool = False
-    sep_image_conv_front: bool = False
-    image_token_len: int = 0
-    image_folder: Optional[str] = field(default=None)
-    image_aspect_ratio: str = 'square'
-@dataclass
-class TrainingArguments(transformers.TrainingArguments):
-    cache_dir: Optional[str] = field(default=None)
-    optim: str = field(default="adamw_torch")
-    remove_unused_columns: bool = field(default=False)
-    freeze_mm_mlp_adapter: bool = field(default=False)
-    force_fsdp: bool = field(default=False)
-    model_max_length: int = field(
-        default=512,
-        metadata={
-            "help":
-            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
-        },
-    )
-    double_quant: bool = field(
-        default=True,
-        metadata={"help": "Compress the quantization statistics through double quantization."}
-    )
-    quant_type: str = field(
-        default="nf4",
-        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
-    )
-    bits: int = field(
-        default=16,
-        metadata={"help": "How many bits to use."}
-    )
-    lora_enable: bool = False
-    lora_r: int = 64
-    lora_alpha: int = 16
-    lora_dropout: float = 0.05
-    lora_weight_path: str = ""
-    lora_bias: str = "none"
-def maybe_zero_3(param, ignore_status=False, name=None):
-    from deepspeed import zero
-    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
-    if hasattr(param, "ds_id"):
-        if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
-            if not ignore_status:
-                logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
-        with zero.GatheredParameters([param]):
-            param = param.data.detach().cpu().clone()
-    else:
-        param = param.detach().cpu().clone()
-    return param
-# Borrowed from peft.utils.get_peft_model_state_dict
-def get_peft_state_maybe_zero_3(named_params, bias):
-    if bias == "none":
-        to_return = {k: t for k, t in named_params if "lora_" in k}
-    elif bias == "all":
-        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
-    elif bias == "lora_only":
-        to_return = {}
-        maybe_lora_bias = {}
-        lora_bias_names = set()
-        for k, t in named_params:
-            if "lora_" in k:
-                to_return[k] = t
-                bias_name = k.split("lora_")[0] + "bias"
-                lora_bias_names.add(bias_name)
-            elif "bias" in k:
-                maybe_lora_bias[k] = t
-        for k, t in maybe_lora_bias:
-            if bias_name in lora_bias_names:
-                to_return[bias_name] = t
-    else:
-        raise NotImplementedError
-    to_return = {k: maybe_zero_3(v, name=k) for k, v in to_return.items()}
-    return to_return
-def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
-    to_return = {k: t for k, t in named_params if "lora_" not in k}
-    if require_grad_only:
-        to_return = {k: t for k, t in to_return.items() if t.requires_grad}
-    to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
-    return to_return
-def find_all_linear_names(model):
-    cls = torch.nn.Linear
-    lora_module_names = set()
-    for name, module in model.named_modules():
-        if isinstance(module, cls):
-            names = name.split('.')
-            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
-    if 'lm_head' in lora_module_names: # needed for 16-bit
-        lora_module_names.remove('lm_head')
-    return list(lora_module_names)
-def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
-                                   output_dir: str):
-    """Collects the state dict and dump to disk."""
-    if trainer.deepspeed:
-        torch.cuda.synchronize()
-        trainer.save_model(output_dir)
-        return
-    state_dict = trainer.model.state_dict()
-    if trainer.args.should_save:
-        cpu_state_dict = {
-            key: value.cpu()
-            for key, value in state_dict.items()
-        }
-        del state_dict
-        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
-def smart_tokenizer_and_embedding_resize(
-    special_tokens_dict: Dict,
-    tokenizer: transformers.PreTrainedTokenizer,
-    model: transformers.PreTrainedModel,
-):
-    """Resize tokenizer and embedding.
-    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
-    """
-    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
-    model.resize_token_embeddings(len(tokenizer))
-    if num_new_tokens > 0:
-        input_embeddings = model.get_input_embeddings().weight.data
-        output_embeddings = model.get_output_embeddings().weight.data
-        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
-            dim=0, keepdim=True)
-        output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
-            dim=0, keepdim=True)
-        input_embeddings[-num_new_tokens:] = input_embeddings_avg
-        output_embeddings[-num_new_tokens:] = output_embeddings_avg
-def _tokenize_fn(strings: Sequence[str],
-                 tokenizer: transformers.PreTrainedTokenizer) -> Dict:
-    """Tokenize a list of strings."""
-    tokenized_list = [
-        tokenizer(
-            text,
-            return_tensors="pt",
-            padding="longest",
-            max_length=tokenizer.model_max_length,
-            truncation=True,
-        ) for text in strings
-    ]
-    input_ids = labels = [
-        tokenized.input_ids[0] for tokenized in tokenized_list
-    ]
-    input_ids_lens = labels_lens = [
-        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
-        for tokenized in tokenized_list
-    ]
-    return dict(
-        input_ids=input_ids,
-        labels=labels,
-        input_ids_lens=input_ids_lens,
-        labels_lens=labels_lens,
-    )
-def _mask_targets(target, tokenized_lens, speakers):
-    # cur_idx = 0
-    cur_idx = tokenized_lens[0]
-    tokenized_lens = tokenized_lens[1:]
-    target[:cur_idx] = IGNORE_INDEX
-    for tokenized_len, speaker in zip(tokenized_lens, speakers):
-        if speaker == "human":
-            target[cur_idx+2:cur_idx + tokenized_len] = IGNORE_INDEX
-        cur_idx += tokenized_len
-def _add_speaker_and_signal(header, source, get_conversation=True):
-    """Add speaker and start/end signal on each round."""
-    BEGIN_SIGNAL = "### "
-    END_SIGNAL = "\n"
-    conversation = header
-    for sentence in source:
-        from_str = sentence["from"]
-        if from_str.lower() == "human":
-            from_str = conversation_lib.default_conversation.roles[0]
-        elif from_str.lower() == "gpt":
-            from_str = conversation_lib.default_conversation.roles[1]
-        else:
-            from_str = 'unknown'
-        sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
-                             sentence["value"] + END_SIGNAL)
-        if get_conversation:
-            conversation += sentence["value"]
-    conversation += BEGIN_SIGNAL
-    return conversation
-def preprocess_multimodal(
-    sources: Sequence[str],
-    multimodal_cfg: dict,
-    cur_token_len: int,
-) -> Dict:
-    is_multimodal = multimodal_cfg['is_multimodal']
-    # image_token_len = multimodal_cfg['image_token_len']
-    image_token_len = cur_token_len
-    if not is_multimodal:
-        return sources
-    for source in sources:
-        if multimodal_cfg['sep_image_conv_front']:
-            assert DEFAULT_IMAGE_TOKEN in source[0]['value']
-            source[0]['value'] = source[0]['value'].replace(DEFAULT_IMAGE_TOKEN, '').strip()
-            source[0]['value'] = DEFAULT_IMAGE_TOKEN + conversation_lib.default_conversation.sep + conversation_lib.default_conversation.roles[0] + ": " + source[0]['value']
-        for sentence in source:
-            replace_token = DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
-            if multimodal_cfg['use_im_start_end']:
-                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
-            sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token)
-    return sources
-def preprocess_v1(
-    sources,
-    tokenizer: transformers.PreTrainedTokenizer,
-) -> Dict:
-    conv = conversation_lib.default_conversation.copy()
-    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
-    # Apply prompt templates
-    conversations = []
-    for i, source in enumerate(sources):
-        if roles[source[0]["from"]] != conv.roles[0]:
-            # Skip the first one if it is not from human
-            source = source[1:]
-        conv.messages = []
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2], f"{i}"
-            conv.append_message(role, sentence["value"])
-        conversations.append(conv.get_prompt())
-    # Tokenize conversations
-    input_ids = tokenizer(
-        conversations,
-        return_tensors="pt",
-        padding="longest",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-    ).input_ids
-    targets = input_ids.clone()
-    assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
-    # Mask targets
-    sep = conv.sep + conv.roles[1] + ": "
-    for conversation, target in zip(conversations, targets):
-        total_len = int(target.ne(tokenizer.pad_token_id).sum())
-        rounds = conversation.split(conv.sep2)
-        cur_len = 1
-        target[:cur_len] = IGNORE_INDEX
-        for i, rou in enumerate(rounds):
-            if rou == "":
-                break
-            parts = rou.split(sep)
-            if len(parts) != 2:
-                break
-            parts[0] += sep
-            round_len = len(tokenizer(rou).input_ids)
-            instruction_len = len(tokenizer(parts[0]).input_ids) - 2
-            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
-            cur_len += round_len
-        target[cur_len:] = IGNORE_INDEX
-        if cur_len < tokenizer.model_max_length:
-            if cur_len != total_len:
-                target[:] = IGNORE_INDEX
-                print(
-                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
-                    f" (ignored)"
-                )
-    return dict(
-        input_ids=input_ids,
-        labels=targets,
-    )
-def preprocess_mpt(
-    sources,
-    tokenizer: transformers.PreTrainedTokenizer,
-) -> Dict:
-    conv = conversation_lib.default_conversation.copy()
-    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
-    # Apply prompt templates
-    conversations = []
-    for i, source in enumerate(sources):
-        if roles[source[0]["from"]] != conv.roles[0]:
-            # Skip the first one if it is not from human
-            source = source[1:]
-        conv.messages = []
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2], f"{i}"
-            conv.append_message(role, sentence["value"])
-        conversations.append(conv.get_prompt())
-    # Tokenize conversations
-    input_ids = tokenizer(
-        conversations,
-        return_tensors="pt",
-        padding="longest",
-        max_length=tokenizer.model_max_length,
-        truncation=True,
-    ).input_ids
-    targets = input_ids.clone()
-    assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
-    # Mask targets
-    sep = conv.sep + conv.roles[1]
-    for conversation, target in zip(conversations, targets):
-        total_len = int(target.ne(tokenizer.pad_token_id).sum())
-        rounds = conversation.split(conv.sep)
-        re_rounds = [conv.sep.join(rounds[:3])] # system + user + gpt
-        for conv_idx in range(3, len(rounds), 2):
-            re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx+2]))    # user + gpt
-        cur_len = 0
-        target[:cur_len] = IGNORE_INDEX
-        for i, rou in enumerate(re_rounds):
-            if rou == "":
-                break
-            parts = rou.split(sep)
-            if len(parts) != 2:
-                break
-            parts[0] += sep
-            round_len = len(tokenizer(rou).input_ids) + len(tokenizer(conv.sep).input_ids)
-            instruction_len = len(tokenizer(parts[0]).input_ids)
-            target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
-            cur_len += round_len
-        target[cur_len:] = IGNORE_INDEX
-        if cur_len < tokenizer.model_max_length:
-            if cur_len != total_len:
-                target[:] = IGNORE_INDEX
-                print(
-                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
-                    f" (ignored)"
-                )
-    return dict(
-        input_ids=input_ids,
-        labels=targets,
-    )
-def preprocess(
-    sources: Sequence[str],
-    tokenizer: transformers.PreTrainedTokenizer,
-) -> Dict:
-    """
-    Given a list of sources, each is a conversation list. This transform:
-    1. Add signal '### ' at the beginning each sentence, with end signal '\n';
-    2. Concatenate conversations together;
-    3. Tokenize the concatenated conversation;
-    4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
-    """
-    if conversation_lib.default_conversation.version == "v1":
-        return preprocess_v1(sources, tokenizer)
-    if conversation_lib.default_conversation.version == "mpt":
-        return preprocess_mpt(sources, tokenizer)
-    # add end signal and concatenate together
-    conversations = []
-    for source in sources:
-        header = f"{conversation_lib.default_conversation.system}\n\n"
-        conversation = _add_speaker_and_signal(header, source)
-        conversations.append(conversation)
-    # tokenize conversations
-    conversations_tokenized = _tokenize_fn(conversations, tokenizer)
-    input_ids = conversations_tokenized["input_ids"]
-    targets = copy.deepcopy(input_ids)
-    for target, source in zip(targets, sources):
-        tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source],
-                                      tokenizer)["input_ids_lens"]
-        speakers = [sentence["from"] for sentence in source]
-        _mask_targets(target, tokenized_lens, speakers)
-    return dict(input_ids=input_ids, labels=targets)
-class SupervisedDataset(Dataset):
-    """Dataset for supervised fine-tuning."""
-    def __init__(self, data_path: str,
-                 tokenizer: transformers.PreTrainedTokenizer):
-        super(SupervisedDataset, self).__init__()
-        logging.warning("Loading data...")
-        list_data_dict = json.load(open(data_path, "r"))
-        logging.warning("Formatting inputs...")
-        sources = [example["conversations"] for example in list_data_dict]
-        data_dict = preprocess(sources, tokenizer)
-        self.input_ids = data_dict["input_ids"]
-        self.labels = data_dict["labels"]
-    def __len__(self):
-        return len(self.input_ids)
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        return dict(input_ids=self.input_ids[i], labels=self.labels[i])
-class LazySupervisedDataset(Dataset):
-    def __init__(self, data_path: str,
-                 tokenizer: transformers.PreTrainedTokenizer,
-                 multimodal_cfg: dict):
-        super(LazySupervisedDataset, self).__init__()
-        self.tokenizer, self.multimodal_cfg = tokenizer, multimodal_cfg
-        self.pkl, self.prompt = pickle.load(open('./_data/ipr2pr.pkl', 'rb'))['task'], json.load(open('./_data/ipr2pr_expressive.json', 'r'))
-        random.shuffle(self.pkl)
-        print('--pkl: %d--'%(len(self.pkl)))
-    def __len__(self):
-        return len(self.pkl)
-    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
-        item = self.pkl[i][0]
-        tsv = open('./_data/ipr2pr.tsv', 'r')
-        tsv.seek(item['lineidx'])
-        b = tsv.readline().strip().split('\t')
-        image = resize(b2f(b[0]))
-        processor = self.multimodal_cfg['image_processor']
-        image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
-        cur_token_len = (image.shape[1]//14)*(image.shape[2]//14)
-        query = "what will this image be like if '%s'\n%s"%(item['instruction'], DEFAULT_IMAGE_TOKEN)
-        ans = '%s [IMG0] [IMG1] [IMG2] [IMG3] [IMG4] [IMG5] [IMG6] [IMG7]'%(self.prompt[item['input']]['expressive'])
-        sources = preprocess_multimodal(copy.deepcopy([[{'from': 'human', 'value': query}, {'from': 'gpt', 'value': ans}]]),
-                                        self.multimodal_cfg, cur_token_len)
-        data_dict = preprocess(sources, self.tokenizer)
-        if isinstance(i, int): data_dict = dict(input_ids=data_dict['input_ids'][0],
-                                                labels=data_dict['labels'][0])
-        data_dict['image'] = image
-        p2p_inp, p2p_ans = img2npy(resize(b2f(b[0])).resize([256, 256])), img2npy(resize(b2f(b[1])).resize([256, 256]))
-        data_dict['p2p_inp'], data_dict['p2p_ans'] = p2p_inp, p2p_ans
-        return data_dict
-@dataclass
-class DataCollatorForSupervisedDataset(object):
-    """Collate examples for supervised fine-tuning."""
-    tokenizer: transformers.PreTrainedTokenizer
-    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
-        input_ids, labels = tuple([instance[key] for instance in instances]
-                                  for key in ("input_ids", "labels"))
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            input_ids,
-            batch_first=True,
-            padding_value=self.tokenizer.pad_token_id)
-        labels = torch.nn.utils.rnn.pad_sequence(labels,
-                                                 batch_first=True,
-                                                 padding_value=IGNORE_INDEX)
-        batch = dict(
-            input_ids=input_ids,
-            labels=labels,
-            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
-        )
-        if 'image' in instances[0]:
-            images = [instance['image'] for instance in instances]
-            if all(x is not None and x.shape == images[0].shape for x in images):
-                batch['images'] = torch.stack(images)
-            else:
-                batch['images'] = images
-        batch['p2p_inp'], batch['p2p_ans'] = [torch.cat([torch.from_numpy(d['p2p_inp']).unsqueeze(dim=0) for d in instances], dim=0),
-                                              torch.cat([torch.from_numpy(d['p2p_ans']).unsqueeze(dim=0) for d in instances], dim=0)]
-        return batch
-def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
-                                data_args) -> Dict:
-    """Make dataset and collator for supervised fine-tuning."""
-    dataset_cls = (LazySupervisedDataset
-                   if data_args.lazy_preprocess else SupervisedDataset)
-    train_dataset = dataset_cls(tokenizer=tokenizer,
-                                data_path=data_args.data_path,
-                                multimodal_cfg=dict(
-                                    is_multimodal=data_args.is_multimodal,
-                                    sep_image_conv_front=data_args.sep_image_conv_front,
-                                    image_token_len=data_args.image_token_len,
-                                    image_folder=data_args.image_folder,
-                                    image_aspect_ratio=data_args.image_aspect_ratio,
-                                    use_im_start_end=getattr(data_args, 'mm_use_im_start_end', False),
-                                    image_processor=getattr(data_args, 'image_processor', None)))
-    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
-    return dict(train_dataset=train_dataset,
-                eval_dataset=None,
-                data_collator=data_collator)
-def train():
-    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-    compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
-    bnb_model_from_pretrained_args = {}
-    if training_args.bits in [4, 8]:
-        from transformers import BitsAndBytesConfig
-        from peft import prepare_model_for_int8_training
-        bnb_model_from_pretrained_args.update(dict(
-            device_map={"": training_args.device},
-            load_in_4bit=training_args.bits == 4,
-            load_in_8bit=training_args.bits == 8,
-            quantization_config=BitsAndBytesConfig(
-                load_in_4bit=training_args.bits == 4,
-                load_in_8bit=training_args.bits == 8,
-                llm_int8_threshold=6.0,
-                llm_int8_has_fp16_weight=False,
-                bnb_4bit_compute_dtype=compute_dtype,
-                bnb_4bit_use_double_quant=training_args.double_quant,
-                bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
-            )
-        ))
-    if model_args.vision_tower is not None:
-        if 'mpt' in model_args.model_name_or_path:
-            model = LlavaMPTForCausalLM.from_pretrained(
-                model_args.model_name_or_path,
-                cache_dir=training_args.cache_dir,
-                **bnb_model_from_pretrained_args
-            )
-        else:
-            model = LlavaLlamaForCausalLM.from_pretrained(
-                model_args.model_name_or_path,
-                cache_dir=training_args.cache_dir,
-                **bnb_model_from_pretrained_args
-            )
-    else:
-        model = transformers.LlamaForCausalLM.from_pretrained(
-            model_args.model_name_or_path,
-            cache_dir=training_args.cache_dir,
-            **bnb_model_from_pretrained_args
-        )
-    model.config.use_cache = False
-    if model_args.freeze_backbone:
-        model.model.requires_grad_(False)
-    if training_args.bits in [4, 8]:
-        model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
-        model = prepare_model_for_int8_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
-    if training_args.gradient_checkpointing and model_args.vision_tower is None:
-        if hasattr(model, "enable_input_require_grads"):
-            model.enable_input_require_grads()
-        else:
-            def make_inputs_require_grad(module, input, output):
-                output.requires_grad_(True)
-            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
-    if training_args.lora_enable:
-        from peft import LoraConfig, get_peft_model
-        lora_config = LoraConfig(
-            r=training_args.lora_r,
-            lora_alpha=training_args.lora_alpha,
-            target_modules=find_all_linear_names(model),
-            lora_dropout=training_args.lora_dropout,
-            bias=training_args.lora_bias,
-            task_type="CAUSAL_LM",
-        )
-        if training_args.bits == 16:
-            if training_args.bf16:
-                model.to(torch.bfloat16)
-            if training_args.fp16:
-                model.to(torch.float16)
-        logging.warning("Adding LoRA adapters...")
-        model = get_peft_model(model, lora_config)
-    if 'mpt' in model_args.model_name_or_path:
-        tokenizer = transformers.AutoTokenizer.from_pretrained(
-            model_args.model_name_or_path,
-            cache_dir=training_args.cache_dir,
-            model_max_length=training_args.model_max_length,
-            padding_side="right"
-        )
-    else:
-        tokenizer = transformers.AutoTokenizer.from_pretrained(
-            model_args.model_name_or_path,
-            cache_dir=training_args.cache_dir,
-            model_max_length=training_args.model_max_length,
-            padding_side="right",
-            use_fast=False,
-        )
-    if model_args.version == "v0":
-        if tokenizer.pad_token is None:
-            smart_tokenizer_and_embedding_resize(
-                special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
-                tokenizer=tokenizer,
-                model=model,
-            )
-        if "llama" in model_args.model_name_or_path:
-            tokenizer.add_special_tokens({
-                "eos_token": DEFAULT_EOS_TOKEN,
-                "bos_token": DEFAULT_BOS_TOKEN,
-                "unk_token": DEFAULT_UNK_TOKEN,
-            })
-    else:
-        tokenizer.pad_token = tokenizer.unk_token
-        if "mpt" in model_args.model_name_or_path:
-            conversation_lib.default_conversation = conversation_lib.conv_templates["mpt"]
-        else:
-            conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1_1"]
-    if model_args.vision_tower is not None:
-        model_vision_dict = model.get_model().initialize_vision_modules(
-            vision_tower=model_args.vision_tower,
-            mm_vision_select_layer=model_args.mm_vision_select_layer,
-            pretrain_mm_mlp_adapter=model_args.pretrain_mm_mlp_adapter,
-            fsdp=training_args.fsdp
-        )
-        model.get_vision_tower().to(dtype=torch.float16, device=training_args.device)
-        vision_config = model_vision_dict['vision_config']
-        data_args.image_token_len = model_vision_dict['image_token_len']
-        data_args.image_processor = model_vision_dict['image_processor']
-        data_args.is_multimodal = True
-        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
-        if model_args.tune_mm_mlp_adapter:
-            model.requires_grad_(False)
-            for p in model.get_model().mm_projector.parameters():
-                p.requires_grad = True
-        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
-        if training_args.freeze_mm_mlp_adapter:
-            for p in model.get_model().mm_projector.parameters():
-                p.requires_grad = False
-        if training_args.bits in [4, 8]:
-            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
-        model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
-        vision_config.use_im_start_end = training_args.use_im_start_end = model_args.mm_use_im_start_end
-        model.config.sep_image_conv_front = data_args.sep_image_conv_front
-        model.initialize_vision_tokenizer(mm_use_im_start_end=model_args.mm_use_im_start_end, tokenizer=tokenizer, device=training_args.device,
-                                          tune_mm_mlp_adapter=model_args.tune_mm_mlp_adapter, pretrain_mm_mlp_adapter=model_args.pretrain_mm_mlp_adapter)
-        params_no_grad = [n for n, p in model.named_parameters() if not p.requires_grad]
-        if len(params_no_grad) > 0:
-            if training_args.fsdp is not None and len(training_args.fsdp) > 0:
-                if len(params_no_grad) < 10:
-                    print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}'. format(len(params_no_grad), params_no_grad))
-                else:
-                    print('[WARNING] Attempting to use FSDP while {} parameters do not require gradients: {}...(omitted)'. format(len(params_no_grad), ', '.join(params_no_grad[:10])))
-                print("[WARNING] Attempting to use FSDP with partially frozen paramters, this is experimental.")
-                print("[WARNING] As of 4/30/23, this feature requires PyTorch-nightly build.  See here for details: https://github.com/haotian-liu/LLaVA#experimental-use-fsdp-to-save-memory-in-pretraining")
-                from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP
-                def patch_FSDP_use_orig_params(func):
-                    def wrap_func(*args, **kwargs):
-                        use_orig_params = kwargs.pop('use_orig_params', True)
-                        return func(*args, **kwargs, use_orig_params=use_orig_params)
-                    return wrap_func
-                FSDP.__init__ = patch_FSDP_use_orig_params(FSDP.__init__)
-    if training_args.bits in [4, 8]:
-        from peft.tuners.lora import LoraLayer
-        for name, module in model.named_modules():
-            if isinstance(module, LoraLayer):
-                if training_args.bf16:
-                    module = module.to(torch.bfloat16)
-            if 'norm' in name:
-                module = module.to(torch.float32)
-            if 'lm_head' in name or 'embed_tokens' in name:
-                if hasattr(module, 'weight'):
-                    if training_args.bf16 and module.weight.dtype == torch.float32:
-                        module = module.to(torch.bfloat16)
-    # start for MGIE
-    os.makedirs('_log', exist_ok=True)
-    pt = {}
-    for i in tqdm(range(2)): pt.update(torch.load('./_ckpt/LLaVA-7B-v1/pytorch_model-0000%d-of-00002.bin'%(i+1), map_location='cpu'))
-    miss, unexp = model.load_state_dict(pt, strict=False)
-    print('miss:', miss), print('unexp:', unexp)
-    tokenizer.add_tokens(['[IMG0]', '[IMG1]', '[IMG2]', '[IMG3]', '[IMG4]', '[IMG5]', '[IMG6]', '[IMG7]'], special_tokens=True)
-    model.resize_token_embeddings(len(tokenizer))
-    print(tokenizer), json.dump(tokenizer.get_vocab(), open('_log/vocabs.json', 'w'), indent=2)
-    for n, p in model.named_parameters():
-        if 'embed_tokens' in n or 'lm_head' in n or 'edit_head' in n or 'unet' in n: p.requires_grad = True
-        else: p.requires_grad = False
-    with open('_log/parameters.txt', 'w') as F:
-        for n, p in model.named_parameters(): F.write('%s %s %s\n'%(n, str(p.shape), str(p.requires_grad)))
-    with open('_log/args_train.txt', 'w') as F:
-        for key in vars(training_args): F.write('%s: %s\n'%(str(key), str(vars(training_args)[key])))
-    # end for MGIE
-    data_module = make_supervised_data_module(tokenizer=tokenizer,
-                                              data_args=data_args)
-    trainer = LLaVATrainer(model=model,
-                    tokenizer=tokenizer,
-                    args=training_args,
-                    **data_module)
-    if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
-        trainer.train(resume_from_checkpoint=True)
-    else:
-        trainer.train()
-    trainer.save_state()
-    if training_args.lora_enable:
-        state_dict = get_peft_state_maybe_zero_3(
-            model.named_parameters(), training_args.lora_bias
-        )
-        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
-            model.named_parameters()
-        )
-        if training_args.local_rank == 0 or training_args.local_rank == -1:
-            model.config.save_pretrained(training_args.output_dir)
-            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
-            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
-    else:
-        safe_save_model_for_hf_trainer(trainer=trainer,
-                                       output_dir=training_args.output_dir)
-if __name__ == "__main__":  # script entry point: runs only when executed directly, not when imported
-    train()