xinlai committed on
Commit
674d663
1 Parent(s): 72b53c6
.project-root ADDED
File without changes
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
-title: SEED Story
-emoji: 📚
+title: SEED Story George
+emoji: 🌍
 colorFrom: blue
-colorTo: yellow
+colorTo: pink
 sdk: gradio
 sdk_version: 4.39.0
 app_file: app.py
app.py ADDED
@@ -0,0 +1,750 @@
1
+ import os
2
+ import numpy as np
3
+ import datetime
4
+ import json
5
+ from typing import Optional
6
+ import transformers
7
+ from dataclasses import dataclass, field
8
+ import io
9
+ import spaces
10
+ import base64
11
+ from PIL import Image
12
+ import gradio as gr
13
+ import time
14
+ import hashlib
15
+
16
+ from utils import build_logger
17
+ from conversation import conv_seed_llama2
18
+
19
+ import hydra
20
+ import pyrootutils
21
+ import torch
22
+ import re
23
+ import time
24
+ from omegaconf import OmegaConf
25
+ from flask import Flask
26
+ import json
27
+ from typing import Optional
28
+ import cv2
29
+ from diffusers import AutoencoderKL, UNet2DConditionModel, EulerDiscreteScheduler, StableDiffusionImg2ImgPipeline
30
+
31
+ pyrootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
32
+
33
+ from src.data.any_res import process_anyres_image
34
+
35
+ BOI_TOKEN = '<img>'
36
+ BOP_TOKEN = '<patch>'
37
+ EOI_TOKEN = '</img>'
38
+ EOP_TOKEN = '</patch>'
39
+ IMG_TOKEN = '<img_{:05d}>'
40
+
41
+ IMG_FLAG = '<image>'
42
+ num_img_in_tokens = 64
43
+ num_img_out_tokens = 64
44
+
45
+ resolution_grids = ['1x1', '1x2', '1x3', '1x4', '1x5', '1x6', '1x10', '2x1', '3x1', '4x1', '5x1', '6x1', '10x1', '2x2',
46
+ '2x3', '3x2', '2x4', '4x2']
47
+ base_resolution = 448
48
+
49
+ app = Flask(__name__)
50
+
51
+
52
+ def decode_image(encoded_image: str) -> Image.Image:
53
+ decoded_bytes = base64.b64decode(encoded_image.encode('utf-8'))
54
+ buffer = io.BytesIO(decoded_bytes)
55
+ image = Image.open(buffer)
56
+ return image
57
+
58
+
59
+ def encode_image(image: Image.Image, format: str = 'PNG') -> str:
60
+ with io.BytesIO() as buffer:
61
+ image.save(buffer, format=format)
62
+ encoded_image = base64.b64encode(buffer.getvalue()).decode('utf-8')
63
+ return encoded_image
64
+
65
+
66
+ @dataclass
67
+ class Arguments:
68
+ image_transform: Optional[str] = field(default='configs/processer/qwen_448_transform.yaml',
69
+ metadata={"help": "config path of image transform"})
70
+ tokenizer: Optional[str] = field(default='configs/tokenizer/clm_llama_tokenizer.yaml',
71
+ metadata={"help": "config path of tokenizer used to initialize tokenizer"})
72
+ llm: Optional[str] = field(default='configs/clm_models/llama2chat7b_lora.yaml', metadata={"help": "config path of llm"})
73
+ visual_encoder: Optional[str] = field(default='configs/visual_tokenzier/qwen_vitg_448.yaml',
74
+ metadata={"help": "config path of visual encoder"})
75
+ sd_adapter: Optional[str] = field(
76
+ default='configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml',
77
+ metadata={"help": "config path of sd adapter"})
78
+ agent: Optional[str] = field(default='configs/clm_models/agent_7b_sft.yaml',
79
+ metadata={"help": "config path of agent model"})
80
+ diffusion_path: Optional[str] = field(default='stabilityai/stable-diffusion-xl-base-1.0',
81
+ metadata={"help": "diffusion model path"})
82
+ port: Optional[int] = field(default=80, metadata={"help": "network port"})
83
+ llm_device: Optional[str] = field(default='cuda:0', metadata={"help": "llm device"})
84
+ vit_sd_device: Optional[str] = field(default='cuda:0', metadata={"help": "sd and vit device"})
85
+ dtype: Optional[str] = field(default='fp16', metadata={"help": "mixed precision"})
+ has_bbox: Optional[bool] = field(default=False, metadata={"help": "whether responses may contain bounding boxes"})  # checked in generate() before extracting/visualizing bounding boxes
86
+
87
+
88
+ parser = transformers.HfArgumentParser(Arguments)
89
+ args, = parser.parse_args_into_dataclasses()
90
+
91
+
92
+ class LLMService:
93
+
94
+ def __init__(self, args) -> None:
95
+
96
+ self.llm_device = args.llm_device
97
+ self.vit_sd_device = args.vit_sd_device
98
+
99
+ dtype = args.dtype
100
+ if dtype == 'fp16':
101
+ self.dtype = torch.float16
102
+ elif dtype == 'bf16':
103
+ self.dtype = torch.bfloat16
104
+ else:
105
+ raise ValueError
106
+
107
+ image_transform_cfg = OmegaConf.load(args.image_transform)
108
+ self.image_transform = hydra.utils.instantiate(image_transform_cfg)
109
+
110
+ tokenizer_cfg = OmegaConf.load(args.tokenizer)
111
+ self.tokenizer = hydra.utils.instantiate(tokenizer_cfg)
112
+
113
+ visual_encoder_cfg = OmegaConf.load(args.visual_encoder)
114
+ self.visual_encoder = hydra.utils.instantiate(visual_encoder_cfg)
115
+ self.visual_encoder.eval().to(self.vit_sd_device, dtype=self.dtype)
116
+ print('Init visual encoder done')
117
+
118
+ llm_cfg = OmegaConf.load(args.llm)
119
+ llm = hydra.utils.instantiate(llm_cfg, torch_dtype=self.dtype)
120
+ print('Init llm done.')
121
+
122
+ agent_cfg = OmegaConf.load(args.agent)
123
+ self.agent = hydra.utils.instantiate(agent_cfg, llm=llm)
124
+
125
+ self.agent.eval().to(self.llm_device, dtype=self.dtype)
126
+ print('Init agent model done.')
127
+
128
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(args.diffusion_path, subfolder="scheduler")
129
+
130
+ vae = AutoencoderKL.from_pretrained(args.diffusion_path, subfolder="vae").to(self.vit_sd_device,
131
+ dtype=self.dtype)
132
+
133
+ unet = UNet2DConditionModel.from_pretrained(args.diffusion_path, subfolder="unet").to(self.vit_sd_device,
134
+ dtype=self.dtype)
135
+
136
+ sd_adapter_cfg = OmegaConf.load(args.sd_adapter)
137
+
138
+ self.sd_adapter = hydra.utils.instantiate(sd_adapter_cfg, unet=unet).eval().to(self.vit_sd_device,
139
+ dtype=self.dtype)
140
+
141
+ # self.sd_adapter.init_pipe(vae=vae,
142
+ # scheduler=noise_scheduler,
143
+ # visual_encoder=self.visual_encoder.cpu(),
144
+ # image_transform=self.image_transform,
145
+ # discrete_model=None,
146
+ # dtype=self.dtype,
147
+ # device="cpu")
148
+
149
+ self.sd_adapter.init_pipe(vae=vae,
150
+ scheduler=noise_scheduler,
151
+ visual_encoder=self.visual_encoder,
152
+ image_transform=self.image_transform,
153
+ discrete_model=None,
154
+ dtype=self.dtype,
155
+ device=self.vit_sd_device)
156
+
157
+ print('Init sd adapter pipe done.')
158
+
159
+ self.visual_encoder.to(self.vit_sd_device, dtype=self.dtype)
160
+
161
+ model_id_or_path = "stablediffusionapi/realistic-vision-v51"
162
+ self.vae_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, safety_checker=None,
163
+ torch_dtype=torch.float16)
164
+ # self.vae_pipe = self.vae_pipe.to(self.vit_sd_device)
165
+
166
+ self.boi_token_id = self.tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
167
+ self.eoi_token_id = self.tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
+ self.multi_resolution = False  # the any-resolution (multi-patch) path is disabled in this demo; generate() checks this flag
168
+
169
+
170
+ service = LLMService(args)
171
+
172
+
173
+ @spaces.GPU
174
+ def generate(text_list, image_list, max_new_tokens, force_boi, force_bbox, force_polish):
175
+ with torch.no_grad():
176
+ text_list = text_list.split(IMG_FLAG)
177
+ top_p = 0.5
178
+ assert len(text_list) == len(image_list) + 1
179
+
180
+ image_tokens = BOI_TOKEN + ''.join(
181
+ [IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)]) + EOI_TOKEN
182
+
183
+ input_images = []
184
+ if len(image_list) > 0:
185
+ image_tensor_list = []
186
+ embeds_cmp_mask = []
187
+ embeds_gen_mask = []
188
+
189
+ if service.multi_resolution:
190
+ patch_pos = []
191
+ image_patch_length = []
192
+ image_size_list = []
193
+
194
+ for idx, image_item in enumerate(image_list):
195
+ if isinstance(image_item, str):
196
+ image = decode_image(image_item)
197
+ print('after decode image size:', image.size)
198
+ input_images.append(image)
199
+
200
+ # if service.multi_resolution:
201
+ # image_size_list.append(image.size)
202
+ # print('image size:', image.size)
203
+ # image_tensor, patch_pos_tensor = process_anyres_image(image, service.image_transform,
204
+ # service.grid_pinpoints,
205
+ # service.base_resolution)
206
+ # image_tensor_list.append(image_tensor)
207
+ # patch_pos.append(patch_pos_tensor)
208
+ # image_patch_length.append(image_tensor.shape[0])
209
+ # print('image_patch_length', image_patch_length)
210
+ # embeds_cmp_mask.extend([True] * image_tensor.shape[0])
211
+ # embeds_gen_mask.extend([False] * image_tensor.shape[0])
212
+ #
213
+ # else:
214
+ image_tensor = service.image_transform(image)
215
+ image_tensor_list.append(image_tensor)
216
+ embeds_cmp_mask.append(True)
217
+ embeds_gen_mask.append(False)
218
+ else:
219
+ raise ValueError
220
+
221
+ if service.multi_resolution:
222
+ pixel_values = torch.cat(image_tensor_list).to(service.vit_sd_device, dtype=service.dtype)
223
+ patch_position = torch.cat(patch_pos, dim=0)
224
+
225
+ image_tokens_list = []
226
+ for patch_length in image_patch_length:
227
+ image_tokens = ''
228
+ for _ in range(patch_length - 1):
229
+ image_tokens += BOP_TOKEN + ''.join(
230
+ IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)) + EOP_TOKEN
231
+ image_tokens += BOI_TOKEN + ''.join(
232
+ IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)) + EOI_TOKEN
233
+ image_tokens_list.append(image_tokens)
234
+ else:
235
+ pixel_values = torch.stack(image_tensor_list).to(service.vit_sd_device, dtype=service.dtype)
236
+
237
+ image_embeds = service.visual_encoder(pixel_values)
238
+ image_embeds = image_embeds.to(service.llm_device)
239
+
240
+ embeds_cmp_mask = torch.tensor(embeds_cmp_mask, dtype=torch.bool).to(service.llm_device)
241
+ embeds_gen_mask = torch.tensor(embeds_gen_mask, dtype=torch.bool).to(service.llm_device)
242
+
243
+ else:
244
+ image_embeds = None
245
+ patch_position = 0
246
+ embeds_cmp_mask = None
247
+ embeds_gen_mask = None
248
+
249
+ input_text = image_tokens.join(text_list)
250
+
251
+ print('input_text:', input_text)
252
+ input_ids = service.tokenizer.encode(input_text, add_special_tokens=False)
253
+ input_ids = [service.tokenizer.bos_token_id] + input_ids
254
+
255
+ input_ids = torch.tensor(input_ids).to(service.llm_device, dtype=torch.long)
256
+ ids_cmp_mask = torch.zeros_like(input_ids, dtype=torch.bool).to(service.llm_device)
257
+ ids_gen_mask = torch.zeros_like(input_ids, dtype=torch.bool).to(service.llm_device)
258
+
259
+ boi_indices = torch.where(input_ids == service.boi_token_id)[0].tolist()
260
+ eoi_indices = torch.where(input_ids == service.eoi_token_id)[0].tolist()
261
+
262
+ for boi_idx, eoi_idx in zip(boi_indices, eoi_indices):
263
+ ids_cmp_mask[boi_idx + 1:eoi_idx] = True
264
+
265
+ input_ids = input_ids.unsqueeze(0)
266
+ ids_cmp_mask = ids_cmp_mask.unsqueeze(0)
267
+ ids_gen_mask = ids_gen_mask.unsqueeze(0)
268
+
269
+ error_msg = []
270
+
271
+ output = service.agent.generate(
272
+ tokenizer=service.tokenizer,
273
+ input_ids=input_ids,
274
+ image_embeds=image_embeds,
275
+ embeds_cmp_mask=embeds_cmp_mask,
276
+ ids_cmp_mask=ids_cmp_mask,
277
+ num_img_gen_tokens=num_img_out_tokens,
278
+ max_new_tokens=max_new_tokens,
279
+ dtype=service.dtype,
280
+ device=service.llm_device,
281
+ top_p=top_p,
282
+ )
283
+
284
+ gen_imgs_base64_list = []
285
+ generated_text = output['text']
286
+ generated_text = generated_text.replace(EOI_TOKEN, IMG_FLAG).replace(service.tokenizer.eos_token, '')
287
+
288
+ torch.cuda.empty_cache()
289
+
290
+ if output['has_img_output']:
291
+ # print('loading visual encoder and llm to CPU, and sd to GPU')
292
+ # a = time.time()
293
+ # service.agent = service.agent.cpu()
294
+ # service.sd_adapter = service.sd_adapter.to(service.vit_sd_device, dtype=service.dtype)
295
+ # print("Loading finished: ", time.time() - a)
296
+
297
+ img_gen_feat = output['img_gen_feat'].to(service.vit_sd_device, dtype=service.dtype)
298
+
299
+ for img_idx in range(output['num_gen_imgs']):
300
+ img_feat = img_gen_feat[img_idx:img_idx + 1]
301
+ generated_image = service.sd_adapter.generate(image_embeds=img_feat, num_inference_steps=50)[0]
302
+
303
+ if force_polish:
304
+ # service.sd_adapter = service.sd_adapter.cpu()
305
+ # service.vae_pipe = service.vae_pipe.to(service.vit_sd_device, dtype=service.dtype)
306
+
307
+ torch.cuda.empty_cache()
308
+
309
+ service.vae_pipe = service.vae_pipe.to(service.vit_sd_device)
310
+
311
+ init_image = generated_image.resize((1024, 1024))
312
+ prompt = ""
313
+ images = service.vae_pipe(prompt=prompt, image=init_image,
314
+ num_inference_steps=50, guidance_scale=8.0, strength=0.38).images
315
+ generated_image = images[0]
316
+
317
+ image_base64 = encode_image(generated_image)
318
+ gen_imgs_base64_list.append(image_base64)
319
+
320
+ # service.vae_pipe = service.vae_pipe.to("cpu")
321
+ # service.sd_adapter = service.sd_adapter.to(service.vit_sd_device, dtype=service.dtype)
322
+
323
+ torch.cuda.empty_cache()
324
+
325
+ # print('loading visual encoder and llm to GPU, and sd to CPU')
326
+ # a = time.time()
327
+ # service.sd_adapter = service.sd_adapter.cpu()
328
+ # service.visual_encoder = service.visual_encoder.to(service.vit_sd_device, dtype=service.dtype)
329
+ # service.agent = service.agent.to(service.vit_sd_device, dtype=service.dtype)
330
+ # print("Loading finished: ", time.time() - a)
331
+
332
+ if args.has_bbox:
333
+ bboxes = extract_box(generated_text)
334
+ if bboxes is not None and len(input_images) > 0:
335
+ image_viz = visualize_bbox(input_images[-1], bboxes)
336
+ image_base64 = encode_image(image_viz)
337
+ gen_imgs_base64_list.append(image_base64)
338
+ if '<box_start>' in generated_text:
339
+ generated_text = re.sub(r'\[\[ <box_start>.*?<box_end>.*?\]\]', 'the green bounding box',
340
+ generated_text)
341
+ else:
342
+ generated_text = re.sub(r'<loc-\d+> <loc-\d+> <loc-\d+> <loc-\d+> <box_end> \]\]',
343
+ 'the green bounding box', generated_text)
344
+ generated_text += IMG_FLAG
345
+ print(input_text + generated_text)
346
+ return {'text': generated_text, 'images': gen_imgs_base64_list, 'error_msg': error_msg}
347
+
348
+
349
+ def http_bot(dialog_state, input_state, max_new_tokens, max_turns, force_image_gen, force_bbox, force_polish,
350
+ request: gr.Request):
351
+ print('input_state:', input_state)
352
+
353
+ if len(dialog_state.messages) == 0 or dialog_state.messages[-1]['role'] != dialog_state.roles[0] or len(
354
+ dialog_state.messages[-1]['message']['text'].strip(' ?.;!/')) == 0:
355
+ return (dialog_state, input_state, dialog_state.to_gradio_chatbot()) + (no_change_btn,) * 4
356
+
357
+ if len(dialog_state.messages) > max_turns * 2:
358
+ output_state = init_input_state()
359
+ output_state['text'] = 'Error: History exceeds maximum rounds, please clear history and restart.'
360
+ dialog_state.messages.append({'role': dialog_state.roles[1], 'message': output_state})
361
+ input_state = init_input_state()
362
+ return (dialog_state, input_state, dialog_state.to_gradio_chatbot()) + (disable_btn,) * 3 + (enable_btn,)
363
+
364
+ prompt = dialog_state.get_prompt()
365
+ text = prompt['text']
366
+ max_new_tokens = int(max_new_tokens)
367
+ images = prompt['images']
368
+ force_boi = force_image_gen
369
+ force_bbox = force_bbox
370
+
371
+ results = generate(text, images, max_new_tokens, force_boi, force_bbox, force_polish)
372
+ print('response: ', {'text': results['text'], 'error_msg': results['error_msg']})
373
+
374
+ output_state = init_input_state()
375
+ image_dir = get_conv_image_dir()
376
+ output_state['text'] = results['text']
377
+
378
+ for image_base64 in results['images']:
379
+ if image_base64 == '':
380
+ image_path = ''
381
+ else:
382
+ image = decode_image(image_base64)
383
+ image = image.convert('RGB')
384
+ image_path = get_image_name(image=image, image_dir=image_dir)
385
+ if not os.path.exists(image_path):
386
+ image.save(image_path)
387
+ output_state['images'].append(image_path)
388
+
389
+ dialog_state.messages.append({'role': dialog_state.roles[1], 'message': output_state})
390
+
391
+ vote_last_response(dialog_state, 'common', request)
392
+ input_state = init_input_state()
393
+ chatbot = update_error_msg(dialog_state.to_gradio_chatbot(), results['error_msg'])
394
+ return (dialog_state, input_state, chatbot) + (enable_btn,) * 4
395
+
396
+
397
+ IMG_FLAG = '<image>'
398
+ LOGDIR = 'log'
399
+
400
+ logger = build_logger("gradio_seed_x", LOGDIR)
401
+ headers = {"User-Agent": "SEED-X Client"}
402
+
403
+ no_change_btn = gr.Button()
404
+ enable_btn = gr.Button(interactive=True)
405
+ disable_btn = gr.Button(interactive=False)
406
+
407
+ conv_seed_llama = conv_seed_llama2
408
+
409
+
410
+ def get_conv_log_filename():
411
+ t = datetime.datetime.now()
412
+ name = os.path.join(LOGDIR, f"{t.year}-{t.month:02d}-{t.day:02d}-conv.json")
413
+ return name
414
+
415
+
416
+ def get_conv_image_dir():
417
+ name = os.path.join(LOGDIR, 'images')
418
+ os.makedirs(name, exist_ok=True)
419
+ return name
420
+
421
+
422
+ def get_image_name(image, image_dir=None):
423
+ buffer = io.BytesIO()
424
+ image.save(buffer, format='PNG')
425
+ image_bytes = buffer.getvalue()
426
+ md5 = hashlib.md5(image_bytes).hexdigest()
427
+
428
+ if image_dir is not None:
429
+ image_name = os.path.join(image_dir, md5 + '.png')
430
+ else:
431
+ image_name = md5 + '.png'
432
+
433
+ return image_name
434
+
435
+
436
+ def resize_image_square(image, target_size=448):
437
+ resized_image = image.resize((target_size, target_size))
438
+ return resized_image
439
+
440
+
441
+ def resize_image(image, max_size=512):
442
+ width, height = image.size
443
+ aspect_ratio = float(width) / float(height)
444
+
445
+ if width > height:
446
+ new_width = max_size
447
+ new_height = int(new_width / aspect_ratio)
448
+ else:
449
+ new_height = max_size
450
+ new_width = int(new_height * aspect_ratio)
451
+
452
+ resized_image = image.resize((new_width, new_height))
453
+ return resized_image
454
+
455
+
456
+ def center_crop_image(image, max_aspect_ratio=1.5):
457
+ width, height = image.size
458
+ aspect_ratio = max(width, height) / min(width, height)
459
+
460
+ if aspect_ratio >= max_aspect_ratio:
461
+ if width > height:
462
+ new_width = int(height * max_aspect_ratio)
463
+ left = (width - new_width) // 2
464
+ right = (width + new_width) // 2
465
+ top = 0
466
+ bottom = height
467
+ else:
468
+ new_height = int(width * max_aspect_ratio)
469
+ left = 0
470
+ right = width
471
+ top = (height - new_height) // 2
472
+ bottom = (height + new_height) // 2
473
+
474
+ cropped_image = image.crop((left, top, right, bottom))
475
+ return cropped_image
476
+ else:
477
+ return image
478
+
479
+
480
+ def vote_last_response(state, vote_type, request: gr.Request):
481
+ with open(get_conv_log_filename(), "a") as fout:
482
+ data = {
483
+ "tstamp": round(time.time(), 4),
484
+ "type": vote_type,
485
+ "state": state.dict(),
486
+ "ip": request.client.host,
487
+ }
488
+ fout.write(json.dumps(data) + "\n")
489
+
490
+
491
+ def upvote_last_response(state, request: gr.Request):
492
+ logger.info(f"upvote. ip: {request.client.host}")
493
+ vote_last_response(state, "upvote", request)
494
+ return (disable_btn,) * 2
495
+
496
+
497
+ def downvote_last_response(state, request: gr.Request):
498
+ logger.info(f"downvote. ip: {request.client.host}")
499
+ vote_last_response(state, "downvote", request)
500
+ return (disable_btn,) * 2
501
+
502
+
503
+ def regenerate(dialog_state, request: gr.Request):
504
+ logger.info(f"regenerate. ip: {request.client.host}")
505
+ if dialog_state.messages[-1]['role'] == dialog_state.roles[1]:
506
+ dialog_state.messages.pop()
507
+ return (
508
+ dialog_state,
509
+ dialog_state.to_gradio_chatbot(),
510
+ ) + (disable_btn,) * 4
511
+
512
+
513
+ def clear_history(request: gr.Request):
514
+ logger.info(f"clear_history. ip: {request.client.host}")
515
+ dialog_state = conv_seed_llama.copy()
516
+ input_state = init_input_state()
517
+ return (dialog_state, input_state, dialog_state.to_gradio_chatbot()) + (disable_btn,) * 4
518
+
519
+
520
+ def init_input_state():
521
+ return {'images': [], 'text': ''}
522
+
523
+
524
+ def add_text(dialog_state, input_state, text, request: gr.Request):
525
+ logger.info(f"add_text. ip: {request.client.host}.")
526
+ if text is None or len(text) == 0:
527
+ return (dialog_state, input_state, "", dialog_state.to_gradio_chatbot()) + (no_change_btn,) * 4
528
+ input_state['text'] += text
529
+
530
+ if len(dialog_state.messages) > 0 and dialog_state.messages[-1]['role'] == dialog_state.roles[0]:
531
+ dialog_state.messages[-1]['message'] = input_state
532
+ else:
533
+ dialog_state.messages.append({'role': dialog_state.roles[0], 'message': input_state})
534
+ print('add_text: ', dialog_state.to_gradio_chatbot())
535
+
536
+ return (dialog_state, input_state, "", dialog_state.to_gradio_chatbot()) + (disable_btn,) * 4
537
+
538
+
539
+ def is_blank(image):
540
+ image_array = np.array(image)
541
+ unique_colors = np.unique(image_array)
542
+ print('unique_colors', len(unique_colors))
543
+ return len(unique_colors) == 1
544
+
545
+
546
+ def add_image(dialog_state, input_state, image, request: gr.Request):
547
+ logger.info(f"add_image. ip: {request.client.host}.")
548
+ if image is None:
549
+ return (dialog_state, input_state, None, dialog_state.to_gradio_chatbot()) + (no_change_btn,) * 4
550
+
551
+ image = image.convert('RGB')
552
+
553
+ print('image size:', image.size)
554
+
555
+ image = center_crop_image(image, max_aspect_ratio=10)
556
+
557
+ image_dir = get_conv_image_dir()
558
+ image_path = get_image_name(image=image, image_dir=image_dir)
559
+ if not os.path.exists(image_path):
560
+ image.save(image_path)
561
+ input_state['images'].append(image_path)
562
+ input_state['text'] += IMG_FLAG
563
+
564
+ if len(dialog_state.messages) > 0 and dialog_state.messages[-1]['role'] == dialog_state.roles[0]:
565
+ dialog_state.messages[-1]['message'] = input_state
566
+ else:
567
+ dialog_state.messages.append({'role': dialog_state.roles[0], 'message': input_state})
568
+
569
+ print('add_image:', dialog_state)
570
+
571
+ return (dialog_state, input_state, None, dialog_state.to_gradio_chatbot()) + (disable_btn,) * 4
572
+
573
+
574
+ def update_error_msg(chatbot, error_msg):
575
+ if len(error_msg) > 0:
576
+ info = '\n-------------\nSome errors occurred during response, please clear history and restart.\n' + '\n'.join(
577
+ error_msg)
578
+ chatbot[-1][-1] = chatbot[-1][-1] + info
579
+
580
+ return chatbot
581
+
582
+
583
+ def load_demo(request: gr.Request):
584
+ logger.info(f"load_demo. ip: {request.client.host}")
585
+ dialog_state = conv_seed_llama.copy()
586
+ input_state = init_input_state()
587
+ return dialog_state, input_state
588
+
589
+
590
+ title = ("""
591
+ # SEED-X-I
592
+ [[Paper]](https://arxiv.org/abs/2404.14396) [[Code]](https://github.com/AILab-CVC/SEED-X) [[Faster Demo]](https://arc.tencent.com/en/ai-demos/multimodal)
593
+
594
+ Demo of SEED-X-I (17B), a general instruction-tuned model built on the foundation model SEED-X.
595
+ SEED-X-I can follow multimodal instructions (including images with **dynamic resolutions**) and respond with **images, text and bounding boxes** in multi-turn conversations.
596
+
597
+ SEED-X-I **does not support image manipulation**. If you want to experience **SEED-X-Edit** for high-precision image editing, please refer to [[Inference Code]](https://github.com/AILab-CVC/SEED-X).
598
+
599
+ If you want to experience the normal model inference speed, you can use [[Faster Demo]](https://arc.tencent.com/en/ai-demos/multimodal) or run [[Inference Code]](https://github.com/AILab-CVC/SEED-X) locally.
600
+
601
+ ## Tips:
602
+ * Check out the conversation examples (at the bottom) for inspiration.
603
+ * You can adjust "Max History Rounds" to try a conversation with up to **three rounds (limited by the available GPU memory)**. For more turns, you can download our checkpoints from GitHub and deploy them locally for inference.
604
+ * Our demo supports a mix of images and text as input. You can freely upload an image or enter text, then click "Add Image" / "Add Text". Repeat these steps as many times as needed, and finally click "Submit" to run model inference.
605
+ * You can click "Force Image Generation" to compel the model to produce images when necessary. For example, our model might struggle to generate images when there is an excessive amount of text-only context.
606
+ * You can click "Force Bounding Box" to compel the model to produce bounding boxes for object detection.
607
+ * You can click "Force Polishing Generated Image" to compel the model to polish the generated image with image post-processing.
608
+
609
+ * SEED-X was trained with English-only data. It may handle other languages thanks to the inherent capabilities of LLaMA, but the results might not be stable.
610
+ """)
611
+
612
+ css = """
613
+ img {
614
+ font-family: 'Helvetica';
615
+ font-weight: 300;
616
+ line-height: 2;
617
+ text-align: center;
618
+
619
+ width: auto;
620
+ height: auto;
621
+ display: block;
622
+ position: relative;
623
+ }
624
+ img:before {
625
+ content: " ";
626
+ display: block;
627
+ position: absolute;
628
+ top: -10px;
629
+ left: 0;
630
+ height: calc(100% + 10px);
631
+ width: 100%;
632
+ background-color: rgb(230, 230, 230);
633
+ border: 2px dotted rgb(200, 200, 200);
634
+ border-radius: 5px;
635
+ }
636
+ img:after {
637
+ content: " ";
638
+ display: block;
639
+ font-size: 16px;
640
+ font-style: normal;
641
+ font-family: FontAwesome;
642
+ color: rgb(100, 100, 100);
643
+
644
+ position: absolute;
645
+ top: 5px;
646
+ left: 0;
647
+ width: 100%;
648
+ text-align: center;
649
+ }
650
+ """
651
+
652
+ if __name__ == '__main__':
653
+ examples_mix = [
654
+ ['https://github.com/AILab-CVC/SEED-X/blob/main/demos/bank.png?raw=true',
655
+ 'Can I connect with an advisor on Sunday?'],
656
+ ['https://github.com/AILab-CVC/SEED-X/blob/main/demos/ground.png?raw=true',
657
+ 'Is there anything in the image that can protect me from catching the flu virus when I go out? Show me the location.'],
658
+ ['https://github.com/AILab-CVC/SEED-X/blob/main/demos/arrow.jpg?raw=true',
659
+ 'What is the object pointed by the red arrow?'],
660
+ ['https://github.com/AILab-CVC/SEED-X/blob/main/demos/shanghai.png?raw=true',
661
+ 'Where was this image taken? Explain your answer.'],
662
+ ['https://github.com/AILab-CVC/SEED-X/blob/main/demos/GPT4.png?raw=true',
663
+ 'How long does it take to make GPT-4 safer?'],
664
+ ['https://github.com/AILab-CVC/SEED-X/blob/main/demos/twitter.png?raw=true',
665
+ 'Please provide a comprehensive description of this image.'],
666
+ ]
667
+ examples_text = [
668
+ ['I want to build a two story cabin in the woods, with many commanding windows. Can you show me a picture?'],
669
+ ['Use your imagination to design a concept image for Artificial General Intelligence (AGI). Show me an image.'],
670
+ [
671
+ 'Can you design an illustration for “The Three-Body Problem” to depict a scene from the novel? Show me a picture.'],
672
+ [
673
+ 'My four year old son loves toy trains. Can you design a fancy birthday cake for him? Please generate a picture.'],
674
+ [
675
+ 'Generate an image of a portrait of a young Nordic girl, age 25, freckled skin, neck tattoo, blue eyes, 35mm lens, photography, ultra details.'],
676
+ ['Generate an impressionist painting of an astronaut in a jungle.']
677
+ ]
678
+ with gr.Blocks(css=css) as demo:
679
+ gr.Markdown(title)
680
+ dialog_state = gr.State()
681
+ input_state = gr.State()
682
+ with gr.Row():
683
+ with gr.Column(scale=3):
684
+ with gr.Row():
685
+ image = gr.Image(type='pil', label='input_image')
686
+ with gr.Row():
687
+ text = gr.Textbox(lines=5,
688
+ show_label=False,
689
+ label='input_text',
690
+ elem_id='textbox',
691
+ placeholder="Enter text and image, and press submit.", container=False)
692
+ with gr.Row():
693
+ add_image_btn = gr.Button("Add Image")
694
+ add_text_btn = gr.Button("Add Text")
695
+
696
+ submit_btn = gr.Button("Submit")
697
+
698
+ with gr.Row():
699
+ max_new_tokens = gr.Slider(minimum=64,
700
+ maximum=1024,
701
+ value=768,
702
+ step=64,
703
+ interactive=True,
704
+ label="Max Output Tokens")
705
+ max_turns = gr.Slider(minimum=1, maximum=3, value=3, step=1, interactive=True,
706
+ label="Max History Rounds")
707
+ force_img_gen = gr.Radio(choices=[True, False], value=False, label='Force Image Generation')
708
+ force_bbox = gr.Radio(choices=[True, False], value=False, label='Force Bounding Box')
709
+ force_polish = gr.Radio(choices=[True, False], value=True, label='Force Polishing Generated Image')
710
+
711
+ with gr.Column(scale=7):
712
+ chatbot = gr.Chatbot(elem_id='chatbot', label="SEED-X-I", height=700)
713
+ with gr.Row():
714
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
715
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
716
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
717
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)
718
+
719
+ with gr.Row():
720
+ with gr.Column(scale=0.7):
721
+ gr.Examples(examples=examples_mix, label='Input examples', inputs=[image, text], cache_examples=False)
722
+ with gr.Column(scale=0.3):
723
+ gr.Examples(examples=examples_text, label='Input examples', inputs=[text], cache_examples=False)
724
+
725
+ # Register listeners
726
+ btn_list = [upvote_btn, downvote_btn, regenerate_btn, clear_btn]
727
+ upvote_btn.click(upvote_last_response, [dialog_state], [upvote_btn, downvote_btn])
728
+ downvote_btn.click(downvote_last_response, [dialog_state], [upvote_btn, downvote_btn])
729
+
730
+ regenerate_btn.click(regenerate, [dialog_state], [dialog_state, chatbot] + btn_list).then(
731
+ http_bot, [dialog_state, input_state, max_new_tokens, max_turns, force_img_gen, force_bbox, force_polish],
732
+ [dialog_state, input_state, chatbot] + btn_list)
733
+ add_image_btn.click(add_image, [dialog_state, input_state, image],
734
+ [dialog_state, input_state, image, chatbot] + btn_list)
735
+
736
+ add_text_btn.click(add_text, [dialog_state, input_state, text],
737
+ [dialog_state, input_state, text, chatbot] + btn_list)
738
+
739
+ submit_btn.click(
740
+ add_image, [dialog_state, input_state, image], [dialog_state, input_state, image, chatbot] + btn_list).then(
741
+ add_text, [dialog_state, input_state, text],
742
+ [dialog_state, input_state, text, chatbot, upvote_btn, downvote_btn, regenerate_btn, clear_btn]).then(
743
+ http_bot,
744
+ [dialog_state, input_state, max_new_tokens, max_turns, force_img_gen, force_bbox, force_polish],
745
+ [dialog_state, input_state, chatbot] + btn_list)
746
+ clear_btn.click(clear_history, None, [dialog_state, input_state, chatbot] + btn_list)
747
+
748
+ demo.load(load_demo, None, [dialog_state, input_state])
749
+
750
+ demo.launch(debug=True)
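
A minimal, self-contained sketch of the prompt assembly that `generate()` above performs: each `<image>` flag in the user text is replaced by a `<img> ... </img>` span holding 64 `<img_xxxxx>` placeholder tokens, whose positions are later marked in `ids_cmp_mask` and filled with visual-encoder embeddings. The helper name `splice_image_tokens` is illustrative and not defined in `app.py`.

```python
# Sketch of the image-token splicing done in generate() (no model weights needed).
BOI_TOKEN = '<img>'
EOI_TOKEN = '</img>'
IMG_TOKEN = '<img_{:05d}>'
IMG_FLAG = '<image>'
num_img_in_tokens = 64


def splice_image_tokens(prompt: str) -> str:
    # Illustrative helper: replace every <image> flag with <img><img_00000>...<img_00063></img>.
    image_tokens = BOI_TOKEN + ''.join(
        IMG_TOKEN.format(i) for i in range(num_img_in_tokens)) + EOI_TOKEN
    text_list = prompt.split(IMG_FLAG)
    return image_tokens.join(text_list)   # same join that generate() uses


if __name__ == '__main__':
    print(splice_image_tokens('[INST] What is in this picture? <image> [/INST]\n')[:120])
```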
conversation.py ADDED
@@ -0,0 +1,185 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Tuple
4
+
5
+ import io
6
+ import base64
7
+ import os
8
+ from PIL import Image
9
+ import copy
10
+
11
+ IMG_FLAG = '<image>'
12
+
13
+
14
+ class SeparatorStyle(Enum):
15
+ """Different separator style."""
16
+ SINGLE = auto()
17
+ TWO = auto()
18
+ MPT = auto()
19
+ PLAIN = auto()
20
+ LLAMA_2 = auto()
21
+
22
+
23
+ def decode_image(encoded_image: str) -> Image.Image:
24
+ decoded_bytes = base64.b64decode(encoded_image.encode('utf-8'))
25
+ buffer = io.BytesIO(decoded_bytes)
26
+ image = Image.open(buffer)
27
+ return image
28
+
29
+
30
+ def encode_image(image: Image.Image, format: str = 'PNG') -> str:
31
+ with io.BytesIO() as buffer:
32
+ image.save(buffer, format=format)
33
+ encoded_image = base64.b64encode(buffer.getvalue()).decode('utf-8')
34
+ return encoded_image
35
+
36
+
37
+ @dataclasses.dataclass
38
+ class Conversation:
39
+ """A class that keeps all conversation history."""
40
+ system: str
41
+ roles: List[str]
42
+ messages: List[dict] # multi-turn -> user & assistant -> {'images': [PIL.Image,], 'text': str}
43
+ offset: int
44
+ sep_style: SeparatorStyle = SeparatorStyle.SINGLE
45
+ sep: str = "###"
46
+ sep2: str = None
47
+ version: str = "Unknown"
48
+
49
+ skip_next: bool = False
50
+
51
+ def get_prompt(self):
52
+ messages = copy.deepcopy(self.messages)
53
+ if self.sep_style == SeparatorStyle.SINGLE:
54
+ if self.system is None or self.system == '':
55
+ text = ''
56
+ else:
57
+ text = self.system + self.sep
58
+ images = []
59
+ for message in messages:
60
+ text += message['role'] + ": " + message['message']['text'] + self.sep
61
+ for image_path in message['message']['images']:
62
+ image = Image.open(image_path).resize((256, 256))
63
+ image_base64 = encode_image(image)
64
+ images.append(image_base64)
65
+
66
+ text += self.roles[1] + ":"
67
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
68
+ b_token = "[INST] "
69
+ e_token = " [/INST]"
70
+ if self.system is None or self.system == '':
71
+ text = ''
72
+ else:
73
+ text = f"<<SYS>>\n{self.system}\n<</SYS>>\n\n"
74
+ images = []
75
+ for idx, message in enumerate(messages):
76
+ # text += message['role'] + ": " + message['message']['text'] + self.sep
77
+ if idx % 2 == 0:
78
+ text += b_token + message['message']['text'] + e_token + self.sep
79
+ else:
80
+ text += message['message']['text'] + self.sep
81
+
82
+ for image_path in message['message']['images']:
83
+ image = Image.open(image_path)
84
+ image_base64 = encode_image(image)
85
+ images.append(image_base64)
86
+ else:
87
+ raise NotImplementedError
88
+
89
+ return {'text': text, 'images': images}
90
+
91
+ # def update_image_ids(self, images_ids):
92
+ # image_count = 0
93
+ # for message in self.messages:
94
+ # for idx in range(len(message['message']['images_ids'])):
95
+ # if message['message']["images_ids"][idx] is None:
96
+ # message['message']["images_ids"][idx] = images_ids[image_count]
97
+ # image_count += 1
98
+
99
+ # assert len(images_ids) == image_count, print(len(images_ids), image_count)
100
+
101
+ def append_message(self, role, message):
102
+ self.messages.append([role, message])
103
+
104
+ def to_gradio_chatbot(self):
105
+ dialog = []
106
+ for i, single_turn in enumerate(self.messages[self.offset:]):
107
+ single_turn = single_turn['message']
108
+ text_list = single_turn['text'].split(IMG_FLAG)
109
+ assert len(text_list) == len(single_turn['images']) + 1, print(text_list, len(single_turn['images']))
110
+ message = ''
111
+ for image_idx in range(len(single_turn['images'])):
112
+ image_path = single_turn['images'][image_idx]
113
+ image = Image.open(image_path)
114
+ image_base64 = encode_image(image)
115
+ image_str = f'<img src="data:image/png;base64,{image_base64}" alt="user upload image" />'
116
+ message += text_list[image_idx] + image_str
117
+
118
+ # image_path = single_turn['images'][image_idx]
119
+ # if image_path == '':
120
+ # message += text_list[image_idx] + '<corrupt_image>'
121
+ # else:
122
+ # message += text_list[image_idx] + f'![](file={image_path})'
123
+ message += text_list[-1]
124
+
125
+ if i % 2 == 0:
126
+ dialog.append([message, None])
127
+ else:
128
+ dialog[-1][-1] = message
129
+
130
+ return dialog
131
+
132
+ def copy(self):
133
+ return Conversation(system=self.system,
134
+ roles=self.roles,
135
+ messages=copy.deepcopy(self.messages),
136
+ offset=self.offset,
137
+ sep_style=self.sep_style,
138
+ sep=self.sep,
139
+ sep2=self.sep2,
140
+ version=self.version)
141
+
142
+ def dict(self):
143
+ messages = copy.deepcopy(self.messages)
144
+ for message in messages:
145
+ for i in range(len(message['message']['images'])):
146
+ message['message']['images'][i] = os.path.basename(message['message']['images'][i])
147
+ return {
148
+ "system": self.system,
149
+ "roles": self.roles,
150
+ "messages": messages,
151
+ "offset": self.offset,
152
+ "sep": self.sep,
153
+ "sep2": self.sep2,
154
+ }
155
+
156
+
157
+ conv_seed_vicuna = Conversation(
158
+ system="",
159
+ roles=("USER", "ASSISTANT"),
160
+ version="v2",
161
+ messages=[],
162
+ offset=0,
163
+ sep_style=SeparatorStyle.SINGLE,
164
+ sep='\n',
165
+ )
166
+
167
+ conv_seed_vicuna_system = Conversation(
168
+ system="A chat between a curious user and an artificial intelligence assistant. ",
169
+ roles=("USER", "ASSISTANT"),
170
+ version="v2",
171
+ messages=[],
172
+ offset=0,
173
+ sep_style=SeparatorStyle.SINGLE,
174
+ sep='\n',
175
+ )
176
+
177
+ conv_seed_llama2 = Conversation(
178
+ system="",
179
+ roles=("[INST]", "[/INST]"),
180
+ version="v2",
181
+ messages=[],
182
+ offset=0,
183
+ sep_style=SeparatorStyle.LLAMA_2,
184
+ sep='\n',
185
+ )
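
A short usage sketch of `Conversation.get_prompt()` with the LLaMA-2 separator style used by `conv_seed_llama2` (it assumes `conversation.py` is on the import path; the example text is arbitrary).

```python
# Render a one-turn dialog in the format app.py stores it.
from conversation import conv_seed_llama2

dialog = conv_seed_llama2.copy()
dialog.messages.append({
    'role': dialog.roles[0],                                   # "[INST]"
    'message': {'text': 'Tell me a short story about a fox.', 'images': []},
})

prompt = dialog.get_prompt()
# With SeparatorStyle.LLAMA_2 and an empty system message this yields:
#   "[INST] Tell me a short story about a fox. [/INST]\n"
print(repr(prompt['text']))
print(len(prompt['images']), 'image(s) encoded as base64')
```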
src/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .datapipes import TarArchiveLoader, JsonlParserIterDataPipe
src/data/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (226 Bytes).
 
src/data/__pycache__/datapipes.cpython-38.pyc ADDED
Binary file (2.86 kB).
 
src/data/dataloader_utils.py ADDED
@@ -0,0 +1,163 @@
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ import time
9
+ import random
10
+ import torch
11
+ # from lavis.datasets.data_utils import move_to_cuda
12
+ from torch.utils.data import DataLoader
13
+
14
+
15
+ class MultiIterLoader:
16
+ """
17
+ A simple wrapper for iterating over multiple iterators.
18
+
19
+ Args:
20
+ loaders (List[Loader]): List of Iterator loaders.
21
+ ratios (List[float]): List of ratios to sample from each loader. If None, all loaders are sampled uniformly.
22
+ """
23
+
24
+ def __init__(self, loaders, ratios=None):
25
+ # assert all loaders has __next__ method
26
+ for loader in loaders:
27
+ assert hasattr(loader, "__next__"), "Loader {} has no __next__ method.".format(loader)
28
+
29
+ if ratios is None:
30
+ ratios = [1.0] * len(loaders)
31
+ else:
32
+ assert len(ratios) == len(loaders)
33
+ ratios = [float(ratio) / sum(ratios) for ratio in ratios]
34
+
35
+ self.loaders = loaders
36
+ self.ratios = ratios
37
+
38
+ def __next__(self):
39
+ # random sample from each loader by ratio
40
+ loader_idx = random.choices(range(len(self.loaders)), self.ratios, k=1)[0]
41
+ return next(self.loaders[loader_idx])
42
+
43
+ def __iter__(self):
44
+ return self
45
+
46
+
47
+ class PrefetchLoader(object):
48
+ """
49
+ Modified from https://github.com/ChenRocks/UNITER.
50
+
51
+ overlap compute and cuda data transfer
52
+ (copied and then modified from nvidia apex)
53
+ """
54
+
55
+ def __init__(self, loader):
56
+ self.loader = loader
57
+ self.stream = torch.cuda.Stream()
58
+
59
+ def __iter__(self):
60
+ loader_it = iter(self.loader)
61
+ self.preload(loader_it)
62
+ batch = self.next(loader_it)
63
+ while batch is not None:
64
+ is_tuple = isinstance(batch, tuple)
65
+ if is_tuple:
66
+ task, batch = batch
67
+
68
+ if is_tuple:
69
+ yield task, batch
70
+ else:
71
+ yield batch
72
+ batch = self.next(loader_it)
73
+
74
+ def __len__(self):
75
+ return len(self.loader)
76
+
77
+ def preload(self, it):
78
+ try:
79
+ self.batch = next(it)
80
+ except StopIteration:
81
+ self.batch = None
82
+ return
83
+ # if record_stream() doesn't work, another option is to make sure
84
+ # device inputs are created on the main stream.
85
+ # self.next_input_gpu = torch.empty_like(self.next_input,
86
+ # device='cuda')
87
+ # self.next_target_gpu = torch.empty_like(self.next_target,
88
+ # device='cuda')
89
+ # Need to make sure the memory allocated for next_* is not still in use
90
+ # by the main stream at the time we start copying to next_*:
91
+ # self.stream.wait_stream(torch.cuda.current_stream())
92
+ # with torch.cuda.stream(self.stream):
93
+ # self.batch = move_to_cuda(self.batch)
94
+ # more code for the alternative if record_stream() doesn't work:
95
+ # copy_ will record the use of the pinned source tensor in this
96
+ # side stream.
97
+ # self.next_input_gpu.copy_(self.next_input, non_blocking=True)
98
+ # self.next_target_gpu.copy_(self.next_target, non_blocking=True)
99
+ # self.next_input = self.next_input_gpu
100
+ # self.next_target = self.next_target_gpu
101
+
102
+ def next(self, it):
103
+ torch.cuda.current_stream().wait_stream(self.stream)
104
+ batch = self.batch
105
+ if batch is not None:
106
+ record_cuda_stream(batch)
107
+ self.preload(it)
108
+ return batch
109
+
110
+ def __getattr__(self, name):
111
+ method = self.loader.__getattribute__(name)
112
+ return method
113
+
114
+
115
+ def record_cuda_stream(batch):
116
+ if isinstance(batch, torch.Tensor):
117
+ batch.record_stream(torch.cuda.current_stream())
118
+ elif isinstance(batch, list) or isinstance(batch, tuple):
119
+ for t in batch:
120
+ record_cuda_stream(t)
121
+ elif isinstance(batch, dict):
122
+ for t in batch.values():
123
+ record_cuda_stream(t)
124
+ else:
125
+ pass
126
+
127
+
128
+ class IterLoader:
129
+ """
130
+ A wrapper to convert DataLoader as an infinite iterator.
131
+
132
+ Modified from:
133
+ https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/iter_based_runner.py
134
+ """
135
+
136
+ def __init__(self, dataloader: DataLoader, use_distributed: bool = False):
137
+ self._dataloader = dataloader
138
+ self.iter_loader = iter(self._dataloader)
139
+ self._use_distributed = use_distributed
140
+ self._epoch = 0
141
+
142
+ @property
143
+ def epoch(self) -> int:
144
+ return self._epoch
145
+
146
+ def __next__(self):
147
+ try:
148
+ data = next(self.iter_loader)
149
+ except StopIteration:
150
+ self._epoch += 1
151
+ if hasattr(self._dataloader.sampler, "set_epoch") and self._use_distributed:
152
+ self._dataloader.sampler.set_epoch(self._epoch)
153
+ time.sleep(2) # Prevent possible deadlock during epoch transition
154
+ self.iter_loader = iter(self._dataloader)
155
+ data = next(self.iter_loader)
156
+
157
+ return data
158
+
159
+ def __iter__(self):
160
+ return self
161
+
162
+ def __len__(self):
163
+ return len(self._dataloader)
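
A toy usage sketch for `IterLoader` and `MultiIterLoader` above (it assumes the repository root is on `PYTHONPATH` so `src.data.dataloader_utils` is importable; the tensors are placeholders, and `PrefetchLoader` is omitted because it requires a CUDA device).

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

from src.data.dataloader_utils import IterLoader, MultiIterLoader

# IterLoader turns a finite DataLoader into an infinite iterator (restarting per epoch).
loader_a = IterLoader(DataLoader(TensorDataset(torch.zeros(8, 2)), batch_size=4))
loader_b = IterLoader(DataLoader(TensorDataset(torch.ones(8, 2)), batch_size=4))

# MultiIterLoader draws from loader_a ~3x as often as loader_b.
mixed = MultiIterLoader(loaders=[loader_a, loader_b], ratios=[3, 1])
for _ in range(5):
    (batch,) = next(mixed)
    print(batch.mean().item())   # 0.0 -> came from loader_a, 1.0 -> from loader_b
```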
src/data/datapipes.py ADDED
@@ -0,0 +1,62 @@
1
+ import torchdata.datapipes as dp
2
+ import os
3
+ import tarfile
4
+ from torchdata.datapipes.iter import TarArchiveLoader
5
+ from typing import cast, IO, Iterable, Iterator, Optional, Tuple, Dict
6
+ from torchdata.datapipes import functional_datapipe
7
+ from io import BufferedIOBase
8
+ from torchdata.datapipes.utils import StreamWrapper
9
+ from torchdata.datapipes.utils.common import validate_pathname_binary_tuple
10
+ import warnings
11
+ from torchdata.datapipes.iter import IterDataPipe
12
+ import json
13
+
14
+
15
+ @functional_datapipe("load_from_tar_wo_exception")
16
+ class TarArchiveLoaderWoException(TarArchiveLoader):
17
+
18
+ def __iter__(self) -> Iterator[Tuple[str, BufferedIOBase]]:
19
+ for data in self.datapipe:
20
+ validate_pathname_binary_tuple(data)
21
+ pathname, data_stream = data
22
+ try:
23
+ if isinstance(data_stream, StreamWrapper) and isinstance(data_stream.file_obj, tarfile.TarFile):
24
+ tar = data_stream.file_obj
25
+ else:
26
+ reading_mode = (self.mode if hasattr(data_stream, "seekable") and data_stream.seekable() else
27
+ self.mode.replace(":", "|"))
28
+ # typing.cast is used here to silence mypy's type checker
29
+ tar = tarfile.open(fileobj=cast(Optional[IO[bytes]], data_stream), mode=reading_mode)
30
+ for tarinfo in tar:
31
+ if not tarinfo.isfile():
32
+ continue
33
+ extracted_fobj = tar.extractfile(tarinfo)
34
+ if extracted_fobj is None:
35
+ warnings.warn(f"failed to extract file {tarinfo.name} from source tarfile {pathname}")
36
+ raise tarfile.ExtractError
37
+ inner_pathname = os.path.normpath(os.path.join(pathname, tarinfo.name))
38
+ yield inner_pathname, StreamWrapper(extracted_fobj, data_stream,
39
+ name=inner_pathname) # type: ignore[misc]
40
+ except Exception as e:
41
+ warnings.warn(f"Unable to extract files from corrupted tarfile stream {pathname} due to: {e}, abort!")
42
+ # raise e
43
+ finally:
44
+ if isinstance(data_stream, StreamWrapper):
45
+ data_stream.autoclose()
46
+
47
+
48
+ @functional_datapipe("parse_jsonl_files")
49
+ class JsonlParserIterDataPipe(IterDataPipe[Tuple[str, Dict]]):
50
+
51
+ def __init__(self, source_datapipe: IterDataPipe[Tuple[str, IO]], **kwargs) -> None:
52
+ self.source_datapipe: IterDataPipe[Tuple[str, IO]] = source_datapipe
53
+ self.kwargs = kwargs
54
+
55
+ def __iter__(self) -> Iterator[Tuple[str, Dict]]:
56
+ for file_name, stream in self.source_datapipe:
57
+ for idx, line in enumerate(stream):
58
+ if line.strip() != '':
59
+ try:
60
+ yield f'{file_name}_line{idx}', json.loads(line)
61
+ except Exception as e:
62
+ warnings.warn(f"Error occurred when parsing string to JSON due to: {e}, abort!")
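
A brief usage sketch for the two datapipes registered above (the directory paths are placeholders; `.open_files()` is torchdata's built-in `FileOpener` functional, and importing `src.data` triggers the `@functional_datapipe` registrations).

```python
import torchdata.datapipes as dp

import src.data  # noqa: F401  (side effect: registers the custom datapipes)

# Yields (f'{file}_line{idx}', dict) pairs from every *.jsonl file under a folder.
jsonl_pipe = (dp.iter.FileLister(root='data/annotations', masks='*.jsonl', recursive=True)
              .open_files(mode='r')
              .parse_jsonl_files())

# Yields (inner_path, stream) pairs from tar shards; corrupted archives are skipped
# with a warning instead of aborting the whole pipeline.
tar_pipe = (dp.iter.FileLister(root='data/shards', masks='*.tar')
            .open_files(mode='b')
            .load_from_tar_wo_exception())

for name, record in jsonl_pipe:
    print(name, sorted(record)[:3])
    break
```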
src/data/story_telling.py ADDED
@@ -0,0 +1,634 @@
1
+ import torchdata.datapipes as dp
2
+ import json
3
+ from PIL import Image
4
+ import functools
5
+ import numpy as np
6
+ import torch
7
+ import pickle
8
+ import os
9
+ import cv2
10
+ import random
11
+ from torchvision import transforms
12
+ from braceexpand import braceexpand
13
+ import hydra
14
+ from random import choice
15
+ import tarfile
16
+ from torchdata.datapipes.iter import TarArchiveLoader
17
+ from typing import cast, IO, Iterable, Iterator, Optional, Tuple, Dict
18
+ from torchdata.datapipes import functional_datapipe
19
+ from io import BufferedIOBase
20
+ from torchdata.datapipes.utils import StreamWrapper
21
+ from torchdata.datapipes.utils.common import validate_pathname_binary_tuple
22
+ import warnings
23
+ from torchdata.datapipes.iter import IterDataPipe
24
+
25
+ import pyrootutils
26
+
27
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
28
+
29
+ BOI_TOKEN = '<img>'
30
+ EOI_TOKEN = '</img>'
31
+ IMG_TOKEN = '<img_{:05d}>'
32
+
33
+ gen_prompt = [
34
+ "Please show me a picture of ",
35
+ "Please design an image of ",
36
+ "Please produce a photo of ",
37
+ "Please generate an image of ",
38
+ "Please draw a painting of ",
39
+ "I'd like to see a drawing of ",
40
+ "I'd love to see an illustration of ",
41
+ "I'd like to view an image of ",
42
+ "I want to see a picture of ",
43
+ "I would like to see a photo of ",
44
+ "Show me a photo of ",
45
+ "Generate a picture of ",
46
+ "Show me a photograph of ",
47
+ "Generate an image of ",
48
+ "Generate an image: ",
49
+ "Generate a picture: ",
50
+ "Generate a painting: ",
51
+ "Generate a photograph: ",
52
+ "Show me a photograph: ",
53
+ "Draw a picture: ",
54
+ "Draw a painting: ",
55
+ "Draw an image: ",
56
+ "Can you make an image of ",
57
+ "Can you draw a painting of ",
58
+ "Can you produce a picture of ",
59
+ "Can you generate a photo of ",
60
+ "Can you depict a picture of ",
61
+ "Can you show me an illustration of ",
62
+ ]
63
+
64
+ gen_prompt_response = [
65
+ "Here is a picture.",
66
+ "I have designed an image.",
67
+ "Here is a photo.",
68
+ "I have generated an image.",
69
+ "Here's a painting.",
70
+ "Here's a drawing.",
71
+ "Enjoy this illustration.",
72
+ "Take a look at this image.",
73
+ "Here is a picture.",
74
+ "I have created a photo.",
75
+ "Enjoy this photo.",
76
+ "I have generated a picture.",
77
+ "Here is a photograph.",
78
+ "Here's an image.",
79
+ "Certainly, here's an image.",
80
+ "Absolutely, here is a painting.",
81
+ "Sure, here is a picture.",
82
+ "Of course, here is a photo.",
83
+ "Certainly, please enjoy this picture.",
84
+ "Sure, please enjoy this illustration.",
85
+ "",
86
+ ]
87
+
88
+ jdb_filter_vocab = ['watermark', 'watermark,', 'chaos 100', 'chaos 100,']
89
+
90
+
91
+ def filter_data_with_image_ids(item):
92
+ if ('images' not in item):
93
+ # print(item['__key__'])
94
+ # print('filtered because no images')
95
+ return False
96
+ elif 'input_ids' not in item:
97
+ return False
98
+ else:
99
+ return True
100
+
101
+
102
+ def calculate_new_dimensions(height, width, target_size):
103
+ if height < width:
104
+ new_height = target_size
105
+ new_width = int(width * (target_size / height))
106
+ else:
107
+ new_width = target_size
108
+ new_height = int(height * (target_size / width))
109
+ return new_height, new_width
110
+
111
+
112
+ def unwarp_data(item):
113
+ unwarpped = {}
114
+ for key, value in item.items():
115
+ if isinstance(value, dict):
116
+ unwarpped.update(value)
117
+ elif value is not None:
118
+ unwarpped[key] = value
119
+ if 'metadata' not in unwarpped:
120
+ unwarpped['metadata'] = '{}'
121
+ # if '__key__' in unwarpped:
122
+ # unwarpped['__key__'] = unwarpped['__key__'].split('/')[-1]
123
+ return unwarpped
124
+
125
+
126
+ # def filter_data_with_similarity(item, similarity_thr=0.2, min_resolution=180, min_aspect_ratio=0.666):
127
+ def filter_data_with_similarity(item, similarity_thr=0.2, assure_text=True):
128
+ if ('images' not in item):
129
+ # print(item['__key__'])
130
+ # print('filtered because no images')
131
+ return False
132
+ elif (not item.get('filter_flag', True)):
133
+ # print(item['__key__'])
134
+ # print('filtered because filter flag.')
135
+ return False
136
+ elif assure_text and ('text' not in item):
137
+ # print(item['__key__'])
138
+ # print('filtered because assure_text')
139
+ return False
140
+ else:
141
+ metadata = json.loads(item['metadata'])
142
+
143
+ if 'all_similarities' in metadata:
144
+ similarity = max(metadata['all_similarities'])
145
+ elif 'similarity' in metadata:
146
+ similarity = metadata['similarity']
147
+ elif 'score' in metadata:
148
+ similarity = metadata['score']
149
+ elif 'SCORE' in metadata:
150
+ similarity = metadata['SCORE']
151
+ else:
152
+ similarity = None
153
+
154
+ if similarity is not None:
155
+ if similarity < similarity_thr:
156
+ # print(item['__key__'])
157
+ # print('filtered because similarity')
158
+ return False
159
+
160
+ return True
161
+
162
+
163
+ def single_turn_edit_collate(batch):
164
+ results = {}
165
+ keys = batch[0].keys()
166
+
167
+ for key in keys:
168
+ cur = [batch[i][key] for i in range(len(batch)) if batch[i][key] is not None]
169
+ if len(cur) == 0:
170
+ results[key] = None
171
+ elif isinstance(cur[0], torch.Tensor):
172
+ if key in ['embeds_gen_mask', 'embeds_cmp_mask', 'images']:
173
+ results[key] = torch.cat(cur, dim=0)
174
+ else:
175
+ results[key] = torch.stack(cur, dim=0)
176
+ else:
177
+ results[key] = cur
178
+
179
+ return results
180
+
181
+
182
+ def decode_t2i_data(item,
183
+ image_dir,
184
+ tokenizer,
185
+ image_transform=None,
186
+ sd_image_transform=None,
187
+ max_length=128,
188
+ min_resolution=400,
189
+ instruction_prompt='[INST] {instruction} [/INST]\n',
190
+ turn_sep='\n',
191
+ system_message='',
192
+ min_aspect_ratio=0.666,
193
+ num_img_in_tokens=64,
194
+ num_img_out_tokens=64):
195
+ key, value = item
196
+
197
+ if 'image' not in value or 'caption' not in value:
198
+ return {}
199
+
200
+ image_path = os.path.join(image_dir, value["image"])
201
+
202
+ try:
203
+ image = Image.open(image_path).convert('RGB')
204
+
205
+ width, height = image.size
206
+
207
+ aspect_ratio = height / width
208
+ if height < min_resolution or width < min_resolution:
209
+ print(f'filtered because resolution: ({width},{height})')
210
+ return {}
211
+ if aspect_ratio < min_aspect_ratio or aspect_ratio > 1 / min_aspect_ratio:
212
+ print(f'filtered because aspect ratio: ({width},{height})')
213
+ return {}
214
+ ### SD related
215
+
216
+ image_data = {}
217
+
218
+ if sd_image_transform is not None:
219
+ # image_data['original_sizes'] = torch.tensor([height, width])
220
+ sd_image_tensor = sd_image_transform(image)
221
+ target_size = sd_image_tensor.shape[-2]
222
+ target_height, target_width = calculate_new_dimensions(height=height, width=width, target_size=target_size)  # function returns (new_height, new_width)
223
+ y1 = max(0, int(round((target_height - target_size) / 2.0)))
224
+ x1 = max(0, int(round((target_width - target_size) / 2.0)))
225
+ # image_data['crop_top_lefts'] = torch.tensor([y1, x1])
226
+ image_data['time_ids'] = torch.tensor([height, width, y1, x1, target_size, target_size])
227
+
228
+ image_data['sd_images'] = sd_image_tensor
229
+
230
+ if image_transform is not None:
231
+ image = image_transform(image)
232
+
233
+ except Exception as e:
234
+ print('Error while decode image: ', e)
235
+ return {}
236
+
237
+ input_ids = []
238
+ labels = []
239
+ input_text = ''
240
+
241
+ if system_message != '':
242
+ if not system_message.endswith('\n'):
243
+ system_message += '\n'
244
+ input_text += system_message
245
+ item_ids = tokenizer.encode(system_message, add_special_tokens=False)
246
+ item_labels = [-100] * len(item_ids)
247
+ input_ids.extend(item_ids)
248
+ labels.extend(item_labels)
249
+
250
+ caption = value["caption"]
251
+
252
+ image_cmp_tokens = BOI_TOKEN + ''.join(
253
+ [IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)]) + EOI_TOKEN
254
+
255
+ image_gen_tokens = BOI_TOKEN + ''.join(
256
+ [IMG_TOKEN.format(int(item)) for item in range(num_img_out_tokens)]) + EOI_TOKEN
257
+
258
+ instruction = instruction_prompt.format_map({'instruction': caption})
259
+
260
+ response = image_gen_tokens
261
+ images = torch.stack([image], dim=0)
262
+ # print(instruction)
263
+
264
+ item_ids = tokenizer.encode(instruction, add_special_tokens=False)
265
+ item_labels = [-100] * len(item_ids)
266
+ input_text += instruction
267
+ input_ids.extend(item_ids)
268
+ labels.extend(item_labels)
269
+
270
+ item_ids = tokenizer.encode(response, add_special_tokens=False)
271
+ item_labels = item_ids
272
+ input_text += response
273
+ input_ids.extend(item_ids)
274
+ labels.extend(item_labels)
275
+
276
+ input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
277
+ attention_mask = [1] * len(input_ids)
278
+ labels = [-100] + labels + [tokenizer.eos_token_id]
279
+
280
+ boi_token_id = tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
281
+ eoi_token_id = tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
282
+ ids_cmp_mask = [False] * len(input_ids)
283
+ ids_gen_mask = [False] * len(input_ids)
284
+
285
+ embeds_cmp_mask = [False]
286
+ embeds_gen_mask = [True]
287
+
288
+ # print(len(input_ids))
289
+ if len(input_ids) >= max_length:
290
+ # input_ids = input_ids[:max_length]
291
+ # attention_mask = attention_mask[:max_length]
292
+ # labels = labels[:max_length]
293
+ # ids_cmp_mask = ids_cmp_mask[:max_length]
294
+ # ids_gen_mask = ids_gen_mask[:max_length]
295
+ # print('An edit sample has been removed because of max length. input_text: ', input_text)
296
+ return {}
297
+ else:
298
+ padding_length = max_length - len(input_ids)
299
+ input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
300
+ attention_mask = attention_mask + [0] * padding_length
301
+ labels = labels + [-100] * padding_length
302
+ ids_cmp_mask = ids_cmp_mask + [False] * padding_length
303
+ ids_gen_mask = ids_gen_mask + [False] * padding_length
304
+
305
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
306
+ attention_mask = torch.tensor(attention_mask, dtype=torch.long)
307
+ labels = torch.tensor(labels, dtype=torch.long)
308
+ ids_cmp_mask = torch.tensor(ids_cmp_mask, dtype=torch.bool)
309
+ ids_gen_mask = torch.tensor(ids_gen_mask, dtype=torch.bool)
310
+ embeds_cmp_mask = torch.tensor(embeds_cmp_mask) if embeds_cmp_mask is not None else None
311
+ embeds_gen_mask = torch.tensor(embeds_gen_mask) if embeds_gen_mask is not None else None
312
+
313
+ boi_idx = torch.where(input_ids == boi_token_id)[0].tolist()
314
+ eoi_idx = torch.where(input_ids == eoi_token_id)[0].tolist()
315
+
316
+ ids_gen_mask[boi_idx[0] + 1:eoi_idx[0]] = True
317
+ labels[boi_idx[0] + 1:eoi_idx[0] + 1] = -100
318
+
319
+ ret = {
320
+ 'input_ids': input_ids,
321
+ 'attention_mask': attention_mask,
322
+ 'labels': labels,
323
+ 'ids_gen_mask': ids_gen_mask,
324
+ 'ids_cmp_mask': ids_cmp_mask,
325
+ 'embeds_gen_mask': embeds_gen_mask,
326
+ 'embeds_cmp_mask': embeds_cmp_mask,
327
+ 'images': images,
328
+ 'text': input_text,
329
+ }
330
+
331
+ ret.update(image_data)
332
+
333
+ return ret
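A minimal sanity-check sketch for the decoder above (illustration only, not part of the commit): it assumes a tokenizer that already carries the <img>, </img> and <img_xxxxx> special tokens, plus the image transforms used elsewhere in this file; 'cat.jpg' is a placeholder filename.

    sample = decode_t2i_data(('0', {'image': 'cat.jpg', 'caption': 'a cat on a sofa'}),
                             image_dir='data/image', tokenizer=tokenizer,
                             image_transform=image_transform,
                             sd_image_transform=sd_image_transform,
                             max_length=128)
    if sample:  # an empty dict means the sample was filtered (resolution, aspect ratio, or length)
        print(sample['ids_gen_mask'].sum().item())   # 64 generation slots, assuming each <img_xxxxx> is one special token
        print(sample['embeds_gen_mask'].tolist())    # [True]: the single image is a generation target, not context
        print(sample['images'].shape)                # [1, 3, H, W], assuming image_transform returns a CHW tensor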
334
+
335
+
336
+ def build_t2i_datapipe(data_dir,
337
+ image_dir,
338
+ tokenizer=None,
339
+ max_length=77,
340
+ batch_size=None,
341
+ min_resolution=180,
342
+ image_transform=None,
343
+ sd_image_transform=None,
344
+ instruction_prompt='[INST] {instruction} [/INST]\n',
345
+ turn_sep='\n',
346
+ system_message='',
347
+ min_aspect_ratio=0.666,
348
+ num_img_in_tokens=64,
349
+ num_img_out_tokens=64,
350
+ cycle_count=None):
351
+ decode_partial = functools.partial(decode_t2i_data,
352
+ image_dir=image_dir,
353
+ tokenizer=tokenizer,
354
+ image_transform=image_transform,
355
+ sd_image_transform=sd_image_transform,
356
+ max_length=max_length,
357
+ instruction_prompt=instruction_prompt,
358
+ turn_sep=turn_sep,
359
+ system_message=system_message,
360
+ min_resolution=min_resolution,
361
+ min_aspect_ratio=min_aspect_ratio,
362
+ num_img_in_tokens=num_img_in_tokens,
363
+ num_img_out_tokens=num_img_out_tokens)
364
+
365
+ filter_partial = functools.partial(filter_data_with_image_ids)
366
+
367
+ if isinstance(data_dir, str):
368
+ data_dir = list(braceexpand(data_dir))
369
+
370
+ datapipe = dp.iter.FileLister(root=data_dir, masks='*.jsonl', recursive=True)
371
+ datapipe = datapipe.shuffle()
372
+ datapipe = datapipe.cycle(count=cycle_count)
373
+ datapipe = datapipe.shuffle()
374
+ # datapipe = dp.iter.FileLister(root=data_dir, masks='0000000.tar', recursive=True)
375
+ datapipe = datapipe.sharding_filter()
376
+ # datapipe = datapipe.sharding_round_robin_dispatch(SHARDING_PRIORITIES.MULTIPROCESSING)
377
+
378
+ datapipe = datapipe.open_files(mode='r')
379
+ datapipe = datapipe.parse_jsonl_files()
380
+ datapipe = datapipe.map(decode_partial)
381
+ datapipe = datapipe.filter(filter_partial)
382
+
383
+ # datapipe = datapipe.shuffle(buffer_size=1024)
384
+ if batch_size is not None:
385
+ datapipe = datapipe.batch(batch_size)
386
+ datapipe = datapipe.collate(single_turn_edit_collate)
387
+ return datapipe
388
+
389
+
390
+ def decode_long_story_data(item,
391
+ image_dir,
392
+ tokenizer,
393
+ story_len,
394
+ image_transform=None,
395
+ sd_image_transform=None,
396
+ max_length=128,
397
+ min_resolution=400,
398
+ instruction_prompt='{instruction}',
399
+ turn_sep='\n',
400
+ system_message='',
401
+ min_aspect_ratio=0.666,
402
+ num_img_in_tokens=64,
403
+ num_img_out_tokens=64, ):
404
+ key, value = item
405
+ if 'images' not in value or 'captions' not in value:
406
+ return {}
407
+
408
+ image_paths = [os.path.join(image_dir, image_path) for image_path in value["images"]]
409
+ # assert len(image_paths) == story_len
410
+ story_len = len(image_paths)
411
+ num_image_given = random.randint(0, story_len - 2)
412
+
413
+ try:
414
+ images = []
415
+ for image_path in image_paths:
416
+ image = Image.open(image_path).convert('RGB')
417
+ images.append(image)
418
+ width, height = image.size
419
+
420
+ aspect_ratio = height / width
421
+ if height < min_resolution or width < min_resolution:
422
+ print(f'filtered because resolution: ({width},{height})')
423
+ return {}
424
+ if aspect_ratio < min_aspect_ratio or aspect_ratio > 1 / min_aspect_ratio:
425
+ print(f'filtered because aspect ratio: ({width},{height})')
426
+ return {}
427
+
428
+ image_data = {}
429
+ sd_image = images[num_image_given + 1]
430
+ if sd_image_transform is not None:
431
+ # image_data['original_sizes'] = torch.tensor([height, width])
432
+ sd_image_tensor = sd_image_transform(sd_image)
433
+ target_size = sd_image_tensor.shape[-2]
434
+ target_width, target_height = calculate_new_dimensions(height=height, width=width, target_size=target_size)
435
+ y1 = max(0, int(round((target_height - target_size) / 2.0)))
436
+ x1 = max(0, int(round((target_width - target_size) / 2.0)))
437
+ # image_data['crop_top_lefts'] = torch.tensor([y1, x1])
438
+ image_data['time_ids'] = torch.tensor([height, width, y1, x1, target_size, target_size])
439
+
440
+ image_data['sd_images'] = sd_image_tensor
441
+
442
+ if image_transform is not None:
443
+ for i in range(len(images)):
444
+ images[i] = image_transform(images[i])
445
+ images = torch.stack(images, dim=0)
446
+
447
+ except Exception as e:
448
+ print('Error while decoding image:', e)
449
+ return {}
450
+
451
+ input_ids = []
452
+ labels = []
453
+ input_text = ''
454
+
455
+ if system_message != '':
456
+ if not system_message.endswith('\n'):
457
+ system_message += '\n'
458
+ input_text += system_message
459
+ item_ids = tokenizer.encode(system_message, add_special_tokens=False)
460
+ item_labels = [-100] * len(item_ids)
461
+ input_ids.extend(item_ids)
462
+ labels.extend(item_labels)
463
+
464
+ captions_all = []
465
+ for i in range(story_len):
466
+ caption = value["captions"][i]
467
+ captions_all.append(caption)
468
+
469
+ image_cmp_tokens = BOI_TOKEN + ''.join(
470
+ [IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)]) + EOI_TOKEN
471
+
472
+ image_gen_tokens = BOI_TOKEN + ''.join(
473
+ [IMG_TOKEN.format(int(item)) for item in range(num_img_out_tokens)]) + EOI_TOKEN
474
+
475
+ instruction = instruction_prompt.format_map({'instruction': captions_all[0] + image_cmp_tokens})
476
+ for i in range(num_image_given):
477
+ instruction = instruction + "[INST]" + captions_all[i + 1] + image_cmp_tokens
478
+
479
+ response = "[INST]" + captions_all[num_image_given + 1] + image_gen_tokens
480
+
481
+ images = images[:num_image_given + 2]
482
+ # print(instruction)
483
+
484
+ item_ids = tokenizer.encode(instruction, add_special_tokens=False)
485
+ item_labels = [-100] * len(item_ids)
486
+ input_text += instruction
487
+ input_ids.extend(item_ids)
488
+ labels.extend(item_labels)
489
+
490
+ item_ids = tokenizer.encode(response, add_special_tokens=False)
491
+ item_labels = item_ids
492
+ input_text += response
493
+ input_ids.extend(item_ids)
494
+ labels.extend(item_labels)
495
+
496
+ input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
497
+ attention_mask = [1] * len(input_ids)
498
+ labels = [-100] + labels + [tokenizer.eos_token_id]
499
+
500
+ boi_token_id = tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
501
+ eoi_token_id = tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
502
+ ids_cmp_mask = [False] * len(input_ids)
503
+ ids_gen_mask = [False] * len(input_ids)
504
+
505
+ embeds_cmp_mask = [True] + [True] * num_image_given + [False]
506
+ embeds_gen_mask = [False] + [False] * num_image_given + [True]
507
+
508
+ # print(len(input_ids))
509
+ if len(input_ids) >= max_length:
510
+ # input_ids = input_ids[:max_length]
511
+ # attention_mask = attention_mask[:max_length]
512
+ # labels = labels[:max_length]
513
+ # ids_cmp_mask = ids_cmp_mask[:max_length]
514
+ # ids_gen_mask = ids_gen_mask[:max_length]
515
+ # print('An edit sample has been removed because of max length. input_text: ', input_text)
516
+ return {}
517
+ else:
518
+ padding_length = max_length - len(input_ids)
519
+ input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
520
+ attention_mask = attention_mask + [0] * padding_length
521
+ labels = labels + [-100] * padding_length
522
+ ids_cmp_mask = ids_cmp_mask + [False] * padding_length
523
+ ids_gen_mask = ids_gen_mask + [False] * padding_length
524
+
525
+ input_ids = torch.tensor(input_ids, dtype=torch.long)
526
+ attention_mask = torch.tensor(attention_mask, dtype=torch.long)
527
+ labels = torch.tensor(labels, dtype=torch.long)
528
+ ids_cmp_mask = torch.tensor(ids_cmp_mask, dtype=torch.bool)
529
+ ids_gen_mask = torch.tensor(ids_gen_mask, dtype=torch.bool)
530
+ embeds_cmp_mask = torch.tensor(embeds_cmp_mask) if embeds_cmp_mask is not None else None
531
+ embeds_gen_mask = torch.tensor(embeds_gen_mask) if embeds_gen_mask is not None else None
532
+
533
+ boi_idx = torch.where(input_ids == boi_token_id)[0].tolist()
534
+ eoi_idx = torch.where(input_ids == eoi_token_id)[0].tolist()
535
+
536
+ ids_cmp_mask[boi_idx[0] + 1:eoi_idx[0]] = True
537
+ for i in range(num_image_given):
538
+ ids_cmp_mask[boi_idx[i + 1] + 1:eoi_idx[i + 1]] = True
539
+
540
+ ids_gen_mask[boi_idx[-1] + 1:eoi_idx[-1]] = True
541
+ labels[boi_idx[-1] + 1:eoi_idx[-1] + 1] = -100
542
+
543
+ ret = {
544
+ 'input_ids': input_ids,
545
+ 'attention_mask': attention_mask,
546
+ 'labels': labels,
547
+ 'ids_gen_mask': ids_gen_mask,
548
+ 'ids_cmp_mask': ids_cmp_mask,
549
+ 'embeds_gen_mask': embeds_gen_mask,
550
+ 'embeds_cmp_mask': embeds_cmp_mask,
551
+ 'images': images,
552
+ 'text': input_text,
553
+ }
554
+
555
+ ret.update(image_data)
556
+
557
+ return ret
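For orientation (a reading of the code above, not text from the commit): with num_image_given = 2 the flattened multimodal sequence is, schematically,

    cap_0 <img>…64 cmp tokens…</img> [INST] cap_1 <img>…</img> [INST] cap_2 <img>…</img> [INST] cap_3 <img>…64 gen tokens…</img>

so embeds_cmp_mask = [True, True, True, False] marks the first three images as visual context, embeds_gen_mask = [False, False, False, True] marks the last one as the generation target, and that same target frame is the one routed through sd_image_transform for the diffusion decoder.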
558
+
559
+
560
+ def build_long_story_datapipe(data_dir,
561
+ image_dir,
562
+ tokenizer=None,
563
+ story_len=30,
564
+ max_length=77,
565
+ batch_size=None,
566
+ min_resolution=180,
567
+ image_transform=None,
568
+ sd_image_transform=None,
569
+ instruction_prompt='{instruction}',
570
+ turn_sep='\n',
571
+ system_message='',
572
+ min_aspect_ratio=0.666,
573
+ num_img_in_tokens=64,
574
+ num_img_out_tokens=64,
575
+ cycle_count=None):
576
+ decode_partial = functools.partial(decode_long_story_data,
577
+ image_dir=image_dir,
578
+ tokenizer=tokenizer,
579
+ story_len=story_len,
580
+ image_transform=image_transform,
581
+ sd_image_transform=sd_image_transform,
582
+ max_length=max_length,
583
+ instruction_prompt=instruction_prompt,
584
+ turn_sep=turn_sep,
585
+ system_message=system_message,
586
+ min_resolution=min_resolution,
587
+ min_aspect_ratio=min_aspect_ratio,
588
+ num_img_in_tokens=num_img_in_tokens,
589
+ num_img_out_tokens=num_img_out_tokens)
590
+
591
+ filter_partial = functools.partial(filter_data_with_image_ids)
592
+
593
+ if isinstance(data_dir, str):
594
+ data_dir = list(braceexpand(data_dir))
595
+
596
+ datapipe = dp.iter.FileLister(root=data_dir, masks='*.jsonl', recursive=True)
597
+ datapipe = datapipe.shuffle()
598
+ datapipe = datapipe.cycle(count=cycle_count)
599
+ datapipe = datapipe.shuffle()
600
+ # datapipe = dp.iter.FileLister(root=data_dir, masks='0000000.tar', recursive=True)
601
+ datapipe = datapipe.sharding_filter()
602
+ # datapipe = datapipe.sharding_round_robin_dispatch(SHARDING_PRIORITIES.MULTIPROCESSING)
603
+
604
+ datapipe = datapipe.open_files(mode='r')
605
+ datapipe = datapipe.parse_jsonl_files()
606
+ datapipe = datapipe.map(decode_partial)
607
+ datapipe = datapipe.filter(filter_partial)
608
+
609
+ # datapipe = datapipe.shuffle(buffer_size=1024)
610
+ if batch_size is not None:
611
+ datapipe = datapipe.batch(batch_size)
612
+ datapipe = datapipe.collate(single_turn_edit_collate)
613
+ return datapipe
614
+
615
+
616
+ def build_multi_datapipes(datapipes, tokenizer=None, image_transform=None, sd_image_transform=None,
617
+ sample_weights=None):
618
+ # assert concat_type in ['concat', 'mux_longest', 'sample']
619
+ if sample_weights is None:
620
+ sample_weights = [1] * len(datapipes)
621
+ else:
622
+ assert len(sample_weights) == len(datapipes)
623
+
624
+ datapipes = [
625
+ hydra.utils.instantiate(datapipe, tokenizer=tokenizer, image_transform=image_transform,
626
+ sd_image_transform=sd_image_transform) for datapipe in datapipes
627
+ ]
628
+
629
+ datasets_to_weights_dict = {}
630
+ for dataset, sample_weight in zip(datapipes, sample_weights):
631
+ datasets_to_weights_dict[dataset] = sample_weight
632
+ datapipe = dp.iter.SampleMultiplexer(datasets_to_weights_dict)
633
+
634
+ return datapipe
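A hedged end-to-end sketch of driving these builders outside of Hydra (illustration only, not part of the commit; the tokenizer and transforms below are placeholders — in the repo they are instantiated from the configs, and the tokenizer must already contain the <img>/<img_xxxxx> special tokens):

    from transformers import LlamaTokenizer
    from torchvision import transforms

    tokenizer = LlamaTokenizer.from_pretrained('path/to/seed-story-tokenizer')  # placeholder path
    image_transform = transforms.Compose([transforms.Resize((448, 448)), transforms.ToTensor()])
    sd_image_transform = transforms.Compose([transforms.Resize(1024),
                                             transforms.CenterCrop(1024),
                                             transforms.ToTensor()])

    pipe = build_t2i_datapipe(data_dir='data/json', image_dir='data/image',
                              tokenizer=tokenizer,
                              image_transform=image_transform,
                              sd_image_transform=sd_image_transform,
                              max_length=128, batch_size=4)
    for batch in pipe:                       # batches are collated by single_turn_edit_collate
        print(batch['input_ids'].shape)      # [4, 128]
        print(batch['images'].shape)         # [4, 3, 448, 448]; image tensors are concatenated on dim 0
        break

build_multi_datapipes plays the same role when several such pipes are mixed with sample_weights through dp.iter.SampleMultiplexer.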
src/eval/gpt_comparative_eval.py ADDED
@@ -0,0 +1,249 @@
1
+ import json
2
+ from openai import OpenAI
3
+ import ast
4
+ import time
5
+ import os
6
+ import base64
7
+ # from PIL import Image
8
+ import io
9
+
10
+ client = OpenAI(
11
+ base_url="YOUR_URL",
12
+ api_key="YOUR_KEY",
13
+ )
14
+
15
+ instruction = "Please act as an impartial judge and evaluate the quality of the generation story contents provided by two AI assistants. Your job is to evaluate which assistant's generation is better. Your evaluation should consider the coherence of the generated story images and text. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie."
16
+
17
+ # style
18
+ # instruction = "Please act as an impartial judge and evaluate the quality of the generation story contents provided by two AI assistants. Your job is to evaluate which assistant's generation is better. Your evaluation should consider the style consistency of the story images. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie."
19
+
20
+ # text engaging level
21
+ # instruction = "Please act as an impartial judge and evaluate the quality of the generation story contents provided by two AI assistants. Your job is to evaluate which assistant's generation is better. Your evaluation should consider the engaging level of the story. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie."
22
+
23
+ def api_call(messages):
24
+ try_times = 0
25
+ while try_times < 3:
26
+ try:
27
+ chat_completion = client.chat.completions.create(
28
+ messages=messages,
29
+ model="gpt-4-turbo-2024-04-09", #"gpt-4-0125-preview", #"claude-3-opus-20240229", #"gpt-4-1106-preview",
30
+ max_tokens=4096,
31
+ temperature=0.3,
32
+ # stop=['<wait to execute>']
33
+ )
34
+ success = True
35
+ break
36
+ except Exception as e:
37
+ print(f"Error during API call: {e}")
38
+ time.sleep(15)
39
+ try_times += 1
40
+ success = False
41
+ if success:
42
+ cleaned_string = chat_completion.choices[0].message.content.strip()
43
+ return cleaned_string
44
+ else:
45
+ return None
46
+
47
+
48
+ def encode_image(image_path):
49
+ with open(image_path, "rb") as image_file:
50
+ return base64.b64encode(image_file.read()).decode("utf-8")
51
+
52
+
53
+ def read_json_and_extract_content(filepath):
54
+ """
55
+ Reads a JSON file and extracts sentences and images.
56
+
57
+ Args:
58
+ filepath (str): The path to the JSON file.
59
+
60
+ Returns:
61
+ list of dict: One dictionary per story, each with 'sentences' and 'images' keys containing the matched content.
62
+ """
63
+ with open(filepath, 'r') as file:
64
+ data = json.load(file)
65
+
66
+ all_content = []
67
+ for line in data:
68
+ extracted_content = {
69
+ "sentences": [],
70
+ "images": []
71
+ }
72
+ # Matching sentences to their corresponding images using their indices
73
+ for ix in line['sentence_ixs']:
74
+ if ix == 0:
75
+ continue
76
+ extracted_content['sentences'].append(line['sentences'][ix].replace('<|beginofimage|>', ''))
77
+ extracted_content['images'].append(line['images'][ix])
78
+ all_content.append(extracted_content)
79
+
80
+ return all_content
81
+
82
+
83
+ def read_seed_content_from_folders(base_path):
84
+ """
85
+ Reads sentences from text.txt and image paths from subfolders named val_x.
86
+
87
+ Args:
88
+ base_path (str): Path to the main folder containing subfolders val_0 to val_179.
89
+
90
+ Returns:
91
+ list of dict: Each dictionary contains 'sentences' and 'images' from each subfolder.
92
+ """
93
+ contents = []
94
+
95
+ # Iterate over each possible subfolder val_0 to val_179
96
+ for i in range(180): # 0 to 179 inclusive
97
+ folder_name = f"val_{i}"
98
+ folder_path = os.path.join(base_path, folder_name)
99
+
100
+ if os.path.exists(folder_path):
101
+ content_dict = {
102
+ "sentences": [],
103
+ "images": []
104
+ }
105
+
106
+ # Read sentences from text.txt
107
+ text_file_path = os.path.join(folder_path, 'text.txt')
108
+ if os.path.isfile(text_file_path):
109
+ with open(text_file_path, 'r') as file:
110
+ content_dict['sentences'] = file.read().splitlines()[:6]
111
+ content_dict['sentences'] = [s.replace('[INST]', '') for s in content_dict['sentences']]
112
+
113
+ # Collect paths for the images ori_01 to ori_06
114
+ for j in range(1, 7): # 1 to 6 inclusive
115
+ image_name = f"ori_0{j}.jpg" # Assuming the images are in .jpg format
116
+ image_path = os.path.join(folder_path, image_name)
117
+ if os.path.isfile(image_path):
118
+ content_dict['images'].append(image_path)
119
+
120
+ # Add the content dictionary to the list if it contains any images or sentences
121
+ if content_dict['sentences'] or content_dict['images']:
122
+ contents.append(content_dict)
123
+
124
+ return contents
125
+
126
+
127
+ def evaluate_models(assistant_a, assistant_b, instruction):
128
+ # Encode all images to base64
129
+ images_a_base64 = [encode_image(img_path) for img_path in assistant_a['images'][:5]]
130
+ images_b_base64 = [encode_image(img_path) for img_path in assistant_b['images'][:5]]
131
+
132
+ # Extract the stories from both assistants
133
+ story_a = assistant_a['sentences']
134
+ story_b = assistant_b['sentences']
135
+
136
+ messages = []
137
+ # A
138
+ messages.append(
139
+ {
140
+ "role": "user",
141
+ "content": [
142
+ {
143
+ "type": "text",
144
+ "text": "Story text from Assistant A: {}\n".format(story_a[:5])
145
+ }
146
+ ]
147
+ }
148
+ )
149
+ messages.append(
150
+ {
151
+ "role": "user",
152
+ "content": [
153
+ {
154
+ "type": "text",
155
+ "text": "Images from Assistant A are encoded in base64.\n"
156
+ }
157
+ ]
158
+ }
159
+ )
160
+ for img_a in images_a_base64:
161
+ messages.append({
162
+ "role": "user",
163
+ "content": [
164
+ {
165
+ "type": "image_url",
166
+ "image_url": {"url": f"data:image/jpeg;base64,{img_a}"}
167
+ }
168
+ ]
169
+ })
170
+
171
+ # B
172
+ messages.append(
173
+ {
174
+ "role": "user",
175
+ "content": [
176
+ {
177
+ "type": "text",
178
+ "text": "Story text from Assistant B: {}\n".format(story_b[:5])
179
+ }
180
+ ]
181
+ }
182
+ )
183
+ messages.append(
184
+ {
185
+ "role": "user",
186
+ "content": [
187
+ {
188
+ "type": "text",
189
+ "text": "Images from Assistant B are encoded in base64.\n"
190
+ }
191
+ ]
192
+ }
193
+ )
194
+ for img_b in images_b_base64:
195
+ messages.append({
196
+ "role": "user",
197
+ "content": [
198
+ {
199
+ "type": "image_url",
200
+ "image_url": {"url": f"data:image/jpeg;base64,{img_b}"}
201
+ }
202
+ ]
203
+ })
204
+
205
+ # INST
206
+ messages.append(
207
+ {
208
+ "role": "user",
209
+ "content": [
210
+ {
211
+ "type": "text",
212
+ "text": instruction
213
+ }
214
+ ]
215
+ }
216
+ )
217
+ # Combine stories and encoded images into the evaluation instruction
218
+ result = api_call(messages)
219
+ print(result)
220
+ return result
221
+
222
+ def main():
223
+ # read mm json
224
+ mm_contents = read_json_and_extract_content('/group/40034/shuaisyang/seed_project/StorySalon/llm_eval/mm_eval.json')
225
+ seed_contents = read_seed_content_from_folders('/group/40034/shuaisyang/seed_project/StorySalon/llm_eval/gen_george_len7')
226
+ assert len(mm_contents) == len(seed_contents)
227
+ mm_win = 0
228
+ seed_win = 0
229
+ tie = 0
230
+ error = []
231
+ for i in range(len(mm_contents)):
232
+ # for i in range(2):
233
+ mm = mm_contents[i]
234
+ seed = seed_contents[i]
235
+ judgment = evaluate_models(mm, seed, instruction)
236
+
237
+ if "[[A]]" in judgment:
238
+ mm_win += 1
239
+ elif "[[B]]" in judgment:
240
+ seed_win += 1
241
+ elif "[[C]]" in judgment:
242
+ tie += 1
243
+ else:
244
+ error.append([i, judgment])
245
+
246
+ with open('coherence.txt', 'w') as f:
247
+ f.write("mm:{}\nseed:{}\ntie:{}\nerror:{}".format(mm_win, seed_win, tie, error))
248
+
249
+ main()
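One caveat worth noting (illustration only, not part of the commit): api_call() returns None after three failed attempts, so a more defensive version of the loop above would skip those samples before string-matching the verdict:

    judgment = evaluate_models(mm, seed, instruction)
    if judgment is None:                      # API call failed three times
        error.append([i, 'api_call failed'])
        continue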
src/eval/gpt_score_eval.py ADDED
@@ -0,0 +1,222 @@
1
+ import json
2
+ from openai import OpenAI
3
+ import ast
4
+ import time
5
+ import os
6
+ import base64
7
+ # from PIL import Image
8
+ import io
9
+ import re
10
+
11
+ client = OpenAI(
12
+ base_url="YOUR_URL",
13
+ api_key="YOUR_KEY",
14
+ )
15
+
16
+ style_instruction = "Please act as an impartial judge and evaluate the quality of the generation story contents provided by an AI assistant. Your job is to give a score out of 10. Your evaluation should consider the style consistency of the story images. Do not allow the length of the responses to influence your evaluation. Be as objective as possible. After providing your explanation, output your final score by strictly following this format: \"[[score]]\", such as \"[[7]]\"."
17
+
18
+ engage_instruction = "Please act as an impartial judge and evaluate the quality of the generation story contents provided by an AI assistant. Your job is to give a score out of 10. Your evaluation should consider the engaging level of the story. Do not allow the length of the responses to influence your evaluation. Be as objective as possible. After providing your explanation, output your final score by strictly following this format: \"[[score]]\", such as \"[[7]]\"."
19
+
20
+ coherence_instruction = "Please act as an impartial judge and evaluate the quality of the generation story contents provided by an AI assistant. Your job is to give a score out of 10. Your evaluation should consider the coherence of the generated story images and text. Do not allow the length of the responses to influence your evaluation. Be as objective as possible. After providing your explanation, output your final score by strictly following this format: \"[[score]]\", such as \"[[7]]\"."
21
+
22
+ def api_call(messages):
23
+ try_times = 0
24
+ while try_times < 3:
25
+ try:
26
+ chat_completion = client.chat.completions.create(
27
+ messages=messages,
28
+ model="gpt-4-turbo-2024-04-09", #"gpt-4-0125-preview", #"claude-3-opus-20240229", #"gpt-4-1106-preview",
29
+ max_tokens=4096,
30
+ temperature=0.3,
31
+ # stop=['<wait to execute>']
32
+ )
33
+ success = True
34
+ break
35
+ except Exception as e:
36
+ print(f"Error during API call: {e}")
37
+ time.sleep(15)
38
+ try_times += 1
39
+ success = False
40
+ if success:
41
+ cleaned_string = chat_completion.choices[0].message.content.strip()
42
+ return cleaned_string
43
+ else:
44
+ return None
45
+
46
+
47
+ def encode_image(image_path):
48
+ with open(image_path, "rb") as image_file:
49
+ return base64.b64encode(image_file.read()).decode("utf-8")
50
+
51
+
52
+ def read_json_and_extract_content(filepath):
53
+ """
54
+ Reads a JSON file and extracts sentences and images.
55
+
56
+ Args:
57
+ filepath (str): The path to the JSON file.
58
+
59
+ Returns:
60
+ list of dict: One dictionary per story, each with 'sentences' and 'images' keys containing the matched content.
61
+ """
62
+ with open(filepath, 'r') as file:
63
+ data = json.load(file)
64
+
65
+ all_content = []
66
+ for line in data:
67
+ extracted_content = {
68
+ "sentences": [],
69
+ "images": []
70
+ }
71
+ # Matching sentences to their corresponding images using their indices
72
+ for ix in line['sentence_ixs']:
73
+ if ix == 0:
74
+ continue
75
+ extracted_content['sentences'].append(line['sentences'][ix].replace('<|beginofimage|>', ''))
76
+ extracted_content['images'].append(line['images'][ix])
77
+ all_content.append(extracted_content)
78
+
79
+ return all_content
80
+
81
+
82
+ def read_seed_content_from_folders(base_path):
83
+ """
84
+ Reads sentences from text.txt and image paths from subfolders named val_x.
85
+
86
+ Args:
87
+ base_path (str): Path to the main folder containing subfolders val_0 to val_179.
88
+
89
+ Returns:
90
+ list of dict: Each dictionary contains 'sentences' and 'images' from each subfolder.
91
+ """
92
+ contents = []
93
+
94
+ # Iterate over each possible subfolder val_0 to val_179
95
+ for i in range(180): # 0 to 179 inclusive
96
+ folder_name = f"val_{i}"
97
+ folder_path = os.path.join(base_path, folder_name)
98
+
99
+ if os.path.exists(folder_path):
100
+ content_dict = {
101
+ "sentences": [],
102
+ "images": []
103
+ }
104
+
105
+ # Read sentences from text.txt
106
+ text_file_path = os.path.join(folder_path, 'text.txt')
107
+ if os.path.isfile(text_file_path):
108
+ with open(text_file_path, 'r') as file:
109
+ content_dict['sentences'] = file.read().splitlines()[:6]
110
+ content_dict['sentences'] = [s.replace('[INST]', '') for s in content_dict['sentences']]
111
+
112
+ # Collect paths for the images ori_01 to ori_06
113
+ for j in range(1, 7): # 1 to 6 inclusive
114
+ image_name = f"ori_0{j}.jpg" # Assuming the images are in .jpg format
115
+ image_path = os.path.join(folder_path, image_name)
116
+ if os.path.isfile(image_path):
117
+ content_dict['images'].append(image_path)
118
+
119
+ # Add the content dictionary to the list if it contains any images or sentences
120
+ if content_dict['sentences'] or content_dict['images']:
121
+ contents.append(content_dict)
122
+
123
+ return contents
124
+
125
+
126
+ def evaluate_models(assistant_a, instruction):
127
+ print(assistant_a, instruction)
128
+ # Encode all images to base64
129
+ images_a_base64 = [encode_image(img_path) for img_path in assistant_a['images'][:5]]
130
+
131
+ # Extract the stories from both assistants
132
+ story_a = assistant_a['sentences']
133
+
134
+ messages = []
135
+ # A
136
+ messages.append(
137
+ {
138
+ "role": "user",
139
+ "content": [
140
+ {
141
+ "type": "text",
142
+ "text": "Story text from Assistant A: {}\n".format(story_a[:5])
143
+ }
144
+ ]
145
+ }
146
+ )
147
+ messages.append(
148
+ {
149
+ "role": "user",
150
+ "content": [
151
+ {
152
+ "type": "text",
153
+ "text": "Images are encoded in base64.\n"
154
+ }
155
+ ]
156
+ }
157
+ )
158
+ for img_a in images_a_base64:
159
+ messages.append({
160
+ "role": "user",
161
+ "content": [
162
+ {
163
+ "type": "image_url",
164
+ "image_url": {"url": f"data:image/jpeg;base64,{img_a}"}
165
+ }
166
+ ]
167
+ })
168
+
169
+ # INST
170
+ messages.append(
171
+ {
172
+ "role": "user",
173
+ "content": [
174
+ {
175
+ "type": "text",
176
+ "text": instruction
177
+ }
178
+ ]
179
+ }
180
+ )
181
+ # Combine stories and encoded images into the evaluation instruction
182
+ result = api_call(messages)
183
+ print(result)
184
+ return result
185
+
186
+ def find_number_in_string(input_string):
187
+ # Regular expression to find [[number]]
188
+ pattern = r'\[\[(\d+)\]\]'
189
+ match = re.search(pattern, input_string)
190
+
191
+ if match:
192
+ return int(match.group(1)) # Return the number as an integer
193
+ else:
194
+ return None # No match found
195
+
196
+
197
+ def main():
198
+ # read mm json
199
+ # mm_contents = read_json_and_extract_content('/group/40034/shuaisyang/seed_project/StorySalon/llm_eval/mm_eval.json')
200
+ seed_contents = read_seed_content_from_folders('/group/40034/shuaisyang/seed_project/StorySalon/llm_eval/gen_george')
201
+ # assert len(mm_contents) == len(seed_contents)
202
+ # mm_win = 0
203
+ seed_win = 0
204
+ # tie = 0
205
+
206
+ error = []
207
+ metrics = ['style', 'engaging', 'coherence']
208
+ for idx, ins in enumerate((style_instruction, engage_instruction, coherence_instruction)):
209
+ total_score = 0
210
+ scores = ''
211
+ for i in range(len(seed_contents)):
212
+ seed = seed_contents[i]
213
+ judgment = evaluate_models(seed, ins)
214
+ number_found = find_number_in_string(judgment)
215
+ scores += str(number_found) + '\n'
216
+ total_score += number_found
217
+
218
+ with open('result_{}.txt'.format(metrics[idx]), 'w') as f:
219
+ f.write("total:{}\navg:{}\nscores:{}".format(total_score, total_score/len(seed_contents), scores))
220
+
221
+
222
+ main()
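Similarly (illustration only, not part of the commit): find_number_in_string('Consistent style throughout. [[7]]') returns 7, while a judgment without the [[score]] tag returns None, so guarding total_score += number_found against None avoids a TypeError on malformed replies; the per-metric totals and averages end up in result_style.txt, result_engaging.txt and result_coherence.txt.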
src/inference/gen_george.py ADDED
@@ -0,0 +1,270 @@
1
+ # flake8: noqa
2
+ import hydra
3
+ from omegaconf import OmegaConf
4
+ import torch
5
+ import os
6
+ import re
7
+ import pyrootutils
8
+ from PIL import Image, ImageDraw, ImageFont
9
+ import json
10
+ from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel, EulerDiscreteScheduler
11
+
12
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
13
+
14
+ BOI_TOKEN = '<img>'
15
+ EOI_TOKEN = '</img>'
16
+ IMG_TOKEN = '<img_{:05d}>'
17
+
18
+ device = 'cuda:0'
19
+ dtype = torch.float16
20
+ dtype_str = 'fp16'
21
+ num_img_in_tokens = 64
22
+ num_img_out_tokens = 64
23
+ instruction_prompt = '{instruction}'
24
+
25
+ tokenizer_cfg_path = 'configs/tokenizer/clm_llama_tokenizer.yaml'
26
+ image_transform_cfg_path = 'configs/processer/qwen_448_transform.yaml'
27
+ visual_encoder_cfg_path = 'configs/visual_tokenizer/qwen_vitg_448.yaml'
28
+
29
+ llm_cfg_path = 'configs/clm_models/llama2chat7b_lora.yaml'
30
+ agent_cfg_path = 'configs/clm_models/agent_7b_sft.yaml'
31
+
32
+ adapter_cfg_path = 'configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml'
33
+ discrete_model_cfg_path = 'configs/discrete_model/discrete_identity.yaml'
34
+
35
+ diffusion_model_path = 'pretrained/stable-diffusion-xl-base-1.0'
36
+
37
+ save_dir = "output"
38
+
39
+ tokenizer_cfg = OmegaConf.load(tokenizer_cfg_path)
40
+ tokenizer = hydra.utils.instantiate(tokenizer_cfg)
41
+
42
+ image_transform_cfg = OmegaConf.load(image_transform_cfg_path)
43
+ image_transform = hydra.utils.instantiate(image_transform_cfg)
44
+
45
+ visual_encoder_cfg = OmegaConf.load(visual_encoder_cfg_path)
46
+ visual_encoder = hydra.utils.instantiate(visual_encoder_cfg)
47
+ visual_encoder.eval().to(device, dtype=dtype)
48
+ print('Init visual encoder done')
49
+
50
+ llm_cfg = OmegaConf.load(llm_cfg_path)
51
+ llm = hydra.utils.instantiate(llm_cfg, torch_dtype=dtype_str)
52
+ print('Init llm done.')
53
+
54
+ agent_model_cfg = OmegaConf.load(agent_cfg_path)
55
+ agent_model = hydra.utils.instantiate(agent_model_cfg, llm=llm)
56
+
57
+ agent_model.eval().to(device, dtype=dtype)
58
+ print('Init agent model Done')
59
+
60
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(diffusion_model_path, subfolder="scheduler")
61
+ print('init vae')
62
+ vae = AutoencoderKL.from_pretrained(diffusion_model_path, subfolder="vae").to(device, dtype=dtype)
63
+ print('init unet')
64
+ unet = UNet2DConditionModel.from_pretrained(diffusion_model_path, subfolder="unet").to(device, dtype=dtype)
65
+
66
+ adapter_cfg = OmegaConf.load(adapter_cfg_path)
67
+ adapter = hydra.utils.instantiate(adapter_cfg, unet=unet).to(device, dtype=dtype).eval()
68
+ print('Init adapter done')
69
+
70
+ discrete_model_cfg = OmegaConf.load(discrete_model_cfg_path)
71
+ discrete_model = hydra.utils.instantiate(discrete_model_cfg).to(device).eval()
72
+ print('Init discrete model done')
73
+
74
+ adapter.init_pipe(vae=vae,
75
+ scheduler=noise_scheduler,
76
+ visual_encoder=visual_encoder,
77
+ image_transform=image_transform,
78
+ discrete_model=discrete_model,
79
+ dtype=dtype,
80
+ device=device)
81
+
82
+ print('Init adapter pipe done')
83
+ boi_token_id = tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
84
+ eoi_token_id = tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
85
+
86
+
87
+ def read_jsonl_to_dict(filename):
88
+ data = []
89
+ with open(filename, 'r') as file:
90
+ for line in file:
91
+ # Each line is a valid JSON object
92
+ json_object = json.loads(line)
93
+ data.append(json_object)
94
+ return data
95
+
96
+
97
+ # data
98
+ filename = 'data/json/val.jsonl'
99
+ image_root = 'data/image/george_full'
100
+ data = read_jsonl_to_dict(filename)
101
+ image_paths = [
102
+ os.path.join(image_root, d['images'][0]) for d in data
103
+ ]
104
+ questions = [
105
+ d['captions'][0] for d in data
106
+ ]
107
+
108
+
109
+ # texts = [
110
+ # d['captions'][1:] for d in data
111
+ # ]
112
+
113
+
114
+ def add_subtitle(original_image, text):
115
+ # Calculate the size of the new image
116
+ text_height = 80 # Height of the black bar for the text
117
+ new_image_size = (original_image.width, original_image.height + text_height)
118
+
119
+ # Create a new image with a black background
120
+ new_image = Image.new("RGB", new_image_size, "black")
121
+ # Paste the original image onto the new image
122
+ new_image.paste(original_image, (0, 0))
123
+
124
+ # Prepare the new image for drawing
125
+ draw = ImageDraw.Draw(new_image)
126
+
127
+ # Specify the font size and font path
128
+ font_size = 14 # Adjust font size as needed
129
+ # font = ImageFont.truetype(font_path, font_size)
130
+
131
+ # Manually split the text into two lines
132
+ line1, line2 = text[:len(text) // 2], text[len(text) // 2:]
133
+
134
+ # Update the position for the first line of text to ensure both lines are centered vertically
135
+ text_position_line1 = (10, original_image.height + (text_height - font_size) // 2)
136
+
137
+ # Define the text color
138
+ text_color = "white"
139
+
140
+ # Add the first line of text to the new image
141
+ draw.text(text_position_line1, line1, fill=text_color)
142
+
143
+ # Adjust the position for the second line of text, based on the height of the first line
144
+ text_position_line2 = (10, text_position_line1[1] + font_size)
145
+
146
+ # Add the second line of text to the new image
147
+ draw.text(text_position_line2, line2, fill=text_color)
148
+
149
+ return new_image
150
+
151
+
152
+ for j in range(len(image_paths)):
153
+ image_path = image_paths[j]
154
+ question = questions[j]
155
+ image = Image.open(image_path).convert('RGB')
156
+
157
+ save_folder = '{}/val_{}'.format(save_dir, j)
158
+
159
+ os.makedirs(save_folder, exist_ok=True)
160
+
161
+ init_image = add_subtitle(image, question)
162
+ save_path = os.path.join(save_folder, '000start_image.jpg')
163
+ init_image.save(save_path)
164
+
165
+ agent_model.llm.base_model.model.use_kv_cache_head = False
166
+ image_tensor = image_transform(image).unsqueeze(0).to(device, dtype=dtype)
167
+
168
+ image_tokens = BOI_TOKEN + ''.join([IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)]) + EOI_TOKEN
169
+
170
+ prompt = instruction_prompt.format_map({'instruction': question + image_tokens})
171
+ print(prompt)
172
+ print('*' * 20)
173
+
174
+ input_ids = tokenizer.encode(prompt, add_special_tokens=False)
175
+ input_ids = [tokenizer.bos_token_id] + input_ids
176
+
177
+ boi_idx = input_ids.index(boi_token_id)
178
+ eoi_idx = input_ids.index(eoi_token_id)
179
+
180
+ input_ids = torch.tensor(input_ids).to(device, dtype=torch.long).unsqueeze(0)
181
+
182
+ ids_cmp_mask = torch.zeros_like(input_ids, dtype=torch.bool)
183
+
184
+ ids_cmp_mask[0, boi_idx + 1:eoi_idx] = True
185
+ embeds_cmp_mask = torch.tensor([True]).to(device, dtype=torch.bool)
186
+
187
+ with torch.no_grad():
188
+ image_embeds = visual_encoder(image_tensor)
189
+ output = agent_model.generate(tokenizer=tokenizer,
190
+ input_ids=input_ids,
191
+ image_embeds=image_embeds,
192
+ embeds_cmp_mask=embeds_cmp_mask,
193
+ ids_cmp_mask=ids_cmp_mask,
194
+ max_new_tokens=500,
195
+ num_img_gen_tokens=num_img_out_tokens)
196
+ text = re.sub(r'\s*<[^>]*>\s*', ' ', output['text']).strip()
197
+
198
+ with open("{}/text.txt".format(save_folder), 'a+') as text_file:
199
+ text_file.write(text + '\n')
200
+ with open("{}/token.txt".format(save_folder), 'a+') as token_file:
201
+ token_file.write("context token: {}\n".format(input_ids.shape))
202
+ print(output['text'])
203
+ print('*' * 20)
204
+
205
+ story_len = 25
206
+ window_size = 8
207
+ text_id = 1
208
+ while output['has_img_output'] and image_embeds.shape[0] < story_len:
209
+ image_embeds_gen = output['img_gen_feat']
210
+ images_gen = adapter.generate(image_embeds=output['img_gen_feat'], num_inference_steps=50)
211
+
212
+ name = '{:02d}.jpg'.format(text_id)
213
+ save_path = os.path.join(save_folder, name)
214
+
215
+ # Open the generated image
216
+ original_image = images_gen[0]
217
+ ori_path = os.path.join(save_folder, 'ori_{:02d}.jpg'.format(text_id))
218
+ original_image.save(ori_path)
219
+
220
+ new_image = add_subtitle(original_image, text)
221
+ # Save the modified image
222
+ new_image.save(save_path)
223
+
224
+ image_embeds = torch.cat((image_embeds, image_embeds_gen), dim=0)
225
+
226
+ # image_embeds = torch.cat((image_embeds, image_embeds_gen), dim=0)
227
+
228
+ if text_id >= story_len - 1:
229
+ break
230
+
231
+ prompt = prompt + text + image_tokens
232
+ text_id += 1
233
+
234
+ input_ids = tokenizer.encode(prompt, add_special_tokens=False)
235
+ while image_embeds.shape[0] > window_size:
236
+ eoi_prompt_idx = prompt.index(EOI_TOKEN)
237
+ prompt = prompt[eoi_prompt_idx + len(EOI_TOKEN) + len('[INST]'):]
238
+ image_embeds = image_embeds[1:]
239
+ input_ids = tokenizer.encode(prompt, add_special_tokens=False)
240
+
241
+ print(prompt)
242
+ print('*' * 20)
243
+
244
+ input_ids = [tokenizer.bos_token_id] + input_ids
245
+
246
+ boi_idx = torch.where(torch.tensor(input_ids) == boi_token_id)[0].tolist()
247
+ eoi_idx = torch.where(torch.tensor(input_ids) == eoi_token_id)[0].tolist()
248
+
249
+ input_ids = torch.tensor(input_ids).to(device, dtype=torch.long).unsqueeze(0)
250
+
251
+ ids_cmp_mask = torch.zeros_like(input_ids, dtype=torch.bool)
252
+
253
+ for i in range(image_embeds.shape[0]):
254
+ ids_cmp_mask[0, boi_idx[i] + 1:eoi_idx[i]] = True
255
+ embeds_cmp_mask = torch.tensor([True] * image_embeds.shape[0]).to(device, dtype=torch.bool)
256
+
257
+ output = agent_model.generate(tokenizer=tokenizer,
258
+ input_ids=input_ids,
259
+ image_embeds=image_embeds,
260
+ embeds_cmp_mask=embeds_cmp_mask,
261
+ ids_cmp_mask=ids_cmp_mask,
262
+ max_new_tokens=500,
263
+ num_img_gen_tokens=num_img_out_tokens)
264
+ text = re.sub(r'\s*<[^>]*>\s*', ' ', output['text']).strip()
265
+ print(output['text'])
266
+ print('*' * 20)
267
+ with open("{}/text.txt".format(save_folder), 'a+') as text_file:
268
+ text_file.write(text + '\n')
269
+ with open("{}/token.txt".format(save_folder), 'a+') as token_file:
270
+ token_file.write("context token: {}\n".format(input_ids.shape))
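A note on the loop above (a reading of the code, not text from the commit): the story is unrolled for up to story_len = 25 images, but the multimodal context is kept bounded by window_size = 8. Once more than eight images are in context, the oldest "caption + <img>…</img>" segment is trimmed from the front of the prompt and the first row of image_embeds is dropped, so both the token count and the number of conditioned images stay roughly constant while the story keeps growing.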
src/inference/vis_george_sink.py ADDED
@@ -0,0 +1,320 @@
1
+ import hydra
2
+ from omegaconf import OmegaConf
3
+ import torch
4
+ import os
5
+ import re
6
+ import pyrootutils
7
+ from PIL import Image, ImageDraw, ImageFont
8
+ import json
9
+ from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel, EulerDiscreteScheduler
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ from collections import Counter
13
+ import time
14
+
15
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
16
+
17
+ BOI_TOKEN = '<img>'
18
+ EOI_TOKEN = '</img>'
19
+ IMG_TOKEN = '<img_{:05d}>'
20
+
21
+ device = 'cuda:0'
22
+ dtype = torch.float16
23
+ dtype_str = 'fp16'
24
+ num_img_in_tokens = 64
25
+ num_img_out_tokens = 64
26
+ instruction_prompt = '{instruction}'
27
+
28
+ tokenizer_cfg_path = 'configs/tokenizer/clm_llama_tokenizer.yaml'
29
+ image_transform_cfg_path = 'configs/processer/qwen_448_transform.yaml'
30
+ visual_encoder_cfg_path = 'configs/visual_tokenizer/qwen_vitg_448.yaml'
31
+
32
+ llm_cfg_path = 'configs/clm_models/llama2chat7b_lora.yaml'
33
+ agent_cfg_path = 'configs/clm_models/agent_7b_sft.yaml'
34
+
35
+ adapter_cfg_path = 'configs/detokenizer/detokenizer_sdxl_qwen_vit_adapted.yaml'
36
+ discrete_model_cfg_path = 'configs/discrete_model/discrete_identity.yaml'
37
+
38
+ diffusion_model_path = 'pretrained/stable-diffusion-xl-base-1.0'
39
+
40
+ save_dir = "output"
41
+
42
+ cache_mode = 'img_head_tail'
43
+ # init
44
+ tokenizer_cfg = OmegaConf.load(tokenizer_cfg_path)
45
+ tokenizer = hydra.utils.instantiate(tokenizer_cfg)
46
+
47
+ image_transform_cfg = OmegaConf.load(image_transform_cfg_path)
48
+ image_transform = hydra.utils.instantiate(image_transform_cfg)
49
+
50
+ visual_encoder_cfg = OmegaConf.load(visual_encoder_cfg_path)
51
+ visual_encoder = hydra.utils.instantiate(visual_encoder_cfg)
52
+ visual_encoder.eval().to(device, dtype=dtype)
53
+ print('Init visual encoder done')
54
+
55
+ llm_cfg = OmegaConf.load(llm_cfg_path)
56
+ llm = hydra.utils.instantiate(llm_cfg, torch_dtype=dtype_str)
57
+ print('Init llm done.')
58
+
59
+ agent_model_cfg = OmegaConf.load(agent_cfg_path)
60
+ agent_model = hydra.utils.instantiate(agent_model_cfg, llm=llm)
61
+
62
+ agent_model.eval().to(device, dtype=dtype)
63
+ print('Init agent model Done')
64
+
65
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(diffusion_model_path, subfolder="scheduler")
66
+ print('init vae')
67
+ vae = AutoencoderKL.from_pretrained(diffusion_model_path, subfolder="vae").to(device, dtype=dtype)
68
+ print('init unet')
69
+ unet = UNet2DConditionModel.from_pretrained(diffusion_model_path, subfolder="unet").to(device, dtype=dtype)
70
+
71
+ adapter_cfg = OmegaConf.load(adapter_cfg_path)
72
+ adapter = hydra.utils.instantiate(adapter_cfg, unet=unet).to(device, dtype=dtype).eval()
73
+ print('Init adapter done')
74
+
75
+ discrete_model_cfg = OmegaConf.load(discrete_model_cfg_path)
76
+ discrete_model = hydra.utils.instantiate(discrete_model_cfg).to(device).eval()
77
+ print('Init discrete model done')
78
+
79
+ adapter.init_pipe(vae=vae,
80
+ scheduler=noise_scheduler,
81
+ visual_encoder=visual_encoder,
82
+ image_transform=image_transform,
83
+ discrete_model=discrete_model,
84
+ dtype=dtype,
85
+ device=device)
86
+
87
+ print('Init adapter pipe done')
88
+ boi_token_id = tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
89
+ eoi_token_id = tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
90
+
91
+
92
+ def read_jsonl_to_dict(filename):
93
+ data = []
94
+ with open(filename, 'r') as file:
95
+ for line in file:
96
+ # Each line is a valid JSON object
97
+ json_object = json.loads(line)
98
+ data.append(json_object)
99
+ return data
100
+
101
+
102
+ # data
103
+ filename = 'data/json/val.jsonl'
104
+ image_root = 'data/image/george_full'
105
+ data = read_jsonl_to_dict(filename)
106
+ image_paths = [
107
+ os.path.join(image_root, d['images'][0]) for d in data
108
+ ]
109
+ starting_texts = [
110
+ d['captions'][0] for d in data
111
+ ]
112
+
113
+ texts = [
114
+ d['captions'][1:] for d in data
115
+ ]
116
+
117
+ def add_subtitle(original_image, text):
118
+ # Calculate the size of the new image
119
+ text_height = 80 # Height of the black bar for the text
120
+ new_image_size = (original_image.width, original_image.height + text_height)
121
+
122
+ # Create a new image with a black background
123
+ new_image = Image.new("RGB", new_image_size, "black")
124
+ # Paste the original image onto the new image
125
+ new_image.paste(original_image, (0, 0))
126
+
127
+ # Prepare the new image for drawing
128
+ draw = ImageDraw.Draw(new_image)
129
+
130
+ # Specify the font size and font path
131
+ font_size = 14 # Adjust font size as needed
132
+ # font = ImageFont.truetype(font_path, font_size)
133
+
134
+ # Manually split the text into two lines
135
+ line1, line2 = text[:len(text) // 2], text[len(text) // 2:]
136
+
137
+ # Update the position for the first line of text to ensure both lines are centered vertically
138
+ text_position_line1 = (10, original_image.height + (text_height - font_size) // 2)
139
+
140
+ # Define the text color
141
+ text_color = "white"
142
+
143
+ # Add the first line of text to the new image
144
+ draw.text(text_position_line1, line1, fill=text_color)
145
+
146
+ # Adjust the position for the second line of text, based on the height of the first line
147
+ text_position_line2 = (10, text_position_line1[1] + font_size)
148
+
149
+ # Add the second line of text to the new image
150
+ draw.text(text_position_line2, line2, fill=text_color)
151
+
152
+ return new_image
153
+
154
+
155
+
156
+ for j in range(len(image_paths)):
157
+ image_path = image_paths[j]
158
+ starting_text = starting_texts[j]
159
+ text_seq = texts[j]
160
+ image = Image.open(image_path).convert('RGB')
161
+
162
+ save_folder = '{}/val_{}'.format(save_dir, j)
163
+
164
+ os.makedirs(save_folder, exist_ok=True)
165
+
166
+ init_image = add_subtitle(image, starting_text)
167
+ save_path = os.path.join(save_folder, '000start_image.jpg')
168
+ init_image.save(save_path)
169
+
170
+ sink_kv_cache = []
171
+ agent_model.llm.base_model.model.kv_cache_head = None
172
+ agent_model.llm.base_model.model.past_key_values = None
173
+ agent_model.llm.base_model.model.use_kv_cache_head = False
174
+
175
+ image_tensor = image_transform(image).unsqueeze(0).to(device, dtype=dtype)
176
+
177
+ image_tokens = BOI_TOKEN + ''.join([IMG_TOKEN.format(int(item)) for item in range(num_img_in_tokens)]) + EOI_TOKEN
178
+
179
+ text = text_seq[0]
180
+ prompt = instruction_prompt.format_map({'instruction': starting_text + image_tokens}) + text
181
+ print(prompt)
182
+ print('*' * 20)
183
+
184
+ input_ids = tokenizer.encode(prompt, add_special_tokens=False)
185
+ input_ids = [tokenizer.bos_token_id] + input_ids
186
+
187
+ boi_idx = input_ids.index(boi_token_id)
188
+ eoi_idx = input_ids.index(eoi_token_id)
189
+
190
+ input_ids = torch.tensor(input_ids).to(device, dtype=torch.long).unsqueeze(0)
191
+
192
+ ids_cmp_mask = torch.zeros_like(input_ids, dtype=torch.bool)
193
+
194
+ ids_cmp_mask[0, boi_idx + 1:eoi_idx] = True
195
+ embeds_cmp_mask = torch.tensor([True]).to(device, dtype=torch.bool)
196
+
197
+ with torch.no_grad():
198
+ image_embeds = visual_encoder(image_tensor)
199
+ left = 0
200
+ right = input_ids.shape[1]
201
+ output = agent_model.generate(tokenizer=tokenizer,
202
+ input_ids=input_ids,
203
+ image_embeds=image_embeds,
204
+ embeds_cmp_mask=embeds_cmp_mask,
205
+ ids_cmp_mask=ids_cmp_mask,
206
+ max_new_tokens=500,
207
+ num_img_gen_tokens=num_img_out_tokens,
208
+ )
209
+ with open("{}/text.txt".format(save_folder), 'a+') as text_file:
210
+ text_file.write(text + '\n')
211
+ with open("{}/token.txt".format(save_folder), 'a+') as token_file:
212
+ token_file.write("context token: {} boi_idx: {}\n".format(input_ids.shape, boi_idx))
213
+
214
+ story_len = 25
215
+ window_size = 8
216
+ text_id = 1
217
+ while output['has_img_output'] and image_embeds.shape[0] < story_len:
218
+ image_embeds_gen = output['img_gen_feat']
219
+ images_gen = adapter.generate(image_embeds=output['img_gen_feat'], num_inference_steps=50)
220
+
221
+ name = '{:02d}.jpg'.format(text_id)
222
+ save_path = os.path.join(save_folder, name)
223
+
224
+ # Open the generated image
225
+ original_image = images_gen[0]
226
+ ori_path = os.path.join(save_folder, 'ori_{:02d}.jpg'.format(text_id))
227
+ original_image.save(ori_path)
228
+
229
+ new_image = add_subtitle(original_image, text)
230
+ # Save the modified image
231
+ new_image.save(save_path)
232
+
233
+ image_embeds = torch.cat((image_embeds, image_embeds_gen), dim=0)
234
+
235
+ # next gen
236
+ text = text_seq[text_id]
237
+ text_id += 1
238
+
239
+ # image_embeds = torch.cat((image_embeds, image_embeds_gen), dim=0)
240
+ if text_id >= story_len - 1:
241
+ break
242
+
243
+ past_key_values = [[kv[:, :, :input_ids.shape[1], :] for kv in l] for l in output['past_key_values']]
244
+ agent_model.llm.base_model.model.kv_cache_head = input_ids.shape[1]
245
+
246
+ prompt = prompt + image_tokens + text
247
+ next_input_ids = tokenizer.encode(image_tokens + text, add_special_tokens=False)
248
+ next_input_ids = torch.tensor(next_input_ids).to(device, dtype=torch.long).unsqueeze(0)
249
+ input_ids = torch.cat((input_ids, next_input_ids), dim=1)
250
+ left = right
251
+ right = input_ids.shape[1]
252
+
253
+
254
+ while image_embeds.shape[0] > window_size:
255
+
256
+ eoi_prompt_idx = prompt.index(EOI_TOKEN)
257
+ prompt = prompt[eoi_prompt_idx + len(EOI_TOKEN) :]
258
+
259
+ boi_idx = torch.where(input_ids == boi_token_id)[1].tolist()
260
+ eoi_idx = torch.where(input_ids == eoi_token_id)[1].tolist()
261
+
262
+ image_embeds = image_embeds[1:]
263
+ input_ids = input_ids[:, eoi_idx[0]+1:]
264
+
265
+ # slice kv cache
266
+ if cache_mode == 'img_head_tail':
267
+ if len(sink_kv_cache) == 0:
268
+ sink_kv_cache = [
269
+ [
270
+ kv[:, :, :4, :] for kv in l
271
+ ] for l in past_key_values
272
+ ]
273
+ sink_kv_cache = [
274
+ [
275
+ torch.cat(
276
+ (sink_kv_cache[l_idx][kv_idx],
277
+ kv[:, :, boi_idx[0] - 4:boi_idx[0] + 8, :],
278
+ kv[:, :, eoi_idx[0] - 8:eoi_idx[0] + 4, :]),
279
+ dim=2
280
+ ) for kv_idx, kv in enumerate(l)
281
+ ] for l_idx, l in enumerate(past_key_values)
282
+ ]
283
+ past_key_values = [
284
+ [
285
+ torch.cat(
286
+ (sink_kv_cache[l_idx][kv_idx],
287
+ kv[:, :, eoi_idx[0] + sink_kv_cache[0][0].shape[2] + 1:, :]),
288
+ dim=2
289
+ ) for kv_idx, kv in enumerate(l)
290
+ ] for l_idx, l in enumerate(past_key_values)
291
+ ]
292
+ # slice Left right
293
+ agent_model.llm.base_model.model.kv_cache_head -= eoi_idx[0] + 1
294
+ left -= eoi_idx[0] + 1
295
+ right -= eoi_idx[0] + 1
296
+
297
+ print("prompt: {}".format(prompt))
298
+ print('*' * 20)
299
+
300
+ boi_idx = torch.where(input_ids == boi_token_id)[1].tolist()
301
+ eoi_idx = torch.where(input_ids == eoi_token_id)[1].tolist()
302
+
303
+ ids_cmp_mask = torch.zeros_like(input_ids, dtype=torch.bool)
304
+
305
+ for i in range(image_embeds.shape[0]):
306
+ ids_cmp_mask[0, boi_idx[i] + 1:eoi_idx[i]] = True
307
+ embeds_cmp_mask = torch.tensor([True] * image_embeds.shape[0]).to(device, dtype=torch.bool)
308
+
309
+ output = agent_model.generate(tokenizer=tokenizer,
310
+ input_ids=input_ids,
311
+ image_embeds=image_embeds,
312
+ embeds_cmp_mask=embeds_cmp_mask,
313
+ ids_cmp_mask=ids_cmp_mask,
314
+ max_new_tokens=500,
315
+ num_img_gen_tokens=num_img_out_tokens,
316
+ past_key_values=None)
317
+ with open("{}/text.txt".format(save_folder), 'a+') as text_file:
318
+ text_file.write(text + '\n')
319
+ with open("{}/token.txt".format(save_folder), 'a+') as token_file:
320
+ token_file.write("context token: {} boi_idx: {}\n".format(input_ids.shape, boi_idx))
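A sketch of what the 'img_head_tail' cache mode above does (a reading of the code, not text from the commit): when an image falls out of the window, its key/value cache is not carried over wholesale. Instead a small attention-sink cache is accumulated that keeps the first 4 positions of the sequence plus a handful of positions around the evicted image's <img>/</img> boundaries (boi_idx[0]-4 : boi_idx[0]+8 and eoi_idx[0]-8 : eoi_idx[0]+4); the remaining entries for that image are discarded, and kv_cache_head together with the left/right pointers are shifted back by eoi_idx[0]+1 so later decoding steps index the shortened cache consistently.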
src/models/__init__.py ADDED
File without changes
src/models/discrete_models.py ADDED
@@ -0,0 +1,454 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import pyrootutils
4
+ import torch.distributed as dist
5
+ import torch.nn.functional as F
6
+
7
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
8
+ from src.train.dist_utils import concat_all_gather
9
+
10
+
11
+ def cosine_loss(rec, target):
12
+ target = target / target.norm(dim=-1, keepdim=True)
13
+ rec = rec / rec.norm(dim=-1, keepdim=True)
14
+ rec_loss = (1 - (target * rec).sum(-1)).mean()
15
+ return rec_loss
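In other words (not part of the commit), cosine_loss is mean(1 - cos(rec, target)) over the batch: 0 for perfectly aligned reconstructions and up to 2 for reconstructions pointing in the opposite direction.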
16
+
17
+
18
+ def contrastive_loss(image_feats, text_feats, logit_scale):
19
+ image_feats = image_feats.unsqueeze(1).contiguous()
20
+ image_feats_all = concat_all_gather(image_feats) # [batch_size*num_gpu, num_query_tokens, embed_dim]
21
+ text_feats_all = concat_all_gather(text_feats) # [batch_size*num_gpu, embed_dim]
22
+
23
+ sim_q2t = torch.matmul(image_feats.unsqueeze(1), text_feats_all.unsqueeze(-1)).squeeze()
24
+ # [batch_size, batch_size*num_gpu, num_query_tokens]
25
+
26
+ # image-text similarity: aggregate across all query tokens
27
+ # sim_i2t, _ = sim_q2t.max(-1)
28
+ # sim_i2t = sim_q2t.mean(-1)
29
+ sim_i2t = sim_q2t
30
+ sim_i2t = sim_i2t / logit_scale
31
+
32
+ # text-query similarity: [batch_size, batch_size*num_gpu, num_query_tokens]
33
+ sim_t2q = torch.matmul(text_feats.unsqueeze(1).unsqueeze(1), image_feats_all.permute(0, 2, 1)).squeeze()
34
+
35
+ # print(image_feats_all.shape, text_feat_all.shape, sim_q2t.shape, sim_t2q.shape)
36
+ # text-image similarity: aggregate across all query tokens
37
+ # sim_t2i, _ = sim_t2q.max(-1)
38
+ # sim_t2i = sim_t2q.mean(-1)
39
+ sim_t2i = sim_t2q
40
+ sim_t2i = sim_t2i / logit_scale # [batch_size, batch_size*num_gpu]
41
+
42
+ rank = dist.get_rank()
43
+ bs = image_feats.size(0)
44
+ targets = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to(image_feats.device)
45
+
46
+ loss_itc = (F.cross_entropy(sim_i2t, targets, label_smoothing=0.1) +
47
+ F.cross_entropy(sim_t2i, targets, label_smoothing=0.1)) / 2
48
+
49
+ i2t_acc = (sim_i2t.argmax(-1) == targets).sum() / len(sim_i2t)
50
+ t2i_acc = (sim_t2i.argmax(-1) == targets).sum() / len(sim_t2i)
51
+
52
+ return loss_itc, i2t_acc, t2i_acc
53
+
54
+
55
+ class DiscreteModleOnlyDistill(nn.Module):
56
+
57
+ def __init__(self,
58
+ qformer,
59
+ quantizer,
60
+ distiller=None,
61
+ loss_type='cosine',
62
+ scale_commit_loss=1.0,
63
+ freeze_qformer=False) -> None:
64
+ super().__init__()
65
+ self.qformer = qformer
66
+ self.quantizer = quantizer
67
+ self.distiller = distiller
68
+ self.loss_type = loss_type
69
+ self.scale_commit_loss = scale_commit_loss
70
+
71
+ self.freeze_qformer = freeze_qformer
72
+
73
+ if freeze_qformer:
74
+ self.qformer.requires_grad_(False)
75
+
76
+ def forward(self, image_embeds, input_ids=None, text_attention_mask=None, text_embeds=None):
77
+ if self.freeze_qformer:
78
+ with torch.no_grad():
79
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
80
+ else:
81
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
82
+
83
+ quantizer_output = self.quantizer(qforemr_embeds)
84
+ recon_embeds = self.distiller(quantizer_output['quant_embeds'])
85
+
86
+ if self.loss_type == 'cosine':
87
+ distill_loss = cosine_loss(recon_embeds, image_embeds)
88
+ else:
89
+ raise NotImplementedError
90
+
91
+ total_loss = distill_loss + self.scale_commit_loss * \
92
+ quantizer_output['commit_loss']
93
+
94
+ return {
95
+ 'total_loss': total_loss,
96
+ 'distill_loss': distill_loss,
97
+ 'commit_loss': quantizer_output['commit_loss'],
98
+ 'indices': quantizer_output['indices']
99
+ }
100
+
101
+ def encode_image_embeds(self, image_embeds):
102
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
103
+ quantizer_output = self.quantizer(qforemr_embeds)
104
+
105
+ output_embeds = quantizer_output['quant_embeds']
106
+ if self.distiller is not None:
107
+ output_embeds = self.distiller(output_embeds)
108
+ return output_embeds
109
+
110
+ @classmethod
111
+ def from_pretrained(cls, qformer, quantizer, distiller=None, pretrained_model_path=None, **kwargs):
112
+ model = cls(qformer=qformer, quantizer=quantizer, distiller=distiller, **kwargs)
113
+ if pretrained_model_path is not None:
114
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
115
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
116
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
117
+ return model
118
+
119
+
120
+ class DiscreteModleIdentity(nn.Module):
121
+
122
+ def __init__(self) -> None:
123
+ super().__init__()
124
+ self.model = nn.Identity()
125
+
126
+ def forward(self, image_embeds, input_ids=None, text_attention_mask=None, text_embeds=None):
127
+ return
128
+
129
+ def encode_image_embeds(self, image_embeds):
130
+ return self.model(image_embeds)
131
+
132
+
133
+ class DiscreteModleStageOneContrastive(nn.Module):
134
+
135
+ def __init__(self, qformer, quantizer=None, distiller=None, projection_dim=1024,
136
+ image_cls_token_type='last') -> None:
137
+ super().__init__()
138
+ self.qformer = qformer
139
+ self.quantizer = quantizer
140
+ self.distiller = distiller
141
+ self.image_cls_token_type = image_cls_token_type
142
+ self.logit_scale = nn.Parameter(0.07 * torch.ones([]))
143
+ self.image_proj = nn.Linear(qformer.perceiver.config.projection_dim, projection_dim, bias=False)
144
+ self.text_proj = nn.Linear(qformer.perceiver.config.projection_dim, projection_dim, bias=False)
145
+
146
+ def forward(self, image_embeds, input_ids=None, text_attention_mask=None, text_embeds=None):
147
+ image_embeds = self.qformer(image_embeds=image_embeds)
148
+ if self.image_cls_token_type == 'last':
149
+ image_embeds = image_embeds[:, -1, :]
150
+ else:
151
+ raise NotImplementedError
152
+
153
+ text_embeds = self.qformer(input_ids=input_ids, text_attention_mask=text_attention_mask)
154
+ text_embeds = text_embeds[:, 0, :]
155
+
156
+ image_embeds = F.normalize(self.image_proj(image_embeds), dim=-1)
157
+ text_embeds = F.normalize(self.text_proj(text_embeds), dim=-1)
158
+
159
+ contrast_loss, i2t_acc, t2i_acc = contrastive_loss(image_feats=image_embeds,
160
+ text_feats=text_embeds,
161
+ logit_scale=self.logit_scale)
162
+
163
+ return {
164
+ 'total_loss': contrast_loss,
165
+ 'i2t_acc': i2t_acc,
166
+ 't2i_acc': t2i_acc,
167
+ }
168
+
169
+ def encode_image_embeds(self, image_embeds):
170
+ image_embeds = self.qformer(image_embeds=image_embeds)
171
+
172
+ return image_embeds
173
+
174
+ @classmethod
175
+ def from_pretrained(cls, qformer, quantizer, distiller=None, pretrained_model_path=None, **kwargs):
176
+ model = cls(qformer=qformer, quantizer=quantizer, distiller=distiller, **kwargs)
177
+ if pretrained_model_path is not None:
178
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
179
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
180
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
181
+ return model
182
+
183
+
184
+ class DiscreteModleStageTwoContrastiveDistill(nn.Module):
185
+
186
+ def __init__(self,
187
+ qformer,
188
+ quantizer=None,
189
+ distiller=None,
190
+ contrast_head=None,
191
+ projection_dim=1024,
192
+ distill_loss_type='cosine',
193
+ freeze_qformer=True,
194
+ image_cls_token_type='last',
195
+ scale_commit_loss=1.0,
196
+ scale_contrast_loss=1.0,
197
+ scale_distill_loss=1.0) -> None:
198
+ super().__init__()
199
+ self.qformer = qformer
200
+ self.quantizer = quantizer
201
+ self.distiller = distiller
202
+ self.contrast_head = contrast_head
203
+ self.distill_loss_type = distill_loss_type
204
+ self.image_cls_token_type = image_cls_token_type
205
+ if self.contrast_head is not None:
206
+ self.logit_scale = nn.Parameter(0.07 * torch.ones([]))
207
+ self.image_proj = nn.Linear(contrast_head.perceiver.config.projection_dim, projection_dim, bias=False)
208
+ self.text_proj = nn.Linear(contrast_head.perceiver.config.projection_dim, projection_dim, bias=False)
209
+
210
+ self.freeze_qformer = freeze_qformer
211
+ if freeze_qformer:
212
+ self.qformer.requires_grad_(False)
213
+
214
+ self.scale_commit_loss = scale_commit_loss
215
+ self.scale_contrast_loss = scale_contrast_loss
216
+ self.scale_distill_loss = scale_distill_loss
217
+
218
+ def forward(self, image_embeds, input_ids=None, text_attention_mask=None, text_embeds=None):
219
+ if self.freeze_qformer:
220
+ with torch.no_grad():
221
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
222
+ else:
223
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
224
+
225
+ quantizer_output = self.quantizer(qforemr_embeds)
226
+
227
+ output_state = {}
228
+ output_state['indices'] = quantizer_output['indices']
229
+ output_state['commit_loss'] = quantizer_output['commit_loss']
230
+ output_state['total_loss'] = self.scale_commit_loss * quantizer_output['commit_loss']
231
+ if self.distiller is not None:
232
+ recon_embeds = self.distiller(quantizer_output['quant_embeds'])
233
+
234
+ if self.distill_loss_type == 'cosine':
235
+ distill_loss = cosine_loss(recon_embeds, image_embeds)
236
+ else:
237
+ raise NotImplementedError
238
+
239
+ output_state['distill_loss'] = distill_loss
240
+ output_state['total_loss'] += self.scale_distill_loss * distill_loss
241
+
242
+ if self.contrast_head is not None:
243
+ text_embeds = self.qformer(input_ids=input_ids, text_attention_mask=text_attention_mask)
244
+ text_embeds = text_embeds[:, 0, :]
245
+
246
+ image_embeds = self.contrast_head(quantizer_output['quant_embeds'])
247
+ if self.image_cls_token_type == 'last':
248
+ image_embeds = image_embeds[:, -1, :]
249
+ else:
250
+ raise NotImplementedError
251
+
252
+ image_embeds = F.normalize(self.image_proj(image_embeds), dim=-1)
253
+ text_embeds = F.normalize(self.text_proj(text_embeds), dim=-1)
254
+
255
+ contrast_loss, i2t_acc, t2i_acc = contrastive_loss(image_feats=image_embeds,
256
+ text_feats=text_embeds,
257
+ logit_scale=self.logit_scale)
258
+ output_state['contrast_loss'] = contrast_loss
259
+ output_state['total_loss'] += self.scale_contrast_loss * contrast_loss
260
+ output_state['i2t_acc'] = i2t_acc
261
+ output_state['t2i_acc'] = t2i_acc
262
+
263
+ return output_state
264
+
265
+ def encode_image_embeds(self, image_embeds):
266
+ pass
267
+
268
+ @classmethod
269
+ def from_pretrained(cls, qformer, quantizer, distiller=None, contrast_head=None, pretrained_model_path=None,
270
+ **kwargs):
271
+ model = cls(qformer=qformer, quantizer=quantizer, distiller=distiller, contrast_head=contrast_head, **kwargs)
272
+ if pretrained_model_path is not None:
273
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
274
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
275
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
276
+ return model
277
+
278
+
279
+ class DiscreteModleDistillWithDoubleContrastive(nn.Module):
280
+
281
+ def __init__(
282
+ self,
283
+ qformer,
284
+ quantizer=None,
285
+ distiller=None,
286
+ contrast_head=None,
287
+ projection_dim=1024,
288
+ distill_loss_type='cosine',
289
+ share_contrast_head=True, # share contrastive head with distiller
290
+ quantize_cls_token=False,
291
+ rec_qformer=False,
292
+ has_contrast=False,
293
+ freeze_qformer=False,
294
+ scale_commit_loss=1.0,
295
+ scale_contrast_loss=1.0,
296
+ scale_distill_loss=1.0) -> None:
297
+ super().__init__()
298
+ self.qformer = qformer
299
+ self.quantizer = quantizer
300
+ self.distiller = distiller
301
+ self.contrast_head = contrast_head
302
+ self.distill_loss_type = distill_loss_type
303
+ self.quantize_cls_token = quantize_cls_token
304
+
305
+ self.rec_qformer = rec_qformer
306
+ self.has_contrast = has_contrast
307
+
308
+ if freeze_qformer:
309
+ self.qformer.requires_grad_(False)
310
+ else:
311
+ self.logit_scale_qformer = nn.Parameter(0.07 * torch.ones([]))
312
+ self.image_proj_qformer = nn.Linear(qformer.perceiver.config.projection_dim, projection_dim, bias=False)
313
+ self.text_proj_qformer = nn.Linear(qformer.perceiver.config.projection_dim, projection_dim, bias=False)
314
+ self.cls_norm_qformer = nn.LayerNorm(qformer.perceiver.config.projection_dim)
315
+
316
+ if self.contrast_head is not None:
317
+ self.logit_scale_head = nn.Parameter(0.07 * torch.ones([]))
318
+ self.image_proj_head = nn.Linear(contrast_head.perceiver.config.projection_dim, projection_dim, bias=False)
319
+ self.text_proj_head = nn.Linear(qformer.perceiver.config.projection_dim, projection_dim, bias=False)
320
+ self.cls_norm_head = nn.LayerNorm(contrast_head.perceiver.config.projection_dim)
321
+
322
+ if share_contrast_head and distiller is not None:
323
+ self.logit_scale_head = nn.Parameter(0.07 * torch.ones([]))
324
+ self.image_proj_head = nn.Linear(distiller.perceiver.config.projection_dim, projection_dim, bias=False)
325
+ self.text_proj_head = nn.Linear(qformer.perceiver.config.projection_dim, projection_dim, bias=False)
326
+ self.cls_norm_head = nn.LayerNorm(distiller.perceiver.config.projection_dim)
327
+
328
+ self.scale_commit_loss = scale_commit_loss
329
+ self.scale_contrast_loss = scale_contrast_loss
330
+ self.scale_distill_loss = scale_distill_loss
331
+ self.share_contrast_head = share_contrast_head
332
+ self.freeze_qformer = freeze_qformer
333
+ assert int(self.share_contrast_head) + int(contrast_head is not None) <= 1
334
+
335
+ def forward(self, image_embeds, input_ids=None, text_attention_mask=None, text_embeds=None):
336
+
337
+ if self.freeze_qformer:
338
+ with torch.no_grad():
339
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
340
+ else:
341
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
342
+ qforemr_cls_embeds = qforemr_embeds[:, -1, :]
343
+
344
+ if not self.quantize_cls_token:
345
+ qforemr_embeds = qforemr_embeds[:, :-1, :]
346
+
347
+ if self.has_contrast:
348
+ text_embeds = self.qformer(input_ids=input_ids, text_attention_mask=text_attention_mask)
349
+ text_cls_embeds = text_embeds[:, 0, :]
350
+
351
+ output_state = {}
352
+ output_state['total_loss'] = 0.0
353
+
354
+ if not self.freeze_qformer and self.has_contrast:
355
+ qforemr_cls_embeds = self.cls_norm_qformer(qforemr_cls_embeds)
356
+ qformer_image_embeds = F.normalize(self.image_proj_qformer(qforemr_cls_embeds), dim=-1)
357
+ qformer_text_embeds = F.normalize(self.text_proj_qformer(text_cls_embeds), dim=-1)
358
+
359
+ qformer_contrast_loss, \
360
+ qformer_i2t_acc, \
361
+ qformer_t2i_acc = contrastive_loss(image_feats=qformer_image_embeds,
362
+ text_feats=qformer_text_embeds,
363
+ logit_scale=self.logit_scale_qformer)
364
+ output_state['qformer_contrast_loss'] = qformer_contrast_loss
365
+ output_state['total_loss'] += self.scale_contrast_loss * qformer_contrast_loss
366
+ output_state['qformer_i2t_acc'] = qformer_i2t_acc
367
+ output_state['qformer_t2i_acc'] = qformer_t2i_acc
368
+
369
+ if self.quantizer is not None and self.distiller is not None:
370
+ quantizer_output = self.quantizer(qforemr_embeds)
371
+
372
+ recon_embeds = self.distiller(quantizer_output['quant_embeds'])
373
+ if self.share_contrast_head:
374
+ contrast_head_cls_embeds = recon_embeds[:, -1, :]
375
+ contrast_head_cls_embeds = self.cls_norm_head(contrast_head_cls_embeds)
376
+ recon_embeds = recon_embeds[:, :-1, :]
377
+ if self.contrast_head is not None:
378
+ contrast_head_embeds = self.contrast_head(quantizer_output['quant_embeds'])
379
+ contrast_head_cls_embeds = contrast_head_embeds[:, -1, :]
380
+ contrast_head_cls_embeds = self.cls_norm_head(contrast_head_cls_embeds)
381
+
382
+ output_state['indices'] = quantizer_output['indices']
383
+ output_state['commit_loss'] = quantizer_output['commit_loss']
384
+ output_state['total_loss'] += self.scale_commit_loss * quantizer_output['commit_loss']
385
+
386
+ if self.rec_qformer:
387
+ target_embeds = qforemr_embeds
388
+ else:
389
+ target_embeds = image_embeds
390
+
391
+ if self.distill_loss_type == 'cosine':
392
+ distill_loss = cosine_loss(recon_embeds, target_embeds)
393
+ else:
394
+ raise NotImplementedError
395
+
396
+ output_state['distill_loss'] = distill_loss
397
+ output_state['total_loss'] += self.scale_distill_loss * distill_loss
398
+
399
+ if self.contrast_head is not None or self.share_contrast_head:
400
+ head_image_embeds = F.normalize(self.image_proj_head(contrast_head_cls_embeds), dim=-1)
401
+ head_text_embeds = F.normalize(self.text_proj_head(text_cls_embeds), dim=-1)
402
+
403
+ head_contrast_loss, head_i2t_acc, head_t2i_acc = contrastive_loss(image_feats=head_image_embeds,
404
+ text_feats=head_text_embeds,
405
+ logit_scale=self.logit_scale_head)
406
+ output_state['head_contrast_loss'] = head_contrast_loss
407
+ output_state['total_loss'] += self.scale_contrast_loss * head_contrast_loss
408
+ output_state['head_i2t_acc'] = head_i2t_acc
409
+ output_state['head_t2i_acc'] = head_t2i_acc
410
+
411
+ return output_state
412
+
413
+ def encode_image_embeds(self, image_embeds):
414
+ qforemr_embeds = self.qformer(image_embeds=image_embeds)
415
+ return qforemr_embeds
416
+
417
+ @classmethod
418
+ def from_pretrained(cls, qformer, quantizer=None, distiller=None, contrast_head=None, pretrained_model_path=None,
419
+ **kwargs):
420
+ model = cls(qformer=qformer, quantizer=quantizer, distiller=distiller, contrast_head=contrast_head, **kwargs)
421
+ if pretrained_model_path is not None:
422
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
423
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
424
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
425
+ return model
426
+
427
+ @classmethod
428
+ def from_pretrained_stage1_yuying(cls,
429
+ qformer,
430
+ quantizer=None,
431
+ distiller=None,
432
+ contrast_head=None,
433
+ pretrained_model_path=None,
434
+ **kwargs):
435
+ model = cls(qformer=qformer, quantizer=quantizer, distiller=distiller, contrast_head=contrast_head, **kwargs)
436
+ if pretrained_model_path is not None:
437
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
438
+ ckpt = ckpt['model']
439
+
440
+ new_ckpt = {}
441
+ new_ckpt['qformer.embed_module.query'] = ckpt['query_tokens'].squeeze(0)
442
+ new_ckpt['qformer.norm.weight'] = ckpt['ln_vision.weight']
443
+ new_ckpt['qformer.norm.bias'] = ckpt['ln_vision.bias']
444
+
445
+ for key in ckpt.keys():
446
+ if key.startswith('Qformer'):
447
+ new_key = key.replace('Qformer', 'qformer.perceiver')
448
+ new_ckpt[new_key] = ckpt[key]
449
+ del ckpt
450
+ missing, unexpected = model.load_state_dict(new_ckpt, strict=False)
451
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
452
+ print(missing)
453
+ print(unexpected)
454
+ return model
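The distillation objective shared by these classes is the cosine reconstruction loss defined at the top of the file. A toy check on dummy tensors (the shapes are assumptions, not the training configuration) shows its range: identical embeddings give a loss near 0, unrelated random embeddings a loss near 1.

import torch

def cosine_loss(rec, target):
    # same definition as in discrete_models.py, repeated so the sketch is self-contained
    target = target / target.norm(dim=-1, keepdim=True)
    rec = rec / rec.norm(dim=-1, keepdim=True)
    return (1 - (target * rec).sum(-1)).mean()

target = torch.randn(4, 64, 1024)                     # assumed (batch, num_tokens, embed_dim)
print(cosine_loss(target, target))                    # ~0 up to floating-point error
print(cosine_loss(torch.randn_like(target), target))  # ~1 in expectation for random vectors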
src/models/qwen_visual.py ADDED
@@ -0,0 +1,501 @@
1
+ # Copyright (c) Alibaba Cloud.
2
+ #
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import OrderedDict
7
+ import math
8
+ import requests
9
+ from io import BytesIO
10
+ from functools import partial
11
+ from PIL import Image
12
+ from typing import Callable, Optional, Sequence, Tuple, List
13
+ import numpy as np
14
+
15
+ import torch
16
+ from torch import nn
17
+ from torch.nn import functional as F
18
+ from torch.nn.init import trunc_normal_
19
+ from torchvision import transforms
20
+ from torchvision.transforms import InterpolationMode
21
+
22
+
23
+ def get_abs_pos(abs_pos, tgt_size):
24
+ # abs_pos: L, C
25
+ # tgt_size: M
26
+ # return: M, C
27
+ src_size = int(math.sqrt(abs_pos.size(0)))
28
+ tgt_size = int(math.sqrt(tgt_size))
29
+ dtype = abs_pos.dtype
30
+
31
+ if src_size != tgt_size:
32
+ return F.interpolate(
33
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
34
+ size=(tgt_size, tgt_size),
35
+ mode="bicubic",
36
+ align_corners=False,
37
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
38
+ else:
39
+ return abs_pos
40
+
41
+
42
+ # https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
43
+
44
+
45
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
46
+ """
47
+ grid_size: int of the grid height and width
48
+ return:
49
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
50
+ """
51
+ grid_h = np.arange(grid_size, dtype=np.float32)
52
+ grid_w = np.arange(grid_size, dtype=np.float32)
53
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
54
+ grid = np.stack(grid, axis=0)
55
+
56
+ grid = grid.reshape([2, 1, grid_size, grid_size])
57
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
58
+ if cls_token:
59
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
60
+ return pos_embed
61
+
62
+
63
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
64
+ assert embed_dim % 2 == 0
65
+
66
+ # use half of dimensions to encode grid_h
67
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
68
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
69
+
70
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
71
+ return emb
72
+
73
+
74
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
75
+ """
76
+ embed_dim: output dimension for each position
77
+ pos: a list of positions to be encoded: size (M,)
78
+ out: (M, D)
79
+ """
80
+ assert embed_dim % 2 == 0
81
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
82
+ omega /= embed_dim / 2.
83
+ omega = 1. / 10000 ** omega # (D/2,)
84
+
85
+ pos = pos.reshape(-1) # (M,)
86
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
87
+
88
+ emb_sin = np.sin(out) # (M, D/2)
89
+ emb_cos = np.cos(out) # (M, D/2)
90
+
91
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
92
+ return emb
93
+
94
+
95
+ class Resampler(nn.Module):
96
+ """
97
+ A 2D perceiver-resampler network with one cross attention layers by
98
+ (grid_size**2) learnable queries and 2d sincos pos_emb
99
+ Outputs:
100
+ A tensor with the shape of (grid_size**2, embed_dim)
101
+ """
102
+
103
+ def __init__(self, grid_size, embed_dim, num_heads, kv_dim=None, norm_layer=nn.LayerNorm):
104
+ super().__init__()
105
+ self.num_queries = grid_size ** 2
106
+ self.embed_dim = embed_dim
107
+ self.num_heads = num_heads
108
+
109
+ self.pos_embed = nn.Parameter(torch.from_numpy(get_2d_sincos_pos_embed(embed_dim,
110
+ grid_size)).float()).requires_grad_(
111
+ False)
112
+
113
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
114
+ trunc_normal_(self.query, std=.02)
115
+
116
+ if kv_dim is not None and kv_dim != embed_dim:
117
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
118
+ self.out_dim = kv_dim
119
+ else:
120
+ self.kv_proj = nn.Identity()
121
+ self.out_dim = embed_dim
122
+
123
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads)
124
+ self.ln_q = norm_layer(embed_dim)
125
+ self.ln_kv = norm_layer(embed_dim)
126
+
127
+ self.apply(self._init_weights)
128
+
129
+ def _init_weights(self, m):
130
+ if isinstance(m, nn.Linear):
131
+ trunc_normal_(m.weight, std=.02)
132
+ if isinstance(m, nn.Linear) and m.bias is not None:
133
+ nn.init.constant_(m.bias, 0)
134
+ elif isinstance(m, nn.LayerNorm):
135
+ nn.init.constant_(m.bias, 0)
136
+ nn.init.constant_(m.weight, 1.0)
137
+
138
+ def forward(self, x, attn_mask=None):
139
+
140
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
141
+
142
+ x = self.kv_proj(x)
143
+ x = self.ln_kv(x).permute(1, 0, 2)
144
+
145
+ N = x.shape[1]
146
+ q = self.ln_q(self.query)
147
+ out = \
148
+ self.attn(self._repeat(q, N) + self.pos_embed.unsqueeze(1), x + pos_embed.unsqueeze(1), x, attn_mask=attn_mask)[
149
+ 0]
150
+ return out.permute(1, 0, 2)
151
+
152
+ def _repeat(self, query, N: int):
153
+ return query.unsqueeze(1).repeat(1, N, 1)
154
+
155
+
156
+ class VisualAttention(nn.Module):
157
+ """self-attention layer class.
158
+
159
+ Self-attention layer takes input with size [s, b, h]
160
+ and returns output of the same size.
161
+ """
162
+
163
+ def __init__(self, embed_dim, num_heads, bias=True, kdim=None, vdim=None):
164
+ super(VisualAttention, self).__init__()
165
+ self.embed_dim = embed_dim
166
+ self.kdim = kdim if kdim is not None else embed_dim
167
+ self.vdim = vdim if vdim is not None else embed_dim
168
+ self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
169
+
170
+ self.num_heads = num_heads
171
+
172
+ # Per attention head and per partition values.
173
+ assert embed_dim % num_heads == 0
174
+ self.hidden_size_per_attention_head = embed_dim // num_heads
175
+ self.num_attention_heads_per_partition = num_heads
176
+ self.hidden_size_per_partition = embed_dim
177
+
178
+ # Strided linear layer.
179
+ assert self._qkv_same_embed_dim, 'Only Support SelfAttention Currently'
180
+ self.in_proj = nn.Linear(embed_dim, 3 * embed_dim)
181
+ self.out_proj = nn.Linear(embed_dim, embed_dim)
182
+ self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
183
+
184
+ def forward(self, query, key, value, attn_mask=None):
185
+ # query/key/value: [sq, b, h]
186
+ sq, b, _ = query.size()
187
+
188
+ assert query is key, 'Only Support Self-Attention Currently'
189
+ sk = sq
190
+ mixed_x_layer = self.in_proj(query)
191
+
192
+ # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn]
193
+ new_tensor_shape = mixed_x_layer.size()[:-1] + \
194
+ (self.num_attention_heads_per_partition,
195
+ 3 * self.hidden_size_per_attention_head)
196
+ mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
197
+
198
+ # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
199
+ query_layer, key_layer, value_layer = mixed_x_layer.split(self.hidden_size_per_attention_head, dim=-1)
200
+
201
+ # [sq, b, np, hn] -> [sq, b * np, hn]
202
+ query_layer = query_layer.view(sq, b * self.num_attention_heads_per_partition,
203
+ self.hidden_size_per_attention_head).transpose(0, 1)
204
+ # [sk, b, np, hn] -> [sk, b * np, hn]
205
+ key_layer = key_layer.view(sk, b * self.num_attention_heads_per_partition,
206
+ self.hidden_size_per_attention_head).transpose(0, 1)
207
+
208
+ q_scaled = query_layer / self.norm_factor
209
+ if attn_mask is not None:
210
+ attention_probs = torch.baddbmm(attn_mask, q_scaled, key_layer.transpose(-2, -1))
211
+ else:
212
+ attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1))
213
+ attention_probs = attention_probs.softmax(dim=-1)
214
+
215
+ value_layer = value_layer.view(sk, b * self.num_attention_heads_per_partition,
216
+ self.hidden_size_per_attention_head).transpose(0, 1)
217
+
218
+ # matmul: [b * np, sq, hn]
219
+ context_layer = torch.bmm(attention_probs, value_layer)
220
+
221
+ # change view [b, np, sq, hn]
222
+ context_layer = context_layer.view(b, self.num_attention_heads_per_partition, sq,
223
+ self.hidden_size_per_attention_head)
224
+
225
+ # [b, np, sq, hn] --> [sq, b, np, hn]
226
+ context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
227
+
228
+ # [sq, b, np, hn] --> [sq, b, hp]
229
+ new_context_layer_shape = context_layer.size()[:-2] + \
230
+ (self.hidden_size_per_partition,)
231
+ context_layer = context_layer.view(*new_context_layer_shape)
232
+
233
+ output = self.out_proj(context_layer)
234
+
235
+ return output
236
+
237
+
238
+ class VisualAttentionBlock(nn.Module):
239
+
240
+ def __init__(
241
+ self,
242
+ d_model: int,
243
+ n_head: int,
244
+ mlp_ratio: float = 4.0,
245
+ act_layer: Callable = nn.GELU,
246
+ norm_layer: Callable = nn.LayerNorm,
247
+ is_cross_attention: bool = False,
248
+ ):
249
+ super().__init__()
250
+
251
+ self.ln_1 = norm_layer(d_model)
252
+ if is_cross_attention:
253
+ self.ln_1_kv = norm_layer(d_model)
254
+
255
+ self.ln_2 = norm_layer(d_model)
256
+ mlp_width = int(d_model * mlp_ratio)
257
+ self.attn = VisualAttention(d_model, n_head)
258
+ self.mlp = nn.Sequential(
259
+ OrderedDict([("c_fc", nn.Linear(d_model, mlp_width)), ("gelu", act_layer()),
260
+ ("c_proj", nn.Linear(mlp_width, d_model))]))
261
+
262
+ def attention(
263
+ self,
264
+ q_x: torch.Tensor,
265
+ k_x: Optional[torch.Tensor] = None,
266
+ v_x: Optional[torch.Tensor] = None,
267
+ attn_mask: Optional[torch.Tensor] = None,
268
+ ):
269
+ k_x = k_x if k_x is not None else q_x
270
+ v_x = v_x if v_x is not None else q_x
271
+
272
+ attn_mask = attn_mask.to(q_x.dtype) if attn_mask is not None else None
273
+ return self.attn(q_x, k_x, v_x, attn_mask=attn_mask)
274
+
275
+ def forward(
276
+ self,
277
+ q_x: torch.Tensor,
278
+ k_x: Optional[torch.Tensor] = None,
279
+ v_x: Optional[torch.Tensor] = None,
280
+ attn_mask: Optional[torch.Tensor] = None,
281
+ ):
282
+ k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
283
+ v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
284
+
285
+ x = q_x + self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
286
+ x = x + self.mlp(self.ln_2(x))
287
+ return x
288
+
289
+
290
+ class TransformerBlock(nn.Module):
291
+
292
+ def __init__(
293
+ self,
294
+ width: int,
295
+ layers: int,
296
+ heads: int,
297
+ mlp_ratio: float = 4.0,
298
+ act_layer: Callable = nn.GELU,
299
+ norm_layer: Callable = nn.LayerNorm,
300
+ ):
301
+ super().__init__()
302
+ self.width = width
303
+ self.layers = layers
304
+
305
+ self.resblocks = nn.ModuleList(
306
+ [VisualAttentionBlock(width, heads, mlp_ratio, act_layer=act_layer, norm_layer=norm_layer) for _ in
307
+ range(layers)])
308
+
309
+ def get_cast_dtype(self) -> torch.dtype:
310
+ return self.resblocks[0].mlp.c_fc.weight.dtype
311
+
312
+ def get_cast_device(self) -> torch.device:
313
+ return self.resblocks[0].mlp.c_fc.weight.device
314
+
315
+ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
316
+ for r in self.resblocks:
317
+ x = r(x, attn_mask=attn_mask)
318
+ return x
319
+
320
+
321
+ class VisionTransformerWithAttnPool(nn.Module):
322
+
323
+ def __init__(self,
324
+ image_size: int,
325
+ patch_size: int,
326
+ width: int,
327
+ layers: int,
328
+ heads: int,
329
+ mlp_ratio: float,
330
+ n_queries: int = 256,
331
+ output_dim: int = 512,
332
+ **kwargs):
333
+ super().__init__()
334
+ image_height, image_width = self.image_size = (image_size, image_size)
335
+ patch_height, patch_width = self.patch_size = (patch_size, patch_size)
336
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
337
+ self.output_dim = output_dim
338
+
339
+ mean = (0.48145466, 0.4578275, 0.40821073)
340
+ std = (0.26862954, 0.26130258, 0.27577711)
341
+ self.image_transform = transforms.Compose([
342
+ transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
343
+ transforms.ToTensor(),
344
+ transforms.Normalize(mean=mean, std=std),
345
+ ])
346
+
347
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
348
+
349
+ # class embeddings and positional embeddings
350
+ scale = width ** -0.5
351
+ self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
352
+
353
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
354
+ act_layer = nn.GELU
355
+
356
+ self.ln_pre = norm_layer(width)
357
+ self.transformer = TransformerBlock(
358
+ width,
359
+ layers,
360
+ heads,
361
+ mlp_ratio,
362
+ act_layer=act_layer,
363
+ norm_layer=norm_layer,
364
+ )
365
+
366
+ self.attn_pool = Resampler(
367
+ grid_size=int(math.sqrt(n_queries)),
368
+ embed_dim=output_dim,
369
+ num_heads=output_dim // 128,
370
+ kv_dim=width,
371
+ norm_layer=norm_layer,
372
+ )
373
+ self.ln_post = norm_layer(output_dim)
374
+ self.proj = nn.Parameter((output_dim ** -0.5) * torch.randn(output_dim, output_dim))
375
+
376
+ def forward(self, x: torch.Tensor):
377
+ x = x.to(
378
+ dtype=self.transformer.get_cast_dtype(),
379
+ device=self.transformer.get_cast_device(),
380
+ )
381
+ # to patches
382
+ x = self.conv1(x) # shape = [*, width, grid, grid]
383
+ # shape = [*, width, grid ** 2]
384
+ x = x.reshape(x.shape[0], x.shape[1], -1)
385
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
386
+
387
+ x = x + get_abs_pos(self.positional_embedding, x.size(1))
388
+
389
+ x = self.ln_pre(x)
390
+
391
+ x = x.permute(1, 0, 2) # NLD -> LND
392
+ x = self.transformer(x)
393
+ x = x.permute(1, 0, 2) # LND -> NLD
394
+
395
+ x = self.attn_pool(x)
396
+ x = self.ln_post(x)
397
+ x = x @ self.proj
398
+
399
+ return x
400
+
401
+ def encode(self, image_paths: List[str]):
402
+ images = []
403
+ for image_path in image_paths:
404
+ if image_path.startswith("http://") or image_path.startswith("https://"):
405
+ image = Image.open(requests.get(image_path, stream=True).raw)
406
+ else:
407
+ image = Image.open(image_path)
408
+ image = image.convert("RGB")
409
+ images.append(self.image_transform(image))
410
+ images = torch.stack(images, dim=0)
411
+ return self(images)
412
+
413
+ @classmethod
414
+ def from_pretrained(cls, pretrained_model_path=None, **kwargs):
415
+ model = cls(**kwargs)
416
+ if pretrained_model_path is not None:
417
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
418
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
419
+ print('Load ckpt of qwen visual encoder')
420
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
421
+
422
+ return model
423
+
424
+
425
+ class VisionTransformer(nn.Module):
426
+
427
+ def __init__(self,
428
+ image_size: int,
429
+ patch_size: int,
430
+ width: int,
431
+ layers: int,
432
+ heads: int,
433
+ mlp_ratio: float,
434
+ n_queries: int = 256,
435
+ output_dim: int = 512,
436
+ **kwargs):
437
+ super().__init__()
438
+ image_height, image_width = self.image_size = (image_size, image_size)
439
+ patch_height, patch_width = self.patch_size = (patch_size, patch_size)
440
+ self.grid_size = (image_height // patch_height, image_width // patch_width)
441
+ self.output_dim = output_dim
442
+
443
+ mean = (0.48145466, 0.4578275, 0.40821073)
444
+ std = (0.26862954, 0.26130258, 0.27577711)
445
+ self.image_transform = transforms.Compose([
446
+ transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
447
+ transforms.ToTensor(),
448
+ transforms.Normalize(mean=mean, std=std),
449
+ ])
450
+
451
+ self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
452
+
453
+ # class embeddings and positional embeddings
454
+ scale = width ** -0.5
455
+ self.positional_embedding = nn.Parameter(scale * torch.randn(256, width))
456
+
457
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
458
+ act_layer = nn.GELU
459
+
460
+ self.ln_pre = norm_layer(width)
461
+ self.transformer = TransformerBlock(
462
+ width,
463
+ layers,
464
+ heads,
465
+ mlp_ratio,
466
+ act_layer=act_layer,
467
+ norm_layer=norm_layer,
468
+ )
469
+
470
+ def forward(self, x: torch.Tensor):
471
+ x = x.to(
472
+ dtype=self.transformer.get_cast_dtype(),
473
+ device=self.transformer.get_cast_device(),
474
+ )
475
+ # to patches
476
+ x = self.conv1(x) # shape = [*, width, grid, grid]
477
+ # shape = [*, width, grid ** 2]
478
+ x = x.reshape(x.shape[0], x.shape[1], -1)
479
+ x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
480
+
481
+ x = x + get_abs_pos(self.positional_embedding, x.size(1))
482
+
483
+ x = self.ln_pre(x)
484
+
485
+ x = x.permute(1, 0, 2) # NLD -> LND
486
+ x = self.transformer(x)
487
+ x = x.permute(1, 0, 2) # LND -> NLD
488
+
489
+ return x
490
+
491
+ def encode(self, image_paths: List[str]):
492
+ images = []
493
+ for image_path in image_paths:
494
+ if image_path.startswith("http://") or image_path.startswith("https://"):
495
+ image = Image.open(requests.get(image_path, stream=True).raw)
496
+ else:
497
+ image = Image.open(image_path)
498
+ image = image.convert("RGB")
499
+ images.append(self.image_transform(image))
500
+ images = torch.stack(images, dim=0)
501
+ return self(images)
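The position-embedding helpers at the top of this file carry most of the resolution handling: the 2D sincos table has one row per grid cell, and get_abs_pos bicubically resizes it when the token grid changes. A small sanity sketch (the grid sizes here are illustrative only):

import torch
from src.models.qwen_visual import get_2d_sincos_pos_embed, get_abs_pos

pos = torch.from_numpy(get_2d_sincos_pos_embed(embed_dim=64, grid_size=16)).float()
print(pos.shape)                           # torch.Size([256, 64]) -> one row per 16x16 grid cell
resized = get_abs_pos(pos, tgt_size=1024)  # e.g. resampling to a 32x32 grid
print(resized.shape)                       # torch.Size([1024, 64])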
src/models_clm/__init__.py ADDED
File without changes
src/models_clm/generation.py ADDED
@@ -0,0 +1,31 @@
1
+ import torch
2
+ from transformers import LogitsProcessor
3
+
4
+ BOI_TOKEN = '<img>'
5
+ EOI_TOKEN = '</img>'
6
+ IMG_TOKEN = '<img_{:05d}>'
7
+
8
+
9
+ class AutoImageTokenGenerationProcessor(LogitsProcessor):
10
+
11
+ def __init__(self, tokenizer, num_img_gen_tokens=64) -> None:
12
+ super().__init__()
13
+ # self.boi_token_id = tokenizer.encode(BOI_TOKEN)[0]
14
+ # self.eoi_token_id = tokenizer.encode(EOI_TOKEN)[0]
15
+ img_all_token_str = ''.join([BOI_TOKEN] + [IMG_TOKEN.format(int(item))
16
+ for item in range(num_img_gen_tokens)] + [EOI_TOKEN])
17
+ self.img_ids_list = tokenizer.encode(img_all_token_str, add_special_tokens=False)
18
+
19
+ def __call__(self, input_ids, scores):
20
+ bz = input_ids.shape[0]
21
+ for i in range(bz):
22
+ cur_input_id = input_ids[i, -1].item()
23
+ if cur_input_id in self.img_ids_list[:-1]:
24
+
25
+ output_id = self.img_ids_list[self.img_ids_list.index(cur_input_id) + 1]
26
+ scores[i, ..., output_id] = scores[i, ...].max() + 10.
27
+ else:
28
+
29
+ scores[i, ..., torch.tensor(self.img_ids_list[1:]).to(dtype=torch.long)] = 0.0
30
+
31
+ return scores
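This processor is meant to be passed to a Hugging Face generate call through a LogitsProcessorList: once the model emits <img>, each subsequent image token's logit is boosted so the full <img_00000> ... </img> block comes out in order. A minimal wiring sketch (tokenizer, llm and input_ids stand in for objects built elsewhere in this repo):

from transformers import LogitsProcessorList
from src.models_clm.generation import AutoImageTokenGenerationProcessor

logits_processor = LogitsProcessorList(
    [AutoImageTokenGenerationProcessor(tokenizer=tokenizer, num_img_gen_tokens=64)]
)
output_ids = llm.generate(
    input_ids=input_ids,
    logits_processor=logits_processor,
    max_new_tokens=500,
)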
src/models_clm/modeling_llama_4_35.py ADDED
@@ -0,0 +1,1236 @@
1
+ # flake8: noqa
2
+ # coding=utf-8
3
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """ PyTorch LLaMA model."""
22
+ import math
23
+ import warnings
24
+ from typing import List, Optional, Tuple, Union
25
+
26
+ import torch
27
+ import torch.nn.functional as F
28
+ import torch.utils.checkpoint
29
+ from torch import nn
30
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
31
+
32
+ from transformers.activations import ACT2FN
33
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
34
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, \
35
+ SequenceClassifierOutputWithPast
36
+ from transformers.modeling_utils import PreTrainedModel
37
+ from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
38
+ from transformers.utils import (
39
+ add_start_docstrings,
40
+ add_start_docstrings_to_model_forward,
41
+ is_flash_attn_2_available,
42
+ logging,
43
+ replace_return_docstrings,
44
+ )
45
+ from transformers.utils.import_utils import is_torch_fx_available
46
+ from transformers.models.llama.configuration_llama import LlamaConfig
47
+
48
+ if is_flash_attn_2_available():
49
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
50
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
51
+
52
+ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
53
+ # It means that the function will not be traced through and simply appear as a node in the graph.
54
+ if is_torch_fx_available():
55
+ _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CONFIG_FOR_DOC = "LlamaConfig"
60
+
61
+
62
+ def _get_unpad_data(attention_mask):
63
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
64
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
65
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
66
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
67
+ return (
68
+ indices,
69
+ cu_seqlens,
70
+ max_seqlen_in_batch,
71
+ )
72
+
73
+
74
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
75
+ warnings.warn(
76
+ "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask" # yapf: disable # noqa
77
+
78
+ )
79
+ return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
80
+
81
+
82
+ def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device,
83
+ past_key_values_length: int = 0):
84
+ warnings.warn(
85
+ "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask" # yapf: disable # noqa
86
+
87
+ )
88
+ return AttentionMaskConverter._make_causal_mask(input_ids_shape=input_ids_shape,
89
+ dtype=dtype,
90
+ device=device,
91
+ past_key_values_length=past_key_values_length)
92
+
93
+
94
+ class LlamaRMSNorm(nn.Module):
95
+
96
+ def __init__(self, hidden_size, eps=1e-6):
97
+ """
98
+ LlamaRMSNorm is equivalent to T5LayerNorm
99
+ """
100
+ super().__init__()
101
+ self.weight = nn.Parameter(torch.ones(hidden_size))
102
+ self.variance_epsilon = eps
103
+
104
+ def forward(self, hidden_states):
105
+ input_dtype = hidden_states.dtype
106
+ hidden_states = hidden_states.to(torch.float32)
107
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
108
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
109
+ return self.weight * hidden_states.to(input_dtype)
110
+
111
+
112
+ ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm)
113
+
114
+
115
+ class LlamaRotaryEmbedding(nn.Module):
116
+
117
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
118
+ super().__init__()
119
+
120
+ self.dim = dim
121
+ self.max_position_embeddings = max_position_embeddings
122
+ self.base = base
123
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
124
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
125
+
126
+ # Build here to make `torch.jit.trace` work.
127
+ self._set_cos_sin_cache(seq_len=max_position_embeddings, device=self.inv_freq.device,
128
+ dtype=torch.get_default_dtype())
129
+
130
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
131
+ self.max_seq_len_cached = seq_len
132
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
133
+
134
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
135
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
136
+ emb = torch.cat((freqs, freqs), dim=-1)
137
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
138
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
139
+
140
+ def forward(self, x, seq_len=None):
141
+ # x: [bs, num_attention_heads, seq_len, head_size]
142
+ if seq_len > self.max_seq_len_cached:
143
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
144
+
145
+ return (
146
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
147
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
148
+ )
149
+
150
+
151
+ class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding):
152
+ """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
153
+
154
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
155
+ self.scaling_factor = scaling_factor
156
+ super().__init__(dim, max_position_embeddings, base, device)
157
+
158
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
159
+ self.max_seq_len_cached = seq_len
160
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
161
+ t = t / self.scaling_factor
162
+
163
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
164
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
165
+ emb = torch.cat((freqs, freqs), dim=-1)
166
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
167
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
168
+
169
+
170
+ class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding):
171
+ """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
172
+
173
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
174
+ self.scaling_factor = scaling_factor
175
+ super().__init__(dim, max_position_embeddings, base, device)
176
+
177
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
178
+ self.max_seq_len_cached = seq_len
179
+
180
+ if seq_len > self.max_position_embeddings:
181
+ base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
182
+ (self.scaling_factor - 1)) ** (self.dim / (self.dim - 2))
183
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
184
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
185
+
186
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
187
+
188
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
189
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
190
+ emb = torch.cat((freqs, freqs), dim=-1)
191
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
192
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
193
+
194
+
195
+ def rotate_half(x):
196
+ """Rotates half the hidden dims of the input."""
197
+ x1 = x[..., :x.shape[-1] // 2]
198
+ x2 = x[..., x.shape[-1] // 2:]
199
+ return torch.cat((-x2, x1), dim=-1)
200
+
201
+
202
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
203
+ """Applies Rotary Position Embedding to the query and key tensors.
204
+
205
+ Args:
206
+ q (`torch.Tensor`): The query tensor.
207
+ k (`torch.Tensor`): The key tensor.
208
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
209
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
210
+ position_ids (`torch.Tensor`):
211
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
212
+ used to pass offsetted position ids when working with a KV-cache.
213
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
214
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
215
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
216
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
217
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
218
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
219
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
220
+ Returns:
221
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
222
+ """
223
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
224
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
225
+ q_embed = (q * cos) + (rotate_half(q) * sin)
226
+ k_embed = (k * cos) + (rotate_half(k) * sin)
227
+ return q_embed, k_embed
228
+
229
+
230
+ class LlamaMLP(nn.Module):
231
+
232
+ def __init__(self, config):
233
+ super().__init__()
234
+ self.config = config
235
+ self.hidden_size = config.hidden_size
236
+ self.intermediate_size = config.intermediate_size
237
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
238
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
239
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
240
+ self.act_fn = ACT2FN[config.hidden_act]
241
+
242
+ def forward(self, x):
243
+ if self.config.pretraining_tp > 1:
244
+ slice = self.intermediate_size // self.config.pretraining_tp
245
+ gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
246
+ up_proj_slices = self.up_proj.weight.split(slice, dim=0)
247
+ down_proj_slices = self.down_proj.weight.split(slice, dim=1)
248
+
249
+ gate_proj = torch.cat([F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
250
+ up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
251
+
252
+ intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
253
+ down_proj = [F.linear(intermediate_states[i], down_proj_slices[i]) for i in
254
+ range(self.config.pretraining_tp)]
255
+ down_proj = sum(down_proj)
256
+ else:
257
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
258
+
259
+ return down_proj
260
+
261
+
262
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
263
+ """
264
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
265
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
266
+ """
267
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
268
+ if n_rep == 1:
269
+ return hidden_states
270
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
271
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
272
+
273
+
274
+ class LlamaAttention(nn.Module):
275
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
276
+
277
+ def __init__(self, config: LlamaConfig):
278
+ super().__init__()
279
+ self.config = config
280
+ self.hidden_size = config.hidden_size
281
+ self.num_heads = config.num_attention_heads
282
+ self.head_dim = self.hidden_size // self.num_heads
283
+ self.num_key_value_heads = config.num_key_value_heads
284
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
285
+ self.max_position_embeddings = config.max_position_embeddings
286
+ self.rope_theta = config.rope_theta
287
+ self.is_causal = True
288
+
289
+ if (self.head_dim * self.num_heads) != self.hidden_size:
290
+ raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
291
+ f" and `num_heads`: {self.num_heads}).")
292
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
293
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
294
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
295
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
296
+ self._init_rope()
297
+
298
+ def _init_rope(self):
299
+ if self.config.rope_scaling is None:
300
+ self.rotary_emb = LlamaRotaryEmbedding(
301
+ self.head_dim,
302
+ max_position_embeddings=self.max_position_embeddings,
303
+ base=self.rope_theta,
304
+ )
305
+ else:
306
+ scaling_type = self.config.rope_scaling["type"]
307
+ scaling_factor = self.config.rope_scaling["factor"]
308
+ if scaling_type == "linear":
309
+ self.rotary_emb = LlamaLinearScalingRotaryEmbedding(
310
+ self.head_dim,
311
+ max_position_embeddings=self.max_position_embeddings,
312
+ scaling_factor=scaling_factor,
313
+ base=self.rope_theta,
314
+ )
315
+ elif scaling_type == "dynamic":
316
+ self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding(
317
+ self.head_dim,
318
+ max_position_embeddings=self.max_position_embeddings,
319
+ scaling_factor=scaling_factor,
320
+ base=self.rope_theta,
321
+ )
322
+ else:
323
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
324
+
325
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
326
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
327
+
328
+ def forward(
329
+ self,
330
+ hidden_states: torch.Tensor,
331
+ attention_mask: Optional[torch.Tensor] = None,
332
+ position_ids: Optional[torch.LongTensor] = None,
333
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
334
+ output_attentions: bool = False,
335
+ use_cache: bool = False,
336
+ **kwargs,
337
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
338
+ if "padding_mask" in kwargs:
339
+ warnings.warn(
340
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
341
+ )
342
+
343
+ bsz, q_len, _ = hidden_states.size()
344
+
345
+ if self.config.pretraining_tp > 1:
346
+ key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
347
+ query_slices = self.q_proj.weight.split((self.num_heads * self.head_dim) // self.config.pretraining_tp,
348
+ dim=0)
349
+ key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
350
+ value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
351
+
352
+ query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
353
+ query_states = torch.cat(query_states, dim=-1)
354
+
355
+ key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
356
+ key_states = torch.cat(key_states, dim=-1)
357
+
358
+ value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
359
+ value_states = torch.cat(value_states, dim=-1)
360
+
361
+ else:
362
+ query_states = self.q_proj(hidden_states)
363
+ key_states = self.k_proj(hidden_states)
364
+ value_states = self.v_proj(hidden_states)
365
+
366
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
367
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
368
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
369
+
370
+ kv_seq_len = key_states.shape[-2]
371
+ if past_key_value is not None:
372
+ kv_seq_len += past_key_value[0].shape[-2]
373
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
374
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
375
+
376
+ if past_key_value is not None:
377
+ # reuse k, v, self_attention
378
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
379
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
380
+
381
+ past_key_value = (key_states, value_states) if use_cache else None
382
+
383
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
384
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
385
+
386
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
387
+
388
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
389
+ raise ValueError(f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
390
+ f" {attn_weights.size()}")
391
+
392
+ if attention_mask is not None:
393
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
394
+ raise ValueError(
395
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}")
396
+ attn_weights = attn_weights + attention_mask
397
+
398
+ # upcast attention to fp32
399
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
400
+ attn_output = torch.matmul(attn_weights, value_states)
401
+
402
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
403
+ raise ValueError(f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
404
+ f" {attn_output.size()}")
405
+
406
+ attn_output = attn_output.transpose(1, 2).contiguous()
407
+
408
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
409
+
410
+ if self.config.pretraining_tp > 1:
411
+ attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
412
+ o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
413
+ attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
414
+ else:
415
+ attn_output = self.o_proj(attn_output)
416
+
417
+ if not output_attentions:
418
+ attn_weights = None
419
+
420
+ return attn_output, attn_weights, past_key_value
421
+
422
+
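Since this eager path is what the optimized branches below must reproduce, the core computation is worth spelling out: project to Q/K/V, expand the key/value heads with `repeat_kv` so grouped-query attention lines up with the query heads, then take softmax(QK^T / sqrt(head_dim)) · V. A minimal torch-only sketch with toy shapes (no rotary embedding, no KV cache; all dimensions are made up for illustration):

```python
import math
import torch

# Toy dimensions (assumptions for illustration only).
bsz, q_len, num_heads, num_kv_heads, head_dim = 2, 5, 8, 2, 16
groups = num_heads // num_kv_heads  # each KV head serves 4 query heads

q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_kv_heads, q_len, head_dim)
v = torch.randn(bsz, num_kv_heads, q_len, head_dim)

# Equivalent of repeat_kv: expand KV heads so they line up with the query heads.
k = k[:, :, None, :, :].expand(bsz, num_kv_heads, groups, q_len, head_dim).reshape(bsz, num_heads, q_len, head_dim)
v = v[:, :, None, :, :].expand(bsz, num_kv_heads, groups, q_len, head_dim).reshape(bsz, num_heads, q_len, head_dim)

# Scaled dot-product attention with a causal mask, upcast to fp32 for the softmax.
scores = q @ k.transpose(2, 3) / math.sqrt(head_dim)
causal = torch.full((q_len, q_len), float("-inf")).triu(1)
weights = torch.softmax((scores + causal).float(), dim=-1).to(q.dtype)
out = weights @ v
print(out.shape)  # torch.Size([2, 8, 5, 16])
```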
423
+ class LlamaFlashAttention2(LlamaAttention):
424
+ """
425
+ Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stay
426
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
427
+ flash attention and deal with padding tokens in case the input contains any of them.
428
+ """
429
+
430
+ def forward(
431
+ self,
432
+ hidden_states: torch.Tensor,
433
+ attention_mask: Optional[torch.LongTensor] = None,
434
+ position_ids: Optional[torch.LongTensor] = None,
435
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
436
+ output_attentions: bool = False,
437
+ use_cache: bool = False,
438
+ **kwargs,
439
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
440
+ # LlamaFlashAttention2 attention does not support output_attentions
441
+ if "padding_mask" in kwargs:
442
+ warnings.warn(
443
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
444
+ )
445
+
446
+ # overwrite attention_mask with padding_mask
447
+ attention_mask = kwargs.pop("padding_mask")
448
+
449
+ output_attentions = False
450
+
451
+ bsz, q_len, _ = hidden_states.size()
452
+
453
+ query_states = self.q_proj(hidden_states)
454
+ key_states = self.k_proj(hidden_states)
455
+ value_states = self.v_proj(hidden_states)
456
+
457
+ # Flash attention requires the input to have the shape
458
+ # batch_size x seq_length x head_dim x hidden_dim
459
+ # therefore we just need to keep the original shape
460
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
461
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
462
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
463
+
464
+ kv_seq_len = key_states.shape[-2]
465
+ if past_key_value is not None:
466
+ kv_seq_len += past_key_value[0].shape[-2]
467
+
468
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
469
+
470
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
471
+
472
+ if past_key_value is not None:
473
+ # reuse k, v, self_attention
474
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
475
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
476
+
477
+ past_key_value = (key_states, value_states) if use_cache else None
478
+
479
+ query_states = query_states.transpose(1, 2)
480
+ key_states = key_states.transpose(1, 2)
481
+ value_states = value_states.transpose(1, 2)
482
+
483
+ # TODO: llama does not have dropout in the config??
484
+ # It is recommended to use dropout with FA according to the docs
485
+ # when training.
486
+ dropout_rate = 0.0 # if not self.training else self.attn_dropout
487
+
488
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
489
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
490
+ # cast them back in the correct dtype just to be sure everything works as expected.
491
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
492
+ # in fp32. (LlamaRMSNorm handles it correctly)
493
+
494
+ input_dtype = query_states.dtype
495
+ if input_dtype == torch.float32:
496
+ # Handle the case where the model is quantized
497
+ if hasattr(self.config, "_pre_quantization_dtype"):
498
+ target_dtype = self.config._pre_quantization_dtype
499
+ else:
500
+ target_dtype = self.q_proj.weight.dtype
501
+
502
+ logger.warning_once(
503
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
504
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
505
+ f" {target_dtype}.")
506
+
507
+ query_states = query_states.to(target_dtype)
508
+ key_states = key_states.to(target_dtype)
509
+ value_states = value_states.to(target_dtype)
510
+
511
+ attn_output = self._flash_attention_forward(query_states,
512
+ key_states,
513
+ value_states,
514
+ attention_mask,
515
+ q_len,
516
+ dropout=dropout_rate)
517
+
518
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
519
+ attn_output = self.o_proj(attn_output)
520
+
521
+ if not output_attentions:
522
+ attn_weights = None
523
+
524
+ return attn_output, attn_weights, past_key_value
525
+
526
+ def _flash_attention_forward(self,
527
+ query_states,
528
+ key_states,
529
+ value_states,
530
+ attention_mask,
531
+ query_length,
532
+ dropout=0.0,
533
+ softmax_scale=None):
534
+ """
535
+ Calls the forward method of Flash Attention. If the input hidden states contain at least one padding token,
537
+ the input is first unpadded, then the attention scores are computed and the output is re-padded afterwards.
537
+
538
+ Args:
539
+ query_states (`torch.Tensor`):
540
+ Input query states to be passed to Flash Attention API
541
+ key_states (`torch.Tensor`):
542
+ Input key states to be passed to Flash Attention API
543
+ value_states (`torch.Tensor`):
544
+ Input value states to be passed to Flash Attention API
545
+ attention_mask (`torch.Tensor`):
546
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
547
+ position of padding tokens and 1 for the position of non-padding tokens.
548
+ dropout (`float`, *optional*):
549
+ Attention dropout
550
+ softmax_scale (`float`, *optional*):
551
+ The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
552
+ """
553
+ # Contains at least one padding token in the sequence
554
+ if attention_mask is not None:
555
+ batch_size = query_states.shape[0]
556
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
557
+ query_states, key_states, value_states, attention_mask, query_length)
558
+
559
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
560
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
561
+
562
+ attn_output_unpad = flash_attn_varlen_func(
563
+ query_states,
564
+ key_states,
565
+ value_states,
566
+ cu_seqlens_q=cu_seqlens_q,
567
+ cu_seqlens_k=cu_seqlens_k,
568
+ max_seqlen_q=max_seqlen_in_batch_q,
569
+ max_seqlen_k=max_seqlen_in_batch_k,
570
+ dropout_p=dropout,
571
+ softmax_scale=softmax_scale,
572
+ causal=self.is_causal,
573
+ )
574
+
575
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
576
+ else:
577
+ attn_output = flash_attn_func(query_states,
578
+ key_states,
579
+ value_states,
580
+ dropout,
581
+ softmax_scale=softmax_scale,
582
+ causal=self.is_causal)
583
+
584
+ return attn_output
585
+
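The varlen path above relies on `_get_unpad_data` (referenced in `_upad_input` below) to convert a 2-D padding mask into flattened token indices and cumulative sequence lengths for `flash_attn_varlen_func`. A rough torch-only sketch of what those quantities look like, shown purely for illustration:

```python
import torch
import torch.nn.functional as F

# 1 = real token, 0 = padding (left padding in the second row) -- toy example.
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [0, 0, 1, 1]], dtype=torch.int32)

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # tensor([4, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = int(seqlens_in_batch.max())
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))

print(indices)              # positions of non-padding tokens in the flattened batch
print(cu_seqlens)           # tensor([0, 4, 6]) -- sequence boundaries for the varlen kernel
print(max_seqlen_in_batch)  # 4
```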
586
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
587
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
588
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
589
+
590
+ key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
591
+ indices_k)
592
+ value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim),
593
+ indices_k)
594
+ if query_length == kv_seq_len:
595
+ query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim),
596
+ indices_k)
597
+ cu_seqlens_q = cu_seqlens_k
598
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
599
+ indices_q = indices_k
600
+ elif query_length == 1:
601
+ max_seqlen_in_batch_q = 1
602
+ cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32,
603
+ device=query_layer.device) # There is a memcpy here, that is very bad.
604
+ indices_q = cu_seqlens_q[:-1]
605
+ query_layer = query_layer.squeeze(1)
606
+ else:
607
+ # The -q_len: slice assumes left padding.
608
+ attention_mask = attention_mask[:, -query_length:]
609
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
610
+
611
+ return (
612
+ query_layer,
613
+ key_layer,
614
+ value_layer,
615
+ indices_q,
616
+ (cu_seqlens_q, cu_seqlens_k),
617
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
618
+ )
619
+
620
+
621
+ class LlamaDecoderLayer(nn.Module):
622
+
623
+ def __init__(self, config: LlamaConfig):
624
+ super().__init__()
625
+ self.hidden_size = config.hidden_size
626
+ self.self_attn = (LlamaAttention(
627
+ config=config) if not getattr(config, "_flash_attn_2_enabled", False) else LlamaFlashAttention2(
628
+ config=config))
629
+ self.mlp = LlamaMLP(config)
630
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
631
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
632
+
633
+ def forward(
634
+ self,
635
+ hidden_states: torch.Tensor,
636
+ attention_mask: Optional[torch.Tensor] = None,
637
+ position_ids: Optional[torch.LongTensor] = None,
638
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
639
+ output_attentions: Optional[bool] = False,
640
+ use_cache: Optional[bool] = False,
641
+ **kwargs,
642
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
643
+ """
644
+ Args:
645
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
646
+ attention_mask (`torch.FloatTensor`, *optional*):
647
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
648
+ query_sequence_length, key_sequence_length)` if default attention is used.
649
+ output_attentions (`bool`, *optional*):
650
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
651
+ returned tensors for more detail.
652
+ use_cache (`bool`, *optional*):
653
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
654
+ (see `past_key_values`).
655
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
656
+ """
657
+ if "padding_mask" in kwargs:
658
+ warnings.warn(
659
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
660
+ )
661
+
662
+ residual = hidden_states
663
+
664
+ hidden_states = self.input_layernorm(hidden_states)
665
+
666
+ # Self Attention
667
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
668
+ hidden_states=hidden_states,
669
+ attention_mask=attention_mask,
670
+ position_ids=position_ids,
671
+ past_key_value=past_key_value,
672
+ output_attentions=output_attentions,
673
+ use_cache=use_cache,
674
+ **kwargs,
675
+ )
676
+ hidden_states = residual + hidden_states
677
+
678
+ # Fully Connected
679
+ residual = hidden_states
680
+ hidden_states = self.post_attention_layernorm(hidden_states)
681
+ hidden_states = self.mlp(hidden_states)
682
+ hidden_states = residual + hidden_states
683
+
684
+ outputs = (hidden_states,)
685
+
686
+ if output_attentions:
687
+ outputs += (self_attn_weights,)
688
+
689
+ if use_cache:
690
+ outputs += (present_key_value,)
691
+
692
+ return outputs
693
+
694
+
695
+ LLAMA_START_DOCSTRING = r"""
696
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
697
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
698
+ etc.)
699
+
700
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
701
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
702
+ and behavior.
703
+
704
+ Parameters:
705
+ config ([`LlamaConfig`]):
706
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
707
+ load the weights associated with the model, only the configuration. Check out the
708
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
709
+ """
710
+
711
+
712
+ @add_start_docstrings(
713
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
714
+ LLAMA_START_DOCSTRING,
715
+ )
716
+ class LlamaPreTrainedModel(PreTrainedModel):
717
+ config_class = LlamaConfig
718
+ base_model_prefix = "model"
719
+ supports_gradient_checkpointing = True
720
+ _no_split_modules = ["LlamaDecoderLayer"]
721
+ _skip_keys_device_placement = "past_key_values"
722
+ _supports_flash_attn_2 = True
723
+
724
+ def _init_weights(self, module):
725
+ std = self.config.initializer_range
726
+ if isinstance(module, nn.Linear):
727
+ module.weight.data.normal_(mean=0.0, std=std)
728
+ if module.bias is not None:
729
+ module.bias.data.zero_()
730
+ elif isinstance(module, nn.Embedding):
731
+ module.weight.data.normal_(mean=0.0, std=std)
732
+ if module.padding_idx is not None:
733
+ module.weight.data[module.padding_idx].zero_()
734
+
735
+
736
+ LLAMA_INPUTS_DOCSTRING = r"""
737
+ Args:
738
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
739
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
740
+ it.
741
+
742
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
743
+ [`PreTrainedTokenizer.__call__`] for details.
744
+
745
+ [What are input IDs?](../glossary#input-ids)
746
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
747
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
748
+
749
+ - 1 for tokens that are **not masked**,
750
+ - 0 for tokens that are **masked**.
751
+
752
+ [What are attention masks?](../glossary#attention-mask)
753
+
754
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
755
+ [`PreTrainedTokenizer.__call__`] for details.
756
+
757
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
758
+ `past_key_values`).
759
+
760
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
761
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
762
+ information on the default strategy.
763
+
764
+ - 1 indicates the head is **not masked**,
765
+ - 0 indicates the head is **masked**.
766
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
767
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
768
+ config.n_positions - 1]`.
769
+
770
+ [What are position IDs?](../glossary#position-ids)
771
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
772
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
773
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of shape
774
+ `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
775
+
776
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
777
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
778
+
779
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
780
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
781
+ of shape `(batch_size, sequence_length)`.
782
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
783
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
784
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
785
+ model's internal embedding lookup matrix.
786
+ use_cache (`bool`, *optional*):
787
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
788
+ `past_key_values`).
789
+ output_attentions (`bool`, *optional*):
790
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
791
+ tensors for more detail.
792
+ output_hidden_states (`bool`, *optional*):
793
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
794
+ more detail.
795
+ return_dict (`bool`, *optional*):
796
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
797
+ """
798
+
799
+
800
+ @add_start_docstrings(
801
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
802
+ LLAMA_START_DOCSTRING,
803
+ )
804
+ class LlamaModel(LlamaPreTrainedModel):
805
+ """
806
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
807
+
808
+ Args:
809
+ config: LlamaConfig
810
+ """
811
+
812
+ def __init__(self, config: LlamaConfig):
813
+ super().__init__(config)
814
+ self.padding_idx = config.pad_token_id
815
+ self.vocab_size = config.vocab_size
816
+
817
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
818
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
819
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
820
+
821
+ self.gradient_checkpointing = False
822
+ # Initialize weights and apply final processing
823
+ self.post_init()
824
+
825
+ def get_input_embeddings(self):
826
+ return self.embed_tokens
827
+
828
+ def set_input_embeddings(self, value):
829
+ self.embed_tokens = value
830
+
831
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
832
+ def forward(
833
+ self,
834
+ input_ids: torch.LongTensor = None,
835
+ attention_mask: Optional[torch.Tensor] = None,
836
+ position_ids: Optional[torch.LongTensor] = None,
837
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
838
+ inputs_embeds: Optional[torch.FloatTensor] = None,
839
+ use_cache: Optional[bool] = None,
840
+ output_attentions: Optional[bool] = None,
841
+ output_hidden_states: Optional[bool] = None,
842
+ return_dict: Optional[bool] = None,
843
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
844
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
845
+ output_hidden_states = (
846
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
847
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
848
+
849
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
850
+
851
+ # retrieve input_ids and inputs_embeds
852
+ if input_ids is not None and inputs_embeds is not None:
853
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
854
+ elif input_ids is not None:
855
+ batch_size, seq_length = input_ids.shape[:2]
856
+ elif inputs_embeds is not None:
857
+ batch_size, seq_length = inputs_embeds.shape[:2]
858
+ else:
859
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
860
+
861
+ past_key_values_length = 0
862
+ if past_key_values is not None:
863
+ past_key_values_length = past_key_values[0][0].shape[2]
864
+
865
+ if position_ids is None:
866
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
867
+ position_ids = torch.arange(past_key_values_length,
868
+ seq_length + past_key_values_length,
869
+ dtype=torch.long,
870
+ device=device)
871
+ position_ids = position_ids.unsqueeze(0)
872
+
873
+ if inputs_embeds is None:
874
+ inputs_embeds = self.embed_tokens(input_ids)
875
+
876
+ if getattr(self.config, "_flash_attn_2_enabled", False):
877
+ # 2d mask is passed through the layers
878
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
879
+ else:
880
+ # 4d mask is passed through the layers
881
+ attention_mask = _prepare_4d_causal_attention_mask(attention_mask, (batch_size, seq_length), inputs_embeds,
882
+ past_key_values_length)
883
+
884
+ # embed positions
885
+ hidden_states = inputs_embeds
886
+
887
+ if self.gradient_checkpointing and self.training:
888
+ if use_cache:
889
+ logger.warning_once(
890
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
891
+ use_cache = False
892
+
893
+ # decoder layers
894
+ all_hidden_states = () if output_hidden_states else None
895
+ all_self_attns = () if output_attentions else None
896
+ next_decoder_cache = () if use_cache else None
897
+
898
+ for idx, decoder_layer in enumerate(self.layers):
899
+ if output_hidden_states:
900
+ all_hidden_states += (hidden_states,)
901
+
902
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
903
+
904
+ if self.gradient_checkpointing and self.training:
905
+ layer_outputs = self._gradient_checkpointing_func(
906
+ decoder_layer.__call__,
907
+ hidden_states,
908
+ attention_mask,
909
+ position_ids,
910
+ past_key_value,
911
+ output_attentions,
912
+ use_cache,
913
+ )
914
+ else:
915
+ layer_outputs = decoder_layer(
916
+ hidden_states,
917
+ attention_mask=attention_mask,
918
+ position_ids=position_ids,
919
+ past_key_value=past_key_value,
920
+ output_attentions=output_attentions,
921
+ use_cache=use_cache,
922
+ )
923
+
924
+ hidden_states = layer_outputs[0]
925
+
926
+ if use_cache:
927
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
928
+
929
+ if output_attentions:
930
+ all_self_attns += (layer_outputs[1],)
931
+
932
+ hidden_states = self.norm(hidden_states)
933
+
934
+ # add hidden states from the last decoder layer
935
+ if output_hidden_states:
936
+ all_hidden_states += (hidden_states,)
937
+
938
+ next_cache = next_decoder_cache if use_cache else None
939
+ if not return_dict:
940
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
941
+ return BaseModelOutputWithPast(
942
+ last_hidden_state=hidden_states,
943
+ past_key_values=next_cache,
944
+ hidden_states=all_hidden_states,
945
+ attentions=all_self_attns,
946
+ )
947
+
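As a quick sanity check of the stack above (embeddings → decoder layers → final RMSNorm), one could instantiate `LlamaModel` with a deliberately tiny, made-up `LlamaConfig` and run a forward pass. This is only a shape-check sketch, assuming this file's imports are available and that the installed `transformers` exposes the config fields used above (`attention_bias`, `num_key_value_heads`, `rope_theta`, ...):

```python
import torch
from transformers import LlamaConfig

# Tiny, invented configuration -- for a shape check only, not a real model.
cfg = LlamaConfig(
    vocab_size=128,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
    max_position_embeddings=64,
)

model = LlamaModel(cfg)  # the class defined above
input_ids = torch.randint(0, cfg.vocab_size, (1, 10))
out = model(input_ids=input_ids)
print(out.last_hidden_state.shape)  # torch.Size([1, 10, 64])
```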
948
+
949
+ class LlamaForCausalLM(LlamaPreTrainedModel):
950
+ _tied_weights_keys = ["lm_head.weight"]
951
+
952
+ def __init__(self, config):
953
+ super().__init__(config)
954
+ self.model = LlamaModel(config)
955
+ self.vocab_size = config.vocab_size
956
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
957
+
958
+ # Initialize weights and apply final processing
959
+ self.post_init()
960
+
961
+ def get_input_embeddings(self):
962
+ return self.model.embed_tokens
963
+
964
+ def set_input_embeddings(self, value):
965
+ self.model.embed_tokens = value
966
+
967
+ def get_output_embeddings(self):
968
+ return self.lm_head
969
+
970
+ def set_output_embeddings(self, new_embeddings):
971
+ self.lm_head = new_embeddings
972
+
973
+ def set_decoder(self, decoder):
974
+ self.model = decoder
975
+
976
+ def get_decoder(self):
977
+ return self.model
978
+
979
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
980
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
981
+ def forward(
982
+ self,
983
+ input_ids: torch.LongTensor = None,
984
+ attention_mask: Optional[torch.Tensor] = None,
985
+ position_ids: Optional[torch.LongTensor] = None,
986
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
987
+ inputs_embeds: Optional[torch.FloatTensor] = None,
988
+ labels: Optional[torch.LongTensor] = None,
989
+ use_cache: Optional[bool] = None,
990
+ output_attentions: Optional[bool] = None,
991
+ output_hidden_states: Optional[bool] = None,
992
+ return_dict: Optional[bool] = None,
993
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
994
+ r"""
995
+ Args:
996
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
997
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
998
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
999
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1000
+
1001
+ Returns:
1002
+
1003
+ Example:
1004
+
1005
+ ```python
1006
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
1007
+
1008
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1009
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1010
+
1011
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1012
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1013
+
1014
+ >>> # Generate
1015
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1016
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1017
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1018
+ ```"""
1019
+
1020
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1021
+ output_hidden_states = (
1022
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
1023
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1024
+
1025
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1026
+ outputs = self.model(
1027
+ input_ids=input_ids,
1028
+ attention_mask=attention_mask,
1029
+ position_ids=position_ids,
1030
+ past_key_values=past_key_values,
1031
+ inputs_embeds=inputs_embeds,
1032
+ use_cache=use_cache,
1033
+ output_attentions=output_attentions,
1034
+ output_hidden_states=output_hidden_states,
1035
+ return_dict=return_dict,
1036
+ )
1037
+
1038
+ hidden_states = outputs[0]
1039
+ if self.config.pretraining_tp > 1:
1040
+ lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1041
+ logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
1042
+ logits = torch.cat(logits, dim=-1)
1043
+ else:
1044
+ logits = self.lm_head(hidden_states)
1045
+ logits = logits.float()
1046
+
1047
+ loss = None
1048
+ if labels is not None:
1049
+ # Shift so that tokens < n predict n
1050
+ shift_logits = logits[..., :-1, :].contiguous()
1051
+ shift_labels = labels[..., 1:].contiguous()
1052
+ # Flatten the tokens
1053
+ loss_fct = CrossEntropyLoss()
1054
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1055
+ shift_labels = shift_labels.view(-1)
1056
+ # Enable model parallelism
1057
+ shift_labels = shift_labels.to(shift_logits.device)
1058
+ loss = loss_fct(shift_logits, shift_labels)
1059
+
1060
+ if not return_dict:
1061
+ output = (logits,) + outputs[1:]
1062
+ return (loss,) + output if loss is not None else output
1063
+
1064
+ return CausalLMOutputWithPast(
1065
+ loss=loss,
1066
+ logits=logits,
1067
+ past_key_values=outputs.past_key_values,
1068
+ hidden_states=outputs.hidden_states,
1069
+ attentions=outputs.attentions,
1070
+ )
1071
+
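The loss block above implements standard next-token prediction: the logits at position t are scored against the token at position t+1. A small worked sketch of that shift (toy tensors, ignoring the `-100` ignore-index for brevity):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
logits = torch.randn(1, 5, vocab_size)           # (batch, seq_len, vocab)
labels = torch.tensor([[3, 1, 4, 1, 5]])         # (batch, seq_len)

shift_logits = logits[..., :-1, :].contiguous()  # predictions made at positions 0..3
shift_labels = labels[..., 1:].contiguous()      # targets are the *next* tokens 1..4

loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss)
```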
1072
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None,
1073
+ **kwargs):
1074
+ if past_key_values is not None:
1075
+ past_length = past_key_values[0][0].shape[2]
1076
+
1077
+ # Some generation methods already pass only the last input ID
1078
+ if input_ids.shape[1] > past_length:
1079
+ remove_prefix_length = past_length
1080
+ else:
1081
+ # Default to old behavior: keep only final ID
1082
+ remove_prefix_length = input_ids.shape[1] - 1
1083
+
1084
+ input_ids = input_ids[:, remove_prefix_length:]
1085
+
1086
+ position_ids = kwargs.get("position_ids", None)
1087
+ if attention_mask is not None and position_ids is None:
1088
+ # create position_ids on the fly for batch generation
1089
+ position_ids = attention_mask.long().cumsum(-1) - 1
1090
+ position_ids.masked_fill_(attention_mask == 0, 1)
1091
+ if past_key_values:
1092
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1093
+
1094
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1095
+ if inputs_embeds is not None and past_key_values is None:
1096
+ model_inputs = {"inputs_embeds": inputs_embeds}
1097
+ else:
1098
+ model_inputs = {"input_ids": input_ids}
1099
+
1100
+ model_inputs.update({
1101
+ "position_ids": position_ids,
1102
+ "past_key_values": past_key_values,
1103
+ "use_cache": kwargs.get("use_cache"),
1104
+ "attention_mask": attention_mask,
1105
+ })
1106
+ return model_inputs
1107
+
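The cumsum trick used above for `position_ids` is worth making concrete: for a left-padded batch it assigns consecutive positions to the real tokens and a harmless dummy position to padding (which the attention mask removes anyway). A toy illustration:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```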
1108
+ @staticmethod
1109
+ def _reorder_cache(past_key_values, beam_idx):
1110
+ reordered_past = ()
1111
+ for layer_past in past_key_values:
1112
+ reordered_past += (
1113
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),)
1114
+ return reordered_past
1115
+
1116
+
1117
+ @add_start_docstrings(
1118
+ """
1119
+ The LLaMa Model transformer with a sequence classification head on top (linear layer).
1120
+
1121
+ [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1122
+ (e.g. GPT-2) do.
1123
+
1124
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1125
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1126
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1127
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1128
+ each row of the batch).
1129
+ """,
1130
+ LLAMA_START_DOCSTRING,
1131
+ )
1132
+ class LlamaForSequenceClassification(LlamaPreTrainedModel):
1133
+
1134
+ def __init__(self, config):
1135
+ super().__init__(config)
1136
+ self.num_labels = config.num_labels
1137
+ self.model = LlamaModel(config)
1138
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1139
+
1140
+ # Initialize weights and apply final processing
1141
+ self.post_init()
1142
+
1143
+ def get_input_embeddings(self):
1144
+ return self.model.embed_tokens
1145
+
1146
+ def set_input_embeddings(self, value):
1147
+ self.model.embed_tokens = value
1148
+
1149
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
1150
+ def forward(
1151
+ self,
1152
+ input_ids: torch.LongTensor = None,
1153
+ attention_mask: Optional[torch.Tensor] = None,
1154
+ position_ids: Optional[torch.LongTensor] = None,
1155
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1156
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1157
+ labels: Optional[torch.LongTensor] = None,
1158
+ use_cache: Optional[bool] = None,
1159
+ output_attentions: Optional[bool] = None,
1160
+ output_hidden_states: Optional[bool] = None,
1161
+ return_dict: Optional[bool] = None,
1162
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1163
+ r"""
1164
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1165
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1166
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1167
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1168
+ """
1169
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1170
+
1171
+ transformer_outputs = self.model(
1172
+ input_ids,
1173
+ attention_mask=attention_mask,
1174
+ position_ids=position_ids,
1175
+ past_key_values=past_key_values,
1176
+ inputs_embeds=inputs_embeds,
1177
+ use_cache=use_cache,
1178
+ output_attentions=output_attentions,
1179
+ output_hidden_states=output_hidden_states,
1180
+ return_dict=return_dict,
1181
+ )
1182
+ hidden_states = transformer_outputs[0]
1183
+ logits = self.score(hidden_states)
1184
+
1185
+ if input_ids is not None:
1186
+ batch_size = input_ids.shape[0]
1187
+ else:
1188
+ batch_size = inputs_embeds.shape[0]
1189
+
1190
+ if self.config.pad_token_id is None and batch_size != 1:
1191
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1192
+ if self.config.pad_token_id is None:
1193
+ sequence_lengths = -1
1194
+ else:
1195
+ if input_ids is not None:
1196
+ sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to(
1197
+ logits.device)
1198
+ else:
1199
+ sequence_lengths = -1
1200
+
1201
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1202
+
1203
+ loss = None
1204
+ if labels is not None:
1205
+ labels = labels.to(logits.device)
1206
+ if self.config.problem_type is None:
1207
+ if self.num_labels == 1:
1208
+ self.config.problem_type = "regression"
1209
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1210
+ self.config.problem_type = "single_label_classification"
1211
+ else:
1212
+ self.config.problem_type = "multi_label_classification"
1213
+
1214
+ if self.config.problem_type == "regression":
1215
+ loss_fct = MSELoss()
1216
+ if self.num_labels == 1:
1217
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1218
+ else:
1219
+ loss = loss_fct(pooled_logits, labels)
1220
+ elif self.config.problem_type == "single_label_classification":
1221
+ loss_fct = CrossEntropyLoss()
1222
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1223
+ elif self.config.problem_type == "multi_label_classification":
1224
+ loss_fct = BCEWithLogitsLoss()
1225
+ loss = loss_fct(pooled_logits, labels)
1226
+ if not return_dict:
1227
+ output = (pooled_logits,) + transformer_outputs[1:]
1228
+ return ((loss,) + output) if loss is not None else output
1229
+
1230
+ return SequenceClassifierOutputWithPast(
1231
+ loss=loss,
1232
+ logits=pooled_logits,
1233
+ past_key_values=transformer_outputs.past_key_values,
1234
+ hidden_states=transformer_outputs.hidden_states,
1235
+ attentions=transformer_outputs.attentions,
1236
+ )
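Because `LlamaForSequenceClassification` pools the hidden state of the last non-padding token, the `sequence_lengths` computation above locates the first `pad_token_id` in each row and steps one position back; rows without any padding wrap around to index -1, i.e. the final position. A small sketch with a hypothetical pad id:

```python
import torch

pad_token_id = 0  # assumed value for this example
input_ids = torch.tensor([[5, 7, 9, 0, 0],
                          [2, 4, 6, 8, 3]])

# argmax over the boolean "is pad" vector returns the first pad position;
# subtracting 1 gives the index of the last real token. A row with no padding
# yields -1, which indexes the final position when used for pooling.
sequence_lengths = torch.eq(input_ids, pad_token_id).long().argmax(-1) - 1
print(sequence_lengths)  # tensor([ 2, -1])
```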
src/models_clm/modeling_llama_xformer.py ADDED
@@ -0,0 +1,992 @@
1
+ # flake8: noqa
2
+ # coding=utf-8
3
+ # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """ PyTorch LLaMA model."""
22
+ import math
23
+ from typing import List, Optional, Tuple, Union
24
+
25
+ import torch
26
+ import torch.utils.checkpoint
27
+ from torch import nn
28
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.modeling_outputs import (
32
+ BaseModelOutputWithPast,
33
+ CausalLMOutputWithPast,
34
+ SequenceClassifierOutputWithPast,
35
+ )
36
+ from transformers.modeling_utils import PreTrainedModel
37
+ from transformers.utils import (
38
+ add_start_docstrings,
39
+ add_start_docstrings_to_model_forward,
40
+ logging,
41
+ replace_return_docstrings,
42
+ )
43
+ from transformers.models.llama.configuration_llama import LlamaConfig
44
+ import xformers.ops as xops
45
+
46
+ logger = logging.get_logger(__name__)
47
+
48
+ _CONFIG_FOR_DOC = "LlamaConfig"
49
+
50
+
51
+ # Copied from transformers.models.bart.modeling_bart._make_causal_mask
52
+ def _make_causal_mask(
53
+ input_ids_shape: torch.Size,
54
+ dtype: torch.dtype,
55
+ device: torch.device,
56
+ past_key_values_length: int = 0,
57
+ ):
58
+ """
59
+ Make causal mask used for bi-directional self-attention.
60
+ """
61
+ bsz, tgt_len = input_ids_shape
62
+ mask = torch.full(
63
+ (tgt_len, tgt_len),
64
+ torch.tensor(torch.finfo(dtype).min, device=device),
65
+ device=device,
66
+ )
67
+ mask_cond = torch.arange(mask.size(-1), device=device)
68
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
69
+ mask = mask.to(dtype)
70
+
71
+ if past_key_values_length > 0:
72
+ mask = torch.cat(
73
+ [
74
+ torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device),
75
+ mask,
76
+ ],
77
+ dim=-1,
78
+ )
79
+ return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
80
+
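`_make_causal_mask` builds the standard additive causal mask: 0 where a query may attend and the dtype minimum where it may not, with zero-filled columns prepended for cached past keys. A torch-only sketch of the same pattern for `tgt_len=3` and `past_key_values_length=2`:

```python
import torch

tgt_len, past_len = 3, 2
neg = torch.finfo(torch.float32).min

mask = torch.full((tgt_len, tgt_len), neg).triu(1)               # 0 on/below the diagonal
mask = torch.cat([torch.zeros(tgt_len, past_len), mask], dim=-1)  # cached keys are always visible
print(mask)
# Row i may attend to the 2 cached positions plus new positions 0..i
# (the dtype minimum is printed where attention is blocked):
# [[0, 0, 0, min, min],
#  [0, 0, 0, 0,   min],
#  [0, 0, 0, 0,   0  ]]
```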
81
+
82
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
83
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
84
+ """
85
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
86
+ """
87
+ bsz, src_len = mask.size()
88
+ tgt_len = tgt_len if tgt_len is not None else src_len
89
+
90
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
91
+
92
+ inverted_mask = 1.0 - expanded_mask
93
+
94
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
95
+
96
+
97
+ class LlamaRMSNorm(nn.Module):
98
+
99
+ def __init__(self, hidden_size, eps=1e-6):
100
+ """
101
+ LlamaRMSNorm is equivalent to T5LayerNorm
102
+ """
103
+ super().__init__()
104
+ self.weight = nn.Parameter(torch.ones(hidden_size))
105
+ self.variance_epsilon = eps
106
+
107
+ def forward(self, hidden_states):
108
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
109
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
110
+
111
+ # convert into half-precision if necessary
112
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
113
+ hidden_states = hidden_states.to(self.weight.dtype)
114
+
115
+ return self.weight * hidden_states
116
+
117
+
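As the docstring says, `LlamaRMSNorm` matches T5LayerNorm: it rescales by the root-mean-square of the features (no mean subtraction, no bias), computing the statistic in fp32. A quick check against a manual computation, assuming the class above is in scope and using fp32 inputs so the half-precision branch is not taken:

```python
import torch

hidden_size, eps = 8, 1e-6
x = torch.randn(2, 4, hidden_size)

norm = LlamaRMSNorm(hidden_size, eps=eps)  # class defined above; weight starts at all ones
manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

print(torch.allclose(norm(x), manual, atol=1e-6))  # True
```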
118
+ class LlamaRotaryEmbedding(torch.nn.Module):
119
+
120
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
121
+ super().__init__()
122
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
123
+ self.register_buffer("inv_freq", inv_freq)
124
+
125
+ # Build here to make `torch.jit.trace` work.
126
+ self.max_seq_len_cached = max_position_embeddings
127
+ t = torch.arange(
128
+ self.max_seq_len_cached,
129
+ device=self.inv_freq.device,
130
+ dtype=self.inv_freq.dtype,
131
+ )
132
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
133
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
134
+ emb = torch.cat((freqs, freqs), dim=-1)
135
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
136
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
137
+
138
+ def forward(self, x, seq_len=None):
139
+ # x: [bs, num_attention_heads, seq_len, head_size]
140
+ # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
141
+ if seq_len > self.max_seq_len_cached:
142
+ self.max_seq_len_cached = seq_len
143
+ t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
144
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
145
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
146
+ emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
147
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
148
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
149
+ return (
150
+ self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
151
+ self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
152
+ # self.cos_cached[:, :, :, ...].to(dtype=x.dtype),
153
+ # self.sin_cached[:, :, :, ...].to(dtype=x.dtype),
154
+
155
+ )
156
+
157
+
158
+ def rotate_half(x):
159
+ """Rotates half the hidden dims of the input."""
160
+ x1 = x[..., :x.shape[-1] // 2]
161
+ x2 = x[..., x.shape[-1] // 2:]
162
+ return torch.cat((-x2, x1), dim=-1)
163
+
164
+
165
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
166
+ # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
167
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim]
168
+ sin = sin.squeeze(1).squeeze(0) # [seq_len, dim]
169
+ cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
170
+ sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim]
171
+ q_embed = (q * cos) + (rotate_half(q) * sin)
172
+ k_embed = (k * cos) + (rotate_half(k) * sin)
173
+ return q_embed, k_embed
174
+
175
+
176
+ class LlamaMLP(nn.Module):
177
+
178
+ def __init__(
179
+ self,
180
+ hidden_size: int,
181
+ intermediate_size: int,
182
+ hidden_act: str,
183
+ ):
184
+ super().__init__()
185
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
186
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
187
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
188
+ self.act_fn = ACT2FN[hidden_act]
189
+
190
+ def forward(self, x):
191
+ return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
192
+
193
+
194
+ class LlamaAttention(nn.Module):
195
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
196
+
197
+ def __init__(self, config: LlamaConfig):
198
+ super().__init__()
199
+ self.config = config
200
+ self.hidden_size = config.hidden_size
201
+ self.num_heads = config.num_attention_heads
202
+ self.head_dim = self.hidden_size // self.num_heads
203
+ self.max_position_embeddings = config.max_position_embeddings
204
+
205
+ if (self.head_dim * self.num_heads) != self.hidden_size:
206
+ raise ValueError(f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
207
+ f" and `num_heads`: {self.num_heads}).")
208
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
209
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
210
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
211
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
212
+ self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
213
+
214
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
215
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
216
+
217
+ def forward(
218
+ self,
219
+ hidden_states: torch.Tensor,
220
+ attention_mask: Optional[torch.Tensor] = None,
221
+ position_ids: Optional[torch.LongTensor] = None,
222
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
223
+ output_attentions: bool = False,
224
+ use_cache: bool = False,
225
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
226
+ bsz, q_len, _ = hidden_states.size()
227
+
228
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
229
+ key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
230
+ value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
231
+
232
+ kv_seq_len = key_states.shape[-2]
233
+ if past_key_value is not None:
234
+ kv_seq_len += past_key_value[0].shape[-2]
235
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
236
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
237
+ # [bsz, nh, t, hd]
238
+
239
+ if past_key_value is not None:
240
+ # reuse k, v, self_attention
241
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
242
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
243
+
244
+ past_key_value = (key_states, value_states) if use_cache else None
245
+
246
+ # attn_weights
247
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
248
+ if attention_mask is None:
249
+ def lower_triangular_from_bottom_right_mask(qlen, klen, device):
250
+ """
251
+ Create a lower triangular mask from the bottom-right corner of a matrix.
252
+
253
+ Args:
254
+ - qlen (int): Length of the query dimension.
255
+ - klen (int): Length of the key dimension.
256
+
257
+ Returns:
258
+ - torch.Tensor: A mask with shape (1, 1, qlen, klen) where the bottom-right triangle is True.
259
+ """
260
+ # Create a grid of indices where rows correspond to query indices and columns to key indices
261
+ q_indices = torch.arange(qlen - 1, -1, -1, device=device).unsqueeze(1) # Reverse the query indices
262
+ k_indices = torch.arange(klen - 1, -1, -1, device=device).unsqueeze(0) # Reverse the key indices
263
+
264
+ # Generate the mask where we compare query indices to key indices
265
+ # The condition q_indices >= k_indices creates a lower triangular mask from the top-left corner
266
+ # By reversing both indices, we get the lower triangular effect from the bottom-right
267
+ mask = q_indices >= k_indices
268
+
269
+ # Reshape to (1, 1, qlen, klen) as required
270
+ return mask.unsqueeze(0).unsqueeze(0)
271
+
272
+ attention_mask = lower_triangular_from_bottom_right_mask(attn_weights.shape[-2], attn_weights.shape[-1],
273
+ device=attn_weights.device)
274
+ attn_weights = attn_weights + attention_mask
275
+ attn_weights = attn_weights[:, 0, :, :]
276
+ # attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
277
+
278
+ query_states = query_states.transpose(1, 2)
279
+ key_states = key_states.transpose(1, 2)
280
+ value_states = value_states.transpose(1, 2)
281
+ if self.training:
282
+ attn_output = xops.memory_efficient_attention(
283
+ query_states,
284
+ key_states,
285
+ value_states,
286
+ attn_bias=xops.LowerTriangularMask(),
287
+ )
288
+ else:
289
+ xops_attention_mask = xops.fmha.attn_bias.LowerTriangularFromBottomRightMask()
290
+ attn_output = xops.memory_efficient_attention(
291
+ query_states,
292
+ key_states,
293
+ value_states,
294
+ attn_bias=xops_attention_mask,
295
+ )
296
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
297
+ attn_output = self.o_proj(attn_output)
298
+
299
+ if not output_attentions:
300
+ attn_weights = None
301
+ return attn_output, attn_weights, past_key_value
302
+
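At inference time the branch above hands xformers a `LowerTriangularFromBottomRightMask`, the causal pattern needed once cached keys make the key sequence longer than the query sequence: the last query row may see every key, the row before it every key except the newest one, and so on, with the triangle anchored at the bottom-right corner. A torch-only illustration of the allowed pattern for `qlen=2`, `klen=5`:

```python
import torch

qlen, klen = 2, 5   # e.g. 2 new tokens attending over 3 cached + 2 new keys
q_idx = torch.arange(qlen).unsqueeze(1)   # query positions as a column
k_idx = torch.arange(klen).unsqueeze(0)   # key positions as a row

# Bottom-right alignment: query i may attend to key j iff j <= i + (klen - qlen).
allowed = k_idx <= q_idx + (klen - qlen)
print(allowed.int())
# tensor([[1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
```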
303
+
304
+ class LlamaDecoderLayer(nn.Module):
305
+
306
+ def __init__(self, config: LlamaConfig):
307
+ super().__init__()
308
+ self.hidden_size = config.hidden_size
309
+ self.self_attn = LlamaAttention(config=config)
310
+ self.mlp = LlamaMLP(
311
+ hidden_size=self.hidden_size,
312
+ intermediate_size=config.intermediate_size,
313
+ hidden_act=config.hidden_act,
314
+ )
315
+ self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
316
+ self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
317
+
318
+ def forward(
319
+ self,
320
+ hidden_states: torch.Tensor,
321
+ attention_mask: Optional[torch.Tensor] = None,
322
+ position_ids: Optional[torch.LongTensor] = None,
323
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
324
+ output_attentions: Optional[bool] = False,
325
+ use_cache: Optional[bool] = False,
326
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
327
+ """
328
+ Args:
329
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
330
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
331
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
332
+ output_attentions (`bool`, *optional*):
333
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
334
+ returned tensors for more detail.
335
+ use_cache (`bool`, *optional*):
336
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
337
+ (see `past_key_values`).
338
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
339
+ """
340
+
341
+ residual = hidden_states
342
+
343
+ hidden_states = self.input_layernorm(hidden_states)
344
+ # Self Attention
345
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
346
+ hidden_states=hidden_states,
347
+ attention_mask=attention_mask,
348
+ position_ids=position_ids,
349
+ past_key_value=past_key_value,
350
+ output_attentions=output_attentions,
351
+ use_cache=use_cache,
352
+ )
353
+ hidden_states = residual + hidden_states
354
+
355
+ # Fully Connected
356
+ residual = hidden_states
357
+ hidden_states = self.post_attention_layernorm(hidden_states)
358
+ hidden_states = self.mlp(hidden_states)
359
+ hidden_states = residual + hidden_states
360
+
361
+ outputs = (hidden_states,)
362
+
363
+ if output_attentions:
364
+ outputs += (self_attn_weights,)
365
+
366
+ if use_cache:
367
+ outputs += (present_key_value,)
368
+ return outputs
369
+
370
+
371
+ LLAMA_START_DOCSTRING = r"""
372
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
373
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
374
+ etc.)
375
+
376
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
377
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
378
+ and behavior.
379
+
380
+ Parameters:
381
+ config ([`LlamaConfig`]):
382
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
383
+ load the weights associated with the model, only the configuration. Check out the
384
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
385
+ """
386
+
387
+
388
+ @add_start_docstrings(
389
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
390
+ LLAMA_START_DOCSTRING,
391
+ )
392
+ class LlamaPreTrainedModel(PreTrainedModel):
393
+ config_class = LlamaConfig
394
+ base_model_prefix = "model"
395
+ supports_gradient_checkpointing = True
396
+ _no_split_modules = ["LlamaDecoderLayer"]
397
+ _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
398
+
399
+ def _init_weights(self, module):
400
+ std = self.config.initializer_range
401
+ if isinstance(module, nn.Linear):
402
+ module.weight.data.normal_(mean=0.0, std=std)
403
+ if module.bias is not None:
404
+ module.bias.data.zero_()
405
+ elif isinstance(module, nn.Embedding):
406
+ module.weight.data.normal_(mean=0.0, std=std)
407
+ if module.padding_idx is not None:
408
+ module.weight.data[module.padding_idx].zero_()
409
+
410
+ def _set_gradient_checkpointing(self, module, value=False):
411
+ if isinstance(module, LlamaModel):
412
+ module.gradient_checkpointing = value
413
+
414
+
415
+ LLAMA_INPUTS_DOCSTRING = r"""
416
+ Args:
417
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
418
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
419
+ it.
420
+
421
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
422
+ [`PreTrainedTokenizer.__call__`] for details.
423
+
424
+ [What are input IDs?](../glossary#input-ids)
425
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
426
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
427
+
428
+ - 1 for tokens that are **not masked**,
429
+ - 0 for tokens that are **masked**.
430
+
431
+ [What are attention masks?](../glossary#attention-mask)
432
+
433
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
434
+ [`PreTrainedTokenizer.__call__`] for details.
435
+
436
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
437
+ `past_key_values`).
438
+
439
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
440
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
441
+ information on the default strategy.
442
+
443
+ - 1 indicates the head is **not masked**,
444
+ - 0 indicates the head is **masked**.
445
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
446
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
447
+ config.n_positions - 1]`.
448
+
449
+ [What are position IDs?](../glossary#position-ids)
450
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
451
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
452
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
453
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
454
+
455
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
456
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
457
+
458
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
459
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
460
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
461
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
462
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
463
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
464
+ model's internal embedding lookup matrix.
465
+ use_cache (`bool`, *optional*):
466
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
467
+ `past_key_values`).
468
+ output_attentions (`bool`, *optional*):
469
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
470
+ tensors for more detail.
471
+ output_hidden_states (`bool`, *optional*):
472
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
473
+ more detail.
474
+ return_dict (`bool`, *optional*):
475
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
476
+ """
477
+
478
+
479
+ @add_start_docstrings(
480
+ "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
481
+ LLAMA_START_DOCSTRING,
482
+ )
483
+ class LlamaModel(LlamaPreTrainedModel):
484
+ """
485
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
486
+
487
+ Args:
488
+ config: LlamaConfig
489
+ """
490
+
491
+ def __init__(self, config: LlamaConfig):
492
+ super().__init__(config)
493
+ self.padding_idx = config.pad_token_id
494
+ self.vocab_size = config.vocab_size
495
+
496
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
497
+ self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
498
+ self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
499
+
500
+ self.gradient_checkpointing = False
501
+ # Initialize weights and apply final processing
502
+ self.post_init()
503
+
504
+ def get_input_embeddings(self):
505
+ return self.embed_tokens
506
+
507
+ def set_input_embeddings(self, value):
508
+ self.embed_tokens = value
509
+
510
+ # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
511
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
512
+ # create causal mask
513
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
514
+ combined_attention_mask = None
515
+ if input_shape[-1] > 1:
516
+ combined_attention_mask = _make_causal_mask(
517
+ input_shape,
518
+ inputs_embeds.dtype,
519
+ device=inputs_embeds.device,
520
+ past_key_values_length=past_key_values_length,
521
+ )
522
+
523
+ if attention_mask is not None:
524
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
525
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype,
526
+ tgt_len=input_shape[-1]).to(inputs_embeds.device)
527
+ combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
528
+
529
+ return combined_attention_mask
530
+
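For readers unfamiliar with the additive attention bias built by `_prepare_decoder_attention_mask` above, here is a minimal, self-contained sketch of how a causal mask and a padding mask combine into the `[bsz, 1, tgt_len, src_len]` tensor the decoder layers consume. It uses plain `torch` ops rather than the file's `_make_causal_mask`/`_expand_mask` helpers, so shapes and names are illustrative only.

```python
import torch

bsz, seq_len = 2, 5
dtype = torch.float32

# causal part: each query position may only attend to itself and earlier positions
causal = torch.full((seq_len, seq_len), torch.finfo(dtype).min)
causal = torch.triu(causal, diagonal=1)                      # upper triangle = very negative
causal = causal[None, None, :, :].expand(bsz, 1, seq_len, seq_len)

# padding part: a 0 in attention_mask marks a padding token
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0]])
padding = (1.0 - attention_mask[:, None, None, :].to(dtype)) * torch.finfo(dtype).min

combined = causal + padding                                   # [2, 1, 5, 5] additive bias
print(combined[1, 0])                                         # row of the padded sequence
```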
531
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
532
+ def forward(
533
+ self,
534
+ input_ids: torch.LongTensor = None,
535
+ attention_mask: Optional[torch.Tensor] = None,
536
+ position_ids: Optional[torch.LongTensor] = None,
537
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
538
+ inputs_embeds: Optional[torch.FloatTensor] = None,
539
+ use_cache: Optional[bool] = None,
540
+ output_attentions: Optional[bool] = None,
541
+ output_hidden_states: Optional[bool] = None,
542
+ return_dict: Optional[bool] = None,
543
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
544
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
545
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
546
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
547
+
548
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
549
+
550
+ # retrieve input_ids and inputs_embeds
551
+ # if input_ids is not None and inputs_embeds is not None:
552
+ # raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
553
+ # elif input_ids is not None:
554
+ if input_ids is not None:
555
+ batch_size, seq_length = input_ids.shape
556
+ elif inputs_embeds is not None:
557
+ batch_size, seq_length, _ = inputs_embeds.shape
558
+ else:
559
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
560
+
561
+ seq_length_with_past = seq_length
562
+ past_key_values_length = 0
563
+
564
+ if past_key_values is not None:
565
+ past_key_values_length = past_key_values[0][0].shape[2]
566
+ seq_length_with_past = seq_length_with_past + past_key_values_length
567
+
568
+ if position_ids is None:
569
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
570
+ position_ids = torch.arange(
571
+ past_key_values_length,
572
+ seq_length + past_key_values_length,
573
+ dtype=torch.long,
574
+ device=device,
575
+ )
576
+ position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
577
+ else:
578
+ position_ids = position_ids.view(-1, seq_length).long()
579
+
580
+ if inputs_embeds is None:
581
+ inputs_embeds = self.embed_tokens(input_ids)
582
+ # embed positions
583
+
584
+ # rm when use streaming
585
+ # if attention_mask is None:
586
+ # attention_mask = torch.ones(
587
+ # (batch_size, seq_length_with_past),
588
+ # dtype=torch.bool,
589
+ # device=inputs_embeds.device,
590
+ # )
591
+ attention_mask = self._prepare_decoder_attention_mask(
592
+ attention_mask,
593
+ (batch_size, seq_length),
594
+ inputs_embeds,
595
+ past_key_values_length,
596
+ )
597
+
598
+ hidden_states = inputs_embeds
599
+
600
+ if self.gradient_checkpointing and self.training:
601
+ if use_cache:
602
+ logger.warning_once(
603
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
604
+ use_cache = False
605
+
606
+ # decoder layers
607
+ all_hidden_states = () if output_hidden_states else None
608
+ all_self_attns = () if output_attentions else None
609
+ next_decoder_cache = () if use_cache else None
610
+
611
+ for idx, decoder_layer in enumerate(self.layers):
612
+ if output_hidden_states:
613
+ all_hidden_states += (hidden_states,)
614
+
615
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
616
+
617
+ if self.gradient_checkpointing and self.training:
618
+
619
+ def create_custom_forward(module):
620
+
621
+ def custom_forward(*inputs):
622
+ # None for past_key_value
623
+ return module(*inputs, output_attentions, None)
624
+
625
+ return custom_forward
626
+
627
+ layer_outputs = torch.utils.checkpoint.checkpoint(
628
+ create_custom_forward(decoder_layer),
629
+ hidden_states,
630
+ attention_mask,
631
+ position_ids,
632
+ None,
633
+ )
634
+ else:
635
+ layer_outputs = decoder_layer(
636
+ hidden_states,
637
+ attention_mask=attention_mask,
638
+ position_ids=position_ids,
639
+ past_key_value=past_key_value,
640
+ output_attentions=output_attentions,
641
+ use_cache=use_cache,
642
+ )
643
+
644
+ hidden_states = layer_outputs[0]
645
+
646
+ if use_cache:
647
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
648
+
649
+ if output_attentions:
650
+ all_self_attns += (layer_outputs[1],)
651
+
652
+ hidden_states = self.norm(hidden_states)
653
+
654
+ # add hidden states from the last decoder layer
655
+ if output_hidden_states:
656
+ all_hidden_states += (hidden_states,)
657
+
658
+ next_cache = next_decoder_cache if use_cache else None
659
+ if not return_dict:
660
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
661
+ return BaseModelOutputWithPast(
662
+ last_hidden_state=hidden_states,
663
+ past_key_values=next_cache,
664
+ hidden_states=all_hidden_states,
665
+ attentions=all_self_attns,
666
+ )
667
+
668
+
669
+ class LlamaForCausalLM(LlamaPreTrainedModel):
670
+
671
+ def __init__(self, config):
672
+ super().__init__(config)
673
+ self.model = LlamaModel(config)
674
+
675
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
676
+ self.past_key_values = None
677
+ self.kv_cache_head = None
678
+ self.use_kv_cache_head = True
679
+ # self.position_ids = None
680
+ # Initialize weights and apply final processing
681
+ self.post_init()
682
+
683
+ def get_input_embeddings(self):
684
+ return self.model.embed_tokens
685
+
686
+ def set_input_embeddings(self, value):
687
+ self.model.embed_tokens = value
688
+
689
+ def get_output_embeddings(self):
690
+ return self.lm_head
691
+
692
+ def set_output_embeddings(self, new_embeddings):
693
+ self.lm_head = new_embeddings
694
+
695
+ def set_decoder(self, decoder):
696
+ self.model = decoder
697
+
698
+ def get_decoder(self):
699
+ return self.model
700
+
701
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
702
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
703
+ def forward(
704
+ self,
705
+ input_ids: torch.LongTensor = None,
706
+ attention_mask: Optional[torch.Tensor] = None,
707
+ position_ids: Optional[torch.LongTensor] = None,
708
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
709
+ inputs_embeds: Optional[torch.FloatTensor] = None,
710
+ labels: Optional[torch.LongTensor] = None,
711
+ use_cache: Optional[bool] = None,
712
+ output_attentions: Optional[bool] = None,
713
+ output_hidden_states: Optional[bool] = None,
714
+ return_dict: Optional[bool] = None,
715
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
716
+ r"""
717
+ Args:
718
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
719
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
720
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
721
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
722
+
723
+ Returns:
724
+
725
+ Example:
726
+
727
+ ```python
728
+ >>> from transformers import AutoTokenizer, LlamaForCausalLM
729
+
730
+ >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
731
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
732
+
733
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
734
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
735
+
736
+ >>> # Generate
737
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
738
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
739
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
740
+ ```"""
741
+
742
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
743
+ output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
744
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
745
+
746
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
747
+ outputs = self.model(
748
+ input_ids=input_ids,
749
+ attention_mask=attention_mask,
750
+ position_ids=position_ids,
751
+ past_key_values=past_key_values,
752
+ inputs_embeds=inputs_embeds,
753
+ use_cache=use_cache,
754
+ output_attentions=output_attentions,
755
+ output_hidden_states=output_hidden_states,
756
+ return_dict=return_dict,
757
+ )
758
+ hidden_states = outputs[0]
759
+ logits = self.lm_head(hidden_states)
760
+
761
+ loss = None
762
+ if labels is not None:
763
+ # Shift so that tokens < n predict n
764
+ shift_logits = logits[..., :-1, :].contiguous()
765
+ shift_labels = labels[..., 1:].contiguous()
766
+ # Flatten the tokens
767
+ loss_fct = CrossEntropyLoss()
768
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
769
+ shift_labels = shift_labels.view(-1)
770
+ # Enable model parallelism
771
+ shift_labels = shift_labels.to(shift_logits.device)
772
+ loss = loss_fct(shift_logits, shift_labels)
773
+
774
+ if not return_dict:
775
+ output = (logits,) + outputs[1:]
776
+ return (loss,) + output if loss is not None else output
777
+
778
+ self.past_key_values = outputs.past_key_values
779
+
780
+ if self.use_kv_cache_head and not self.training:
781
+ if self.kv_cache_head is None:
782
+ self.kv_cache_head = input_ids.shape[1]
783
+ else:
784
+ self.kv_cache_head += input_ids.shape[1]
785
+
786
+ # new_position_ids = torch.ones((1, 1), device=self.position_ids.device) * (self.position_ids[0, -1].item() + 1)
787
+ # self.position_ids = torch.cat((self.position_ids, new_position_ids), dim=1)
788
+ return CausalLMOutputWithPast(
789
+ loss=loss,
790
+ logits=logits,
791
+ past_key_values=outputs.past_key_values,
792
+ hidden_states=outputs.hidden_states,
793
+ attentions=outputs.attentions,
794
+ )
795
+
796
+ def prepare_inputs_for_generation(
797
+ self,
798
+ input_ids,
799
+ past_key_values=None,
800
+ attention_mask=None,
801
+ inputs_embeds=None,
802
+ **kwargs,
803
+ ):
804
+ if self.use_kv_cache_head and not self.training:
805
+ if past_key_values:
806
+ input_ids = input_ids[:, self.kv_cache_head:]
807
+ if inputs_embeds is not None:
808
+ inputs_embeds = inputs_embeds[:, self.kv_cache_head:]
809
+
810
+ position_ids = kwargs.get("position_ids", None)
811
+ if attention_mask is not None and position_ids is None:
812
+ # create position_ids on the fly for batch generation
813
+ position_ids = attention_mask.long().cumsum(-1) - 1
814
+ position_ids.masked_fill_(attention_mask == 0, 1)
815
+ if past_key_values:
816
+ position_ids = position_ids[:, self.kv_cache_head:].unsqueeze(-1)
817
+
818
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
819
+ if inputs_embeds is not None and past_key_values is None:
820
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": input_ids}
821
+ elif past_key_values is not None and input_ids.shape[1] > 1:
822
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": input_ids}
823
+ else:
824
+ model_inputs = {"input_ids": input_ids}
825
+
826
+ attention_mask = None
827
+ else:
828
+ if past_key_values:
829
+ input_ids = input_ids[:, -1:]
830
+
831
+ position_ids = kwargs.get("position_ids", None)
832
+ if attention_mask is not None and position_ids is None:
833
+ # create position_ids on the fly for batch generation
834
+ position_ids = attention_mask.long().cumsum(-1) - 1
835
+ position_ids.masked_fill_(attention_mask == 0, 1)
836
+ if past_key_values:
837
+ position_ids = position_ids[:, -1].unsqueeze(-1)
838
+
839
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
840
+ if inputs_embeds is not None and past_key_values is None:
841
+ model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": input_ids}
842
+ else:
843
+ model_inputs = {"input_ids": input_ids}
844
+ attention_mask = None
845
+
846
+ model_inputs.update({
847
+ "position_ids": position_ids,
848
+ "past_key_values": past_key_values,
849
+ "use_cache": kwargs.get("use_cache"),
850
+ "attention_mask": attention_mask,
851
+ })
852
+ return model_inputs
853
+
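The `kv_cache_head` branch above changes how the prompt is trimmed once a kv cache exists: instead of feeding only the single newest token (the stock behaviour), it drops everything already covered by the cached prefix, so several freshly appended tokens can survive. A tensor-level sketch of the two strategies, with a made-up cache length:

```python
import torch

input_ids = torch.arange(10).unsqueeze(0)   # pretend 10 tokens exist so far

# stock behaviour: only the last token goes back into the model
stock_next = input_ids[:, -1:]

# kv_cache_head behaviour: drop everything the cache already covers
kv_cache_head = 7                            # hypothetical cached prefix length
head_next = input_ids[:, kv_cache_head:]

print(stock_next.shape, head_next.shape)     # torch.Size([1, 1]) torch.Size([1, 3])
```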
854
+ @staticmethod
855
+ def _reorder_cache(past_key_values, beam_idx):
856
+ reordered_past = ()
857
+ for layer_past in past_key_values:
858
+ reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
859
+ return reordered_past
860
+
861
+
862
+ @add_start_docstrings(
863
+ """
864
+ The LLaMa Model transformer with a sequence classification head on top (linear layer).
865
+
866
+ [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
867
+ (e.g. GPT-2) do.
868
+
869
+ Since it does classification on the last token, it needs to know the position of the last token. If a
870
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
871
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
872
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
873
+ each row of the batch).
874
+ """,
875
+ LLAMA_START_DOCSTRING,
876
+ )
877
+ class LlamaForSequenceClassification(LlamaPreTrainedModel):
878
+ _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
879
+
880
+ def __init__(self, config):
881
+ super().__init__(config)
882
+ self.num_labels = config.num_labels
883
+ self.model = LlamaModel(config)
884
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
885
+
886
+ # Initialize weights and apply final processing
887
+ self.post_init()
888
+
889
+ def get_input_embeddings(self):
890
+ return self.model.embed_tokens
891
+
892
+ def set_input_embeddings(self, value):
893
+ self.model.embed_tokens = value
894
+
895
+ @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
896
+ def forward(
897
+ self,
898
+ input_ids: torch.LongTensor = None,
899
+ attention_mask: Optional[torch.Tensor] = None,
900
+ position_ids: Optional[torch.LongTensor] = None,
901
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
902
+ inputs_embeds: Optional[torch.FloatTensor] = None,
903
+ labels: Optional[torch.LongTensor] = None,
904
+ use_cache: Optional[bool] = None,
905
+ output_attentions: Optional[bool] = None,
906
+ output_hidden_states: Optional[bool] = None,
907
+ return_dict: Optional[bool] = None,
908
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
909
+ r"""
910
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
911
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
912
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
913
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
914
+ """
915
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
916
+
917
+ transformer_outputs = self.model(
918
+ input_ids,
919
+ attention_mask=attention_mask,
920
+ position_ids=position_ids,
921
+ past_key_values=past_key_values,
922
+ inputs_embeds=inputs_embeds,
923
+ use_cache=use_cache,
924
+ output_attentions=output_attentions,
925
+ output_hidden_states=output_hidden_states,
926
+ return_dict=return_dict,
927
+ )
928
+ hidden_states = transformer_outputs[0]
929
+ logits = self.score(hidden_states)
930
+
931
+ if input_ids is not None:
932
+ batch_size = input_ids.shape[0]
933
+ else:
934
+ batch_size = inputs_embeds.shape[0]
935
+
936
+ if self.config.pad_token_id is None and batch_size != 1:
937
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
938
+ if self.config.pad_token_id is None:
939
+ sequence_lengths = -1
940
+ else:
941
+ if input_ids is not None:
942
+ sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
943
+ else:
944
+ sequence_lengths = -1
945
+
946
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
947
+
948
+ loss = None
949
+ if labels is not None:
950
+ labels = labels.to(logits.device)
951
+ if self.config.problem_type is None:
952
+ if self.num_labels == 1:
953
+ self.config.problem_type = "regression"
954
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
955
+ self.config.problem_type = "single_label_classification"
956
+ else:
957
+ self.config.problem_type = "multi_label_classification"
958
+
959
+ if self.config.problem_type == "regression":
960
+ loss_fct = MSELoss()
961
+ if self.num_labels == 1:
962
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
963
+ else:
964
+ loss = loss_fct(pooled_logits, labels)
965
+ elif self.config.problem_type == "single_label_classification":
966
+ loss_fct = CrossEntropyLoss()
967
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
968
+ elif self.config.problem_type == "multi_label_classification":
969
+ loss_fct = BCEWithLogitsLoss()
970
+ loss = loss_fct(pooled_logits, labels)
971
+ if not return_dict:
972
+ output = (pooled_logits,) + transformer_outputs[1:]
973
+ return ((loss,) + output) if loss is not None else output
974
+
975
+ return SequenceClassifierOutputWithPast(
976
+ loss=loss,
977
+ logits=pooled_logits,
978
+ past_key_values=transformer_outputs.past_key_values,
979
+ hidden_states=transformer_outputs.hidden_states,
980
+ attentions=transformer_outputs.attentions,
981
+ )
982
+
983
+
984
+ if __name__ == "__main__":
985
+ from transformers import LlamaTokenizer
986
+
987
+ model = LlamaForCausalLM.from_pretrained("luodian/llama-7b-hf", device_map="auto")
988
+ tokenizer = LlamaTokenizer.from_pretrained("luodian/llama-7b-hf")
989
+ prompt = "Hey, are you conscious? Can you talk to me?"
990
+ inputs = tokenizer(prompt, return_tensors="pt")
991
+ generate_ids = model.generate(inputs.input_ids, max_length=30)
992
+ print(tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
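For reference, the loss in `LlamaForCausalLM.forward` above is the standard next-token objective: logits at position t are scored against the token at position t + 1. A minimal sketch of that shift-by-one cross entropy on dummy tensors (toy vocabulary and sequence sizes):

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size, seq_len = 32, 6
logits = torch.randn(1, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (1, seq_len))

shift_logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
shift_labels = labels[..., 1:].contiguous().view(-1)

loss = CrossEntropyLoss()(shift_logits, shift_labels)
print(loss.item())
```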
src/models_clm/models.py ADDED
@@ -0,0 +1,336 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import LlamaForCausalLM, LlamaConfig
4
+ from transformers import LogitsProcessor, LogitsProcessorList
5
+ from .generation import AutoImageTokenGenerationProcessor
6
+ import torch.nn.functional as F
7
+
8
+ BOI_TOKEN = '<img>'
9
+ EOI_TOKEN = '</img>'
10
+ IMG_TOKEN = '<img_{:05d}>'
11
+
12
+
13
+ def cosine_loss(rec, target):
14
+ target = target / target.norm(dim=-1, keepdim=True)
15
+ rec = rec / rec.norm(dim=-1, keepdim=True)
16
+ rec_loss = (1 - (target * rec).sum(-1)).mean()
17
+ return rec_loss
18
+
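`cosine_loss` above is simply one minus the mean cosine similarity between reconstruction and target, so identical directions give a loss of 0. A tiny sanity check on random dummy embeddings (equal to the helper up to epsilon handling):

```python
import torch
import torch.nn.functional as F

rec = torch.randn(2, 64, 4096)
target = torch.randn(2, 64, 4096)

manual = (1 - F.cosine_similarity(rec, target, dim=-1)).mean()
print(manual)                                                # ~1 for unrelated random vectors
print((1 - F.cosine_similarity(rec, rec, dim=-1)).mean())    # ~0 for a perfect reconstruction
```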
19
+
20
+ class ContinuousLVLM(nn.Module):
21
+
22
+ def __init__(self, llm, input_resampler, output_resampler, lm_loss_scale=1.0, rec_loss_scale=1.0) -> None:
23
+ super().__init__()
24
+ self.llm = llm
25
+ self.input_resampler = input_resampler
26
+ self.output_resampler = output_resampler
27
+ self.lm_loss_scale = lm_loss_scale
28
+ self.rec_loss_scale = rec_loss_scale
29
+
30
+ # input_resampler.requires_grad_(False)
31
+ # output_resampler.requires_grad_(False)
32
+
33
+ def forward(self, input_ids, attention_mask, labels, image_embeds, embeds_gen_mask, embeds_cmp_mask, ids_gen_mask,
34
+ ids_cmp_mask, return_recon_image_embeds=False):
35
+
36
+ input_embeds = self.llm.get_input_embeddings()(input_ids) # bz x seq_len x dim, 4 x 160 x 4096
37
+
38
+ bz, sq, dim = input_embeds.shape
39
+
40
+ if image_embeds is not None:
41
+ image_embeds_lm = self.input_resampler(image_embeds) # num_imgs_in_batch x nq x dim, 4 x 64 x 4096
42
+ has_image = True
43
+ else:
44
+ image_embeds = torch.randn(bz, self.output_resampler.num_queries,
45
+ self.output_resampler.embed_dim).to(input_embeds.device,
46
+ dtype=input_embeds.dtype)
47
+ image_embeds_lm = self.input_resampler(image_embeds)
48
+ has_image = False
49
+
50
+ has_image_input = has_image and embeds_cmp_mask.sum().item() > 0
51
+ has_image_output = has_image and embeds_gen_mask.sum().item() > 0
52
+
53
+ if has_image_input:
54
+ input_embeds[ids_cmp_mask] = image_embeds_lm[embeds_cmp_mask].view(-1, dim) # eg, 128 x 4096
55
+ # zero_loss = 0.0
56
+ else:
57
+ min_bz = min(input_embeds.shape[0], image_embeds_lm.shape[0])
58
+ input_embeds[:min_bz, :self.input_resampler.num_queries, :] = input_embeds[:min_bz, :self.input_resampler.num_queries, :] + 0.0 * image_embeds_lm[:min_bz, :, :]
61
+
62
+ output_lm = self.llm(attention_mask=attention_mask,
63
+ inputs_embeds=input_embeds,
64
+ labels=labels,
65
+ output_hidden_states=True,
66
+ return_dict=True)
67
+ lm_loss = output_lm['loss']
68
+
69
+ last_hidden_state = output_lm.hidden_states[-1] # 4 x 160 x 4096
70
+
71
+ if has_image_output:
72
+ target_embeds = image_embeds[embeds_gen_mask] # num_imgs_gen_target x nq_in x dim_in, 2 x 256 x 4096
73
+ num_imgs_for_rec = target_embeds.shape[0]
74
+ output_image_embeds = last_hidden_state[ids_gen_mask].view(num_imgs_for_rec, -1,
75
+ dim) # 128 x 4096 -> 2 x 64 x 4096
76
+
77
+ recon_image_embeds = self.output_resampler(output_image_embeds) # 2 x 256 x 4096
78
+
79
+ rec_loss = cosine_loss(recon_image_embeds, target_embeds)
80
+ else:
81
+ output_image_embeds = torch.randn(bz, self.input_resampler.num_queries,
82
+ self.input_resampler.embed_dim).to(input_embeds.device,
83
+ dtype=input_embeds.dtype)
84
+ recon_image_embeds = self.output_resampler(output_image_embeds)
85
+ target_embeds = torch.randn(bz, self.output_resampler.num_queries,
86
+ self.output_resampler.embed_dim).to(input_embeds.device,
87
+ dtype=input_embeds.dtype)
88
+ rec_loss = cosine_loss(recon_image_embeds, target_embeds) * 0.0
89
+
90
+ total_loss = self.lm_loss_scale * lm_loss + self.rec_loss_scale * rec_loss
91
+
92
+ if return_recon_image_embeds and has_image_output:
93
+ return {'total_loss': total_loss, 'lm_loss': lm_loss, 'rec_loss': rec_loss,
94
+ 'recon_image_embeds': recon_image_embeds}
95
+ else:
96
+ return {'total_loss': total_loss, 'lm_loss': lm_loss, 'rec_loss': rec_loss}
97
+
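The key splice in `forward()` above is the boolean-mask assignment that writes the resampled image embeddings into the placeholder positions of the token sequence. A toy, self-contained sketch of that scatter (real sizes are 64 queries and hidden size 4096; the small numbers here are illustrative):

```python
import torch

bz, seq_len, dim, nq = 2, 10, 8, 3
input_embeds = torch.zeros(bz, seq_len, dim)
image_embeds_lm = torch.randn(bz, nq, dim)       # stand-in for the input_resampler output

ids_cmp_mask = torch.zeros(bz, seq_len, dtype=torch.bool)
ids_cmp_mask[0, 2:5] = True                      # sample 0 holds one image's placeholder slots
embeds_cmp_mask = torch.tensor([True, False])    # only the first image is a comprehension input

input_embeds[ids_cmp_mask] = image_embeds_lm[embeds_cmp_mask].view(-1, dim)
print(input_embeds[0, 2:5].abs().sum() > 0, input_embeds[1].abs().sum() == 0)
```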
98
+ def generate(self,
99
+ tokenizer,
100
+ prompt=None,
101
+ input_ids=None,
102
+ image_embeds=None,
103
+ embeds_cmp_mask=None,
104
+ ids_cmp_mask=None,
105
+ logits_processor=None,
106
+ num_img_gen_tokens=64,
107
+ temperature=0.7,
108
+ num_beams=1,
109
+ max_new_tokens=120,
110
+ top_p=0.5,
111
+ past_key_values=None,
112
+ # position_ids=None,
113
+ dtype=torch.float16,
114
+ device='cuda'):
115
+ if logits_processor is None:
116
+ logits_processor = LogitsProcessorList()
117
+ logits_processor.append(
118
+ AutoImageTokenGenerationProcessor(tokenizer=tokenizer, num_img_gen_tokens=num_img_gen_tokens))
119
+
120
+ if prompt is not None:
121
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
122
+
123
+ if isinstance(input_ids, list):
124
+ input_ids = torch.tensor(input_ids)
125
+
126
+ input_ids = input_ids.to(device=device)
127
+ input_embeds = self.llm.get_input_embeddings()(input_ids)
128
+ bz, sq, dim = input_embeds.shape
129
+
130
+ if image_embeds is not None:
131
+ assert embeds_cmp_mask is not None and ids_cmp_mask is not None
132
+ with torch.no_grad():
133
+ image_embeds_lm = self.input_resampler(image_embeds)
134
+
135
+ input_embeds[ids_cmp_mask] = image_embeds_lm[embeds_cmp_mask].view(-1, dim)
136
+
137
+ generation_config = {
138
+ 'temperature': temperature,
139
+ 'num_beams': num_beams,
140
+ 'max_new_tokens': max_new_tokens,
141
+ 'top_p': top_p,
142
+ 'do_sample': False
143
+ }
144
+
145
+ # generate_ids = self.llm.generate(input_ids=input_ids, **generation_config)
146
+ output = self.llm.generate(input_ids=input_ids,
147
+ inputs_embeds=input_embeds,
148
+ output_hidden_states=True,
149
+ return_dict_in_generate=True,
150
+ logits_processor=logits_processor,
151
+ past_key_values=past_key_values,
152
+ # position_ids=position_ids,
153
+ **generation_config)
154
+ # self.llm.base_model.model.position_ids = self.llm.base_model.model.position_ids[:, :-2]
155
+
156
+ output_past_key_values = self.llm.past_key_values
157
+ generate_ids = output.sequences[0][input_ids.shape[1]:]
158
+ generate_id_list = generate_ids.tolist()
159
+ boi_token_id = tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
160
+ eoi_token_id = tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
161
+
162
+ attn_weights = ()
163
+
164
+ def merge_attn_weights(attn_weights):
165
+ merged_attn_weights = attn_weights[0]
166
+
167
+ # Iterate through the remaining attention weight tensors
168
+ for i, attn_weight in enumerate(attn_weights[1:]):
169
+ merged_attn_weights = F.pad(merged_attn_weights, (0, 1), "constant", float('nan'))
170
+ # Concatenate the expanded tensor to the merged tensor along the kv_len dimension
171
+ merged_attn_weights = torch.cat([merged_attn_weights, attn_weight], dim=1)
172
+
173
+ return merged_attn_weights
174
+
175
+ if output.attentions is not None:
176
+ # for idx in [0, 1, 2, 9, 16, 23, 31]:
177
+ for idx in range(32):
178
+ attn_weights += (
179
+ merge_attn_weights([output.attentions[j][idx] for j in range(len(output.attentions))]),)
180
+
181
+ # skip image tokens when using the multi-turn kv cache
182
+ last_hidden_states = torch.cat([hidden_state[-1] for hidden_state in output.hidden_states], dim=1)
183
+ if past_key_values is None:
184
+ last_hidden_states = last_hidden_states[0, input_ids.shape[1]:, :]
185
+ eoi_indices = torch.where(generate_ids == eoi_token_id)[0].tolist()
186
+ else:
187
+ last_hidden_states = last_hidden_states[0, :, :]
188
+ hidden_len = last_hidden_states.shape[0]
189
+ eoi_indices = torch.where(output.sequences[0][-hidden_len:] == eoi_token_id)[0].tolist()
190
+
191
+ num_gen_imgs = 1 if len(eoi_indices) > 0 else 0
192
+
193
+ text_mask = torch.ones_like(generate_ids, dtype=torch.bool)
194
+ has_img_output = num_gen_imgs > 0
195
+ if has_img_output:
196
+ img_gen_feats = []
197
+ img_gen_feats.append(last_hidden_states[eoi_indices[-1] - num_img_gen_tokens:eoi_indices[-1]])
198
+ text_mask[eoi_indices[-1] - num_img_gen_tokens:eoi_indices[-1]] = False
199
+
200
+ # for eoi_idx in eoi_indices:
201
+ # img_gen_feats.append(last_hidden_states[eoi_idx - num_img_gen_tokens:eoi_idx])
202
+ # text_mask[eoi_idx - num_img_gen_tokens:eoi_idx] = False
203
+
204
+ img_gen_feats = torch.stack(img_gen_feats)
205
+ img_gen_feat = self.output_resampler(img_gen_feats)
206
+ else:
207
+ img_gen_feat = None
208
+
209
+ text_mask[generate_ids == boi_token_id] = False
210
+ # generate_ids = generate_ids[text_mask]
211
+ generate_text = tokenizer.decode(generate_ids, skip_special_tokens=False)
212
+
213
+ return {
214
+ 'text': generate_text,
215
+ 'generate_ids': generate_ids,
216
+ 'has_img_output': has_img_output,
217
+ 'img_gen_feat': img_gen_feat,
218
+ 'num_gen_imgs': num_gen_imgs,
219
+ 'attn_weights': attn_weights,
220
+ 'past_key_values': output_past_key_values
221
+ }
222
+
223
+ @classmethod
224
+ def from_pretrained(cls, llm, input_resampler, output_resampler, pretrained_model_path=None, **kwargs):
225
+ model = cls(llm=llm, input_resampler=input_resampler, output_resampler=output_resampler, **kwargs)
226
+ if pretrained_model_path is not None:
227
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
228
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
229
+ print('agent model, missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
230
+ return model
231
+
232
+
233
+ class SEEDLLaMAAlignGeneration(nn.Module):
234
+
235
+ def __init__(self, llm, output_resampler) -> None:
236
+ super().__init__()
237
+
238
+ self.llm = llm
239
+ self.output_resampler = output_resampler
240
+ # self.rec_loss_scale = rec_loss_scale
241
+
242
+ self.llm.requires_grad_(False)
243
+
244
+ def forward(self, input_ids, attention_mask, labels, image_embeds, embeds_gen_mask, embeds_cmp_mask, ids_gen_mask,
245
+ ids_cmp_mask):
246
+
247
+ input_embeds = self.llm.get_input_embeddings()(input_ids) # bz x seq_len x dim, 4 x 160 x 4096
248
+
249
+ bz, sq, dim = input_embeds.shape
250
+
251
+ output_lm = self.llm(attention_mask=attention_mask,
252
+ inputs_embeds=input_embeds,
253
+ labels=labels,
254
+ output_hidden_states=True,
255
+ return_dict=True)
256
+
257
+ last_hidden_state = output_lm.hidden_states[-1] # 4 x 160 x 4096
258
+
259
+ target_embeds = image_embeds[embeds_gen_mask] # num_imgs_gen_target x nq_in x dim_in, 2 x 256 x 4096
260
+ num_imgs_for_rec = target_embeds.shape[0]
261
+ output_image_embeds = last_hidden_state[ids_gen_mask].view(num_imgs_for_rec, -1,
262
+ dim) # 128 x 4096 -> 2 x 64 x 4096
263
+
264
+ recon_image_embeds = self.output_resampler(output_image_embeds) # 2 x 256 x 4096
265
+
266
+ rec_loss = cosine_loss(recon_image_embeds, target_embeds)
267
+
268
+ return {'total_loss': rec_loss, 'rec_loss': rec_loss}
269
+
270
+ @classmethod
271
+ def from_pretrained(cls, llm, output_resampler, pretrained_model_path=None, **kwargs):
272
+ model = cls(llm=llm, output_resampler=output_resampler, **kwargs)
273
+ if pretrained_model_path is not None:
274
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
275
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
276
+ print('agent model, missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
277
+ return model
278
+
279
+ def generate(self,
280
+ tokenizer,
281
+ input_ids=None,
282
+ temperature=0.7,
283
+ num_beams=1,
284
+ max_new_tokens=120,
285
+ num_img_gen_tokens=64,
286
+ top_p=0.5,
287
+ dtype=torch.float16,
288
+ device='cuda'):
289
+ input_ids = input_ids.to(device=device)
290
+ input_embeds = self.llm.get_input_embeddings()(input_ids) # bz x seq_len x dim, 4 x 160 x 4096
291
+
292
+ generation_config = {
293
+ 'temperature': temperature,
294
+ 'num_beams': num_beams,
295
+ 'max_new_tokens': max_new_tokens,
296
+ 'top_p': top_p,
297
+ 'do_sample': False
298
+ }
299
+ output = self.llm.generate(input_ids=input_ids,
300
+ inputs_embeds=input_embeds,
301
+ output_hidden_states=True,
302
+ return_dict_in_generate=True,
303
+ **generation_config)
304
+
305
+ generate_ids = output.sequences[0][input_ids.shape[1]:]
306
+ generate_id_list = generate_ids.tolist()
307
+ # boi_token_id = tokenizer.encode(BOI_TOKEN, add_special_tokens=False)[0]
308
+ eoi_token_id = tokenizer.encode(EOI_TOKEN, add_special_tokens=False)[0]
309
+
310
+ # print('output ids: ', generate_ids, generate_ids.shape)
311
+ # last_hidden_states = output.hidden_states[-1]
312
+
313
+ last_hidden_states = torch.cat([hidden_state[-1] for hidden_state in output.hidden_states],
314
+ dim=1)[:1, input_ids.shape[1]:, :]
315
+
316
+ has_img_output = eoi_token_id in generate_id_list
317
+
318
+ if has_img_output:
319
+ # print(boi_token_id, generate_id_list, generate_id_list.index(boi_token_id))
320
+ # boi_idx = generate_id_list.index(boi_token_id)
321
+ eoi_idx = generate_id_list.index(eoi_token_id)
322
+ print(len(generate_id_list), generate_id_list, eoi_idx)
323
+ # print(generate_id_list[boi_idx + 1:boi_idx + 1 + num_img_gen_tokens])
324
+
325
+ # img_gen_feat = last_hidden_states[:, eoi_idx - num_img_gen_tokens:eoi_idx]
326
+ img_gen_feat = last_hidden_states[:, 0:eoi_idx]
327
+ print('img_gen_feat', img_gen_feat.shape, last_hidden_states.shape, num_img_gen_tokens)
328
+ img_gen_feat = self.output_resampler(img_gen_feat)
329
+
330
+ else:
331
+ img_gen_feat = None
332
+
333
+ generate_text = tokenizer.decode(generate_ids, skip_special_tokens=False)
334
+ # print('output keys: ', output.keys())
335
+
336
+ return {'text': generate_text, 'has_img_output': has_img_output, 'img_gen_feat': img_gen_feat}
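Both `generate()` methods above rely on the `LogitsProcessor` hook in `transformers` to drive image-token emission. The sketch below is NOT the repo's `AutoImageTokenGenerationProcessor`; it is a minimal processor with the same interface that forces a fixed token id once a trigger id (e.g. a BOI token) appears, just to illustrate the mechanism:

```python
import torch
from transformers import LogitsProcessor

class ForceTokenAfterTrigger(LogitsProcessor):
    def __init__(self, trigger_id: int, forced_id: int):
        self.trigger_id = trigger_id
        self.forced_id = forced_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # if the last generated token is the trigger, mask everything except forced_id
        if input_ids.shape[1] > 0 and (input_ids[:, -1] == self.trigger_id).all():
            mask = torch.full_like(scores, float("-inf"))
            mask[:, self.forced_id] = 0.0
            return scores + mask
        return scores
```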
src/models_clm/peft_models.py ADDED
@@ -0,0 +1,104 @@
1
+ from peft import (
2
+ LoraConfig,
3
+ PeftModel,
4
+ LoraModel,
5
+ PeftModelForCausalLM,
6
+ get_peft_model,
7
+ get_peft_model_state_dict,
8
+ prepare_model_for_int8_training,
9
+ set_peft_model_state_dict,
10
+ )
11
+ from peft.peft_model import PEFT_TYPE_TO_MODEL_MAPPING
12
+ from peft.utils import _set_trainable, PromptLearningConfig
13
+ from peft.utils import PeftConfig
14
+
15
+ import torch
16
+ from transformers import LlamaForCausalLM
17
+ from omegaconf import DictConfig
18
+ import hydra
19
+
20
+
21
+ def get_peft_model_with_resize_embedding(
22
+ model,
23
+ peft_config=None,
24
+ model_id=None,
25
+ vocab_size=None,
26
+ torch_dtype='bf16'
27
+ ):
28
+ if torch_dtype == 'bf16' or torch_dtype == 'bfloat16':
29
+ torch_dtype = torch.bfloat16
30
+ elif torch_dtype == 'fp16' or torch_dtype == 'float16':
31
+ torch_dtype = torch.float16
32
+ else:
33
+ torch_dtype = torch.float32
34
+
35
+ if isinstance(model, DictConfig):
36
+ model = hydra.utils.instantiate(model, torch_dtype=torch_dtype)
37
+
38
+ # model.gradient_checkpointing_enable()
39
+
40
+ assert (peft_config is None) + (model_id is None) == 1
41
+
42
+ # print(type(peft_config.target_modules))
43
+ if vocab_size is not None:
44
+ print(f'Length of tokenizer and resize embedding: {vocab_size}')
45
+ model.resize_token_embeddings(vocab_size)
46
+
47
+ if peft_config is not None:
48
+ print('peft config: ', peft_config)
49
+ peft_model = get_peft_model(model=model, peft_config=peft_config)
50
+ peft_model.get_input_embeddings().requires_grad_(True)
51
+ peft_model.get_output_embeddings().requires_grad_(True)
52
+
53
+ peft_model.print_trainable_parameters()
54
+
55
+ # param_count = 0
56
+ # if peft_model.modules_to_save is not None:
57
+ # for name, param in peft_model.named_parameters():
58
+ # if any(module_name in name for module_name in peft_model.modules_to_save):
59
+ # param_count += param.numel()
60
+ # print(name, param.numel())
61
+
62
+ else:
63
+ peft_model = PeftModel.from_pretrained(model=model, model_id=model_id)
64
+
65
+ return peft_model
66
+
67
+
68
+ def get_model_with_resize_embedding(model, vocab_size=None, torch_dtype='bf16'):
69
+ if torch_dtype == 'bf16' or torch_dtype == 'bfloat16':
70
+ torch_dtype = torch.bfloat16
71
+ elif torch_dtype == 'fp16' or torch_dtype == 'float16':
72
+ torch_dtype = torch.float16
73
+ else:
74
+ torch_dtype = torch.float32
75
+
76
+ if isinstance(model, DictConfig):
77
+ model = hydra.utils.instantiate(model, torch_dtype=torch_dtype)
78
+
79
+ model.requires_grad_(False)
80
+ if vocab_size is not None:
81
+ print(f'Length of tokenizer and resize embedding: {vocab_size}')
82
+ model.resize_token_embeddings(vocab_size)
83
+ model.get_input_embeddings().requires_grad_(True)
84
+ model.get_output_embeddings().requires_grad_(True)
85
+
86
+ return model
87
+
88
+
89
+ def get_full_model_with_resize_embedding(model, vocab_size=None, torch_dtype='bf16'):
90
+ if torch_dtype == 'bf16' or torch_dtype == 'bfloat16':
91
+ torch_dtype = torch.bfloat16
92
+ elif torch_dtype == 'fp16' or torch_dtype == 'float16':
93
+ torch_dtype = torch.float16
94
+ else:
95
+ torch_dtype = torch.float32
96
+
97
+ if isinstance(model, DictConfig):
98
+ model = hydra.utils.instantiate(model, torch_dtype=torch_dtype)
99
+
100
+ if vocab_size is not None:
101
+ print(f'Length of tokenizer and resize embedding: {vocab_size}')
102
+ model.resize_token_embeddings(vocab_size)
103
+
104
+ return model
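A hypothetical usage sketch for `get_peft_model_with_resize_embedding` above: wrap a causal LM in LoRA, grow its embedding table for the added image/patch tokens, and keep the resized embedding matrices trainable. The checkpoint path, vocabulary size, LoRA rank and target modules are placeholders, not values taken from this repo's configs.

```python
import torch
from peft import LoraConfig, get_peft_model
from transformers import LlamaForCausalLM

# illustrative LoRA config; ranks, alpha and target modules are assumptions
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

model = LlamaForCausalLM.from_pretrained("PATH_TO_LLAMA_CHECKPOINT", torch_dtype=torch.bfloat16)
model.resize_token_embeddings(32330)      # placeholder: base vocab plus the added special tokens
peft_model = get_peft_model(model, peft_config)
# keep the (resized) embedding tables trainable, as the helper above does
peft_model.get_input_embeddings().requires_grad_(True)
peft_model.get_output_embeddings().requires_grad_(True)
peft_model.print_trainable_parameters()
```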
src/models_ipa/__init__.py ADDED
@@ -0,0 +1 @@
1
+
src/models_ipa/adapter_modules.py ADDED
@@ -0,0 +1,920 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import itertools
4
+ import torch.nn.functional as F
5
+ from typing import List
6
+ from diffusers import (
7
+ StableDiffusionPipeline,
8
+ StableDiffusionXLPipeline,
9
+ StableDiffusionXLInstructPix2PixPipeline,
10
+ StableDiffusionInstructPix2PixPipeline,
11
+ )
12
+ from PIL import Image
13
+ from .ipa_utils import is_torch2_available
14
+
15
+ if is_torch2_available():
16
+ from .attention_processor import IPAttnProcessor2_0 as IPAttnProcessor, AttnProcessor2_0 as AttnProcessor
17
+ else:
18
+ from .attention_processor import IPAttnProcessor, AttnProcessor
19
+
20
+ from diffusers.loaders import LoraLoaderMixin
21
+ from diffusers.models.lora import LoRALinearLayer
22
+ from diffusers.models.unet_2d_blocks import DownBlock2D
23
+
24
+
25
+ # from .pipeline_stable_diffusion_xl_t2i_edit import StableDiffusionXLText2ImageAndEditPipeline
26
+ # from .pipeline_stable_diffusion_t2i_edit import StableDiffusionText2ImageAndEditPipeline
27
+
28
+
29
+ class IPAdapterSD(nn.Module):
30
+
31
+ def __init__(self, unet, resampler) -> None:
32
+ super().__init__()
33
+ self.unet = unet
34
+ self.resampler = resampler
35
+ self.set_ip_adapter()
36
+ self.set_trainable()
37
+
38
+ def set_ip_adapter(self):
39
+ attn_procs = {}
40
+ unet_sd = self.unet.state_dict()
41
+ for name in self.unet.attn_processors.keys():
42
+ cross_attention_dim = None if name.endswith("attn1.processor") else self.unet.config.cross_attention_dim
43
+ if name.startswith("mid_block"):
44
+ hidden_size = self.unet.config.block_out_channels[-1]
45
+ elif name.startswith("up_blocks"):
46
+ block_id = int(name[len("up_blocks.")])
47
+ hidden_size = list(reversed(self.unet.config.block_out_channels))[block_id]
48
+ elif name.startswith("down_blocks"):
49
+ block_id = int(name[len("down_blocks.")])
50
+ hidden_size = self.unet.config.block_out_channels[block_id]
51
+ if cross_attention_dim is None:
52
+ attn_procs[name] = AttnProcessor()
53
+ else:
54
+ layer_name = name.split(".processor")[0]
55
+ weights = {
56
+ "to_k_ip.weight": unet_sd[layer_name + ".to_k.weight"],
57
+ "to_v_ip.weight": unet_sd[layer_name + ".to_v.weight"],
58
+ }
59
+ attn_procs[name] = IPAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
60
+ attn_procs[name].load_state_dict(weights)
61
+ self.unet.set_attn_processor(attn_procs)
62
+ self.adapter = torch.nn.ModuleList(self.unet.attn_processors.values())
63
+
64
+ def set_trainable(self):
65
+ self.unet.requires_grad_(False)
66
+ self.resampler.requires_grad_(True)
67
+ self.adapter.requires_grad_(True)
68
+
69
+ def params_to_opt(self):
70
+ return itertools.chain(self.resampler.parameters(), self.adapter.parameters())
71
+
72
+ def forward(self, noisy_latents, timesteps, image_embeds, text_embeds, noise):
73
+
74
+ image_embeds = self.resampler(image_embeds)
75
+ # image_embeds = image_embeds.to(dtype=text_embeds.dtype)
76
+
77
+ text_embeds = torch.cat([text_embeds, image_embeds], dim=1)
78
+ # Predict the noise residual and compute loss
79
+ noise_pred = self.unet(noisy_latents, timesteps, text_embeds).sample
80
+
81
+ # if noise is not None:
82
+ loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
83
+ # else:
84
+ # loss = torch.tensor(0.0, device=noisy_latents)
85
+
86
+ return {'total_loss': loss, 'noise_pred': noise_pred}
87
+
88
+ def encode_image_embeds(self, image_embeds):
89
+ dtype = image_embeds.dtype
90
+ image_embeds = self.resampler(image_embeds)
91
+ image_embeds = image_embeds.to(dtype=dtype)
92
+ return image_embeds
93
+
94
+ @classmethod
95
+ def from_pretrained(cls,
96
+ unet,
97
+ resampler,
98
+ pretrained_model_path=None,
99
+ pretrained_resampler_path=None,
100
+ pretrained_adapter_path=None):
101
+ model = cls(unet=unet, resampler=resampler)
102
+ if pretrained_model_path is not None:
103
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
104
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
105
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
106
+ if pretrained_resampler_path is not None:
107
+ ckpt = torch.load(pretrained_resampler_path, map_location='cpu')
108
+ missing, unexpected = model.resampler.load_state_dict(ckpt, strict=True)
109
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
110
+ if pretrained_adapter_path is not None:
111
+ ckpt = torch.load(pretrained_adapter_path, map_location='cpu')
112
+ missing, unexpected = model.adapter.load_state_dict(ckpt, strict=True)
113
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
114
+ return model
115
+
116
+ @classmethod
117
+ def from_pretrained_legacy(cls, unet, resampler, pretrained_model_path=None):
118
+ model = cls(unet=unet, resampler=resampler)
119
+ if pretrained_model_path is not None:
120
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
121
+ ckpt_image_proj = {}
122
+ ckpt_ip_layers = {}
123
+
124
+ for key, value in ckpt.items():
125
+ if key.startswith('image_proj_model'):
126
+ new_key = key.replace('image_proj_model.', '')
127
+ ckpt_image_proj[new_key] = value
128
+ elif key.startswith('adapter_modules.'):
129
+ new_key = key.replace('adapter_modules.', '')
130
+ ckpt_ip_layers[new_key] = value
131
+
132
+ missing, unexpected = model.resampler.load_state_dict(ckpt_image_proj, strict=True)
133
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
134
+ missing, unexpected = model.adapter.load_state_dict(ckpt_ip_layers, strict=True)
135
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
136
+
137
+ return model
138
+
139
+
140
+ class IPAdapterSDPipe(nn.Module):
141
+
142
+ def __init__(
143
+ self,
144
+ ip_adapter,
145
+ discrete_model,
146
+ vae,
147
+ visual_encoder,
148
+ text_encoder,
149
+ tokenizer,
150
+ scheduler,
151
+ image_transform,
152
+ device,
153
+ dtype,
154
+ ) -> None:
155
+ super().__init__()
156
+
157
+ self.ip_adapter = ip_adapter
158
+ self.vae = vae
159
+ self.visual_encoder = visual_encoder
160
+ self.text_encoder = text_encoder
161
+ self.tokenizer = tokenizer
162
+ self.scheduler = scheduler
163
+ self.image_transform = image_transform
164
+ self.discrete_model = discrete_model
165
+ self.device = device
166
+ self.dtype = dtype
167
+
168
+ self.sd_pipe = StableDiffusionPipeline(vae=vae,
169
+ text_encoder=text_encoder,
170
+ tokenizer=tokenizer,
171
+ unet=ip_adapter.unet,
172
+ scheduler=scheduler,
173
+ safety_checker=None,
174
+ feature_extractor=None,
175
+ requires_safety_checker=False)
176
+
177
+ def set_scale(self, scale):
178
+ for attn_processor in self.sd_pipe.unet.attn_processors.values():
179
+ if isinstance(attn_processor, IPAttnProcessor):
180
+ attn_processor.scale = scale
181
+
182
+ @torch.inference_mode()
183
+ def get_image_embeds(self, image_pil=None, image_tensor=None, return_negative=True):
184
+ assert int(image_pil is not None) + int(image_tensor is not None) == 1
185
+ if image_pil is not None:
186
+ image_tensor = self.image_transform(image_pil).unsqueeze(0).to(self.device, dtype=self.dtype)
187
+ if return_negative:
188
+ image_tensor_neg = torch.zeros_like(image_tensor)
189
+ image_tensor = torch.cat([image_tensor, image_tensor_neg], dim=0)
190
+ with torch.cuda.amp.autocast(dtype=self.dtype):
191
+ image_embeds = self.visual_encoder(image_tensor)
192
+ image_embeds = self.discrete_model.encode_image_embeds(image_embeds)
193
+ image_embeds = self.ip_adapter.encode_image_embeds(image_embeds)
194
+
195
+ if return_negative:
196
+ # bz = image_embeds.shape[0]
197
+ # image_embeds_neg = image_embeds[bz // 2:]
198
+ # image_embeds = image_embeds[0:bz // 2]
199
+ image_embeds, image_embeds_neg = image_embeds.chunk(2)
200
+ else:
201
+ image_embeds_neg = None
202
+
203
+ return image_embeds, image_embeds_neg
204
+
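The negative branch in `get_image_embeds` above is the classifier-free-guidance trick: an all-zero "image" is run through the same encoder stack, and `chunk(2)` splits the batch back into conditional and unconditional embeddings. A minimal tensor-level sketch (the flatten stands in for the encoder/resampler):

```python
import torch

image_tensor = torch.randn(1, 3, 224, 224)
batch = torch.cat([image_tensor, torch.zeros_like(image_tensor)], dim=0)  # [cond, uncond]

embeds = batch.flatten(1)            # stand-in for visual_encoder + resampler
cond, uncond = embeds.chunk(2)
print(cond.shape, uncond.shape)      # torch.Size([1, 150528]) twice
```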
205
+ def generate(self,
206
+ image_pil=None,
207
+ image_tensor=None,
208
+ prompt=None,
209
+ negative_prompt=None,
210
+ scale=1.0,
211
+ num_samples=1,
212
+ seed=42,
213
+ guidance_scale=7.5,
214
+ num_inference_steps=30,
215
+ **kwargs):
216
+ self.set_scale(scale)
217
+ assert int(image_pil is not None) + int(image_tensor is not None) == 1
218
+
219
+ if image_pil is not None:
220
+ assert isinstance(image_pil, Image.Image)
221
+ num_prompts = 1
222
+ else:
223
+ num_prompts = image_tensor.shape[0]
224
+
225
+ if prompt is None:
226
+ # prompt = "best quality, high quality"
227
+ prompt = ""
228
+ if negative_prompt is None:
229
+ negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
230
+
231
+ if not isinstance(prompt, List):
232
+ prompt = [prompt] * num_prompts
233
+ if not isinstance(negative_prompt, List):
234
+ negative_prompt = [negative_prompt] * num_prompts
235
+
236
+ image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(
237
+ image_pil=image_pil,
238
+ image_tensor=image_tensor,
239
+ return_negative=True,
240
+ )
241
+ bs_embed, seq_len, _ = image_prompt_embeds.shape
242
+ image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
243
+ image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
244
+ uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
245
+ uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
246
+
247
+ with torch.inference_mode():
248
+ prompt_embeds, negative_prompt_embeds = self.sd_pipe.encode_prompt(
249
+ prompt,
250
+ device=self.device,
251
+ num_images_per_prompt=num_samples,
252
+ do_classifier_free_guidance=True,
253
+ negative_prompt=negative_prompt,
254
+ )
255
+
256
+ prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
257
+ negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
258
+
259
+ generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
260
+ images = self.sd_pipe(
261
+ prompt_embeds=prompt_embeds,
262
+ negative_prompt_embeds=negative_prompt_embeds,
263
+ guidance_scale=guidance_scale,
264
+ num_inference_steps=num_inference_steps,
265
+ generator=generator,
266
+ **kwargs,
267
+ ).images
268
+
269
+ return images
270
+
271
+
272
+ def compute_time_ids(original_size, crops_coords_top_left, target_resolution):
273
+ # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids
274
+ target_size = (target_resolution, target_resolution)
275
+ add_time_ids = list(original_size + crops_coords_top_left + target_size)
276
+ add_time_ids = torch.tensor([add_time_ids])
277
+ # add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype)
278
+ return add_time_ids
279
+
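For context, `compute_time_ids` above builds SDXL's micro-conditioning vector (original height, original width, crop top, crop left, target height, target width) as a `[1, 6]` tensor. A small usage sketch with illustrative sizes, calling the helper defined just above:

```python
original_size = (1024, 768)          # size of the source image before resizing (example values)
crops_coords_top_left = (0, 0)       # no cropping
target_resolution = 1024

time_ids = compute_time_ids(original_size, crops_coords_top_left, target_resolution)
print(time_ids)                      # tensor([[1024,  768,    0,    0, 1024, 1024]])
```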
280
+
281
+ class SDXLAdapter(nn.Module):
282
+
283
+ def __init__(self, unet, resampler, full_ft=False) -> None:
284
+ super().__init__()
285
+ self.unet = unet
286
+ self.resampler = resampler
287
+ self.full_ft = full_ft
288
+ self.set_trainable_v2()
289
+ # self.set_adapter()
290
+
291
+ # self.set_trainable()
292
+
293
+ # def set_adapter(self):
294
+
295
+ # adapter = []
296
+ # for name, module in self.unet.named_modules():
297
+ # if name.endswith('to_k') or name.endswith('to_v'):
298
+ # if module is not None:
299
+ # adapter.append(module)
300
+
301
+ # self.adapter = torch.nn.ModuleList(adapter)
302
+ # print(f'adapter: {self.adapter}')
303
+
304
+ # def set_trainable(self):
305
+ # self.unet.requires_grad_(False)
306
+ # self.resampler.requires_grad_(True)
307
+ # self.adapter.requires_grad_(True)
308
+
309
+ def set_trainable_v2(self):
310
+ self.resampler.requires_grad_(True)
311
+ adapter_parameters = []
312
+ if self.full_ft:
313
+ self.unet.requires_grad_(True)
314
+ adapter_parameters.extend(self.unet.parameters())
315
+ else:
316
+ self.unet.requires_grad_(False)
317
+ for name, module in self.unet.named_modules():
318
+ if name.endswith('to_k') or name.endswith('to_v'):
319
+ if module is not None:
320
+ adapter_parameters.extend(module.parameters())
321
+ self.adapter_parameters = adapter_parameters
322
+ for param in self.adapter_parameters:
323
+ param.requires_grad_(True)
324
+
325
+ # def params_to_opt(self):
326
+ # return itertools.chain(self.resampler.parameters(), self.adapter.parameters())
327
+ def params_to_opt(self):
328
+ return itertools.chain(self.resampler.parameters(), self.adapter_parameters)
329
+
330
+ def forward(self, noisy_latents, timesteps, image_embeds, text_embeds, noise, time_ids):
331
+
332
+ image_embeds, pooled_image_embeds = self.resampler(image_embeds)
333
+
334
+ unet_added_conditions = {"time_ids": time_ids, 'text_embeds': pooled_image_embeds}
335
+
336
+ noise_pred = self.unet(noisy_latents, timesteps, image_embeds, added_cond_kwargs=unet_added_conditions).sample
337
+
338
+ # if noise is not None:
339
+ loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
340
+ # else:
341
+ # loss = torch.tensor(0.0, device=noisy_latents)
342
+
343
+ return {'total_loss': loss, 'noise_pred': noise_pred}
344
+
345
+ def encode_image_embeds(self, image_embeds):
346
+ image_embeds, pooled_image_embeds = self.resampler(image_embeds)
347
+
348
+ return image_embeds, pooled_image_embeds
349
+
350
+ @classmethod
351
+ def from_pretrained(cls, unet, resampler, pretrained_model_path=None, **kwargs):
352
+ model = cls(unet=unet, resampler=resampler, **kwargs)
353
+ if pretrained_model_path is not None:
354
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
355
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
356
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
357
+ return model
358
+
359
+ def init_pipe(self,
360
+ vae,
361
+ scheduler,
362
+ visual_encoder,
363
+ image_transform,
364
+ discrete_model=None,
365
+ dtype=torch.float16,
366
+ device='cuda'):
367
+ self.device = device
368
+ self.dtype = dtype
369
+ sdxl_pipe = StableDiffusionXLPipeline(tokenizer=None,
370
+ tokenizer_2=None,
371
+ text_encoder=None,
372
+ text_encoder_2=None,
373
+ vae=vae,
374
+ unet=self.unet,
375
+ scheduler=scheduler)
376
+
377
+ self.sdxl_pipe = sdxl_pipe # .to(self.device, dtype=self.dtype)
378
+ # print(sdxl_pipe.text_encoder_2, sdxl_pipe.text_encoder)
379
+
380
+ self.visual_encoder = visual_encoder.to(self.device, dtype=self.dtype)
381
+ if discrete_model is not None:
382
+ self.discrete_model = discrete_model.to(self.device, dtype=self.dtype)
383
+ else:
384
+ self.discrete_model = None
385
+ self.image_transform = image_transform
386
+
387
+ @torch.inference_mode()
388
+ def get_image_embeds(self,
389
+ image_pil=None,
390
+ image_tensor=None,
391
+ image_embeds=None,
392
+ return_negative=True,
393
+ image_size=448
394
+ ):
395
+ assert int(image_pil is not None) + int(image_tensor is not None) + int(image_embeds is not None) == 1
396
+
397
+ if image_pil is not None:
398
+ image_tensor = self.image_transform(image_pil).unsqueeze(0).to(self.device, dtype=self.dtype)
399
+
400
+ if image_tensor is not None:
401
+ if return_negative:
402
+ image_tensor_neg = torch.zeros_like(image_tensor)
403
+ image_tensor = torch.cat([image_tensor, image_tensor_neg], dim=0)
404
+
405
+ image_embeds = self.visual_encoder(image_tensor)
406
+ elif return_negative:
407
+ image_tensor_neg = torch.zeros(
408
+ 1, 3,
409
+ image_size, image_size
410
+ ).to(
411
+ image_embeds.device, dtype=image_embeds.dtype
412
+ )
413
+ image_embeds_neg = self.visual_encoder(image_tensor_neg)
414
+ image_embeds = torch.cat([image_embeds, image_embeds_neg], dim=0)
415
+
416
+ if self.discrete_model is not None:
417
+ image_embeds = self.discrete_model.encode_image_embeds(image_embeds)
418
+ image_embeds, pooled_image_embeds = self.encode_image_embeds(image_embeds)
419
+
420
+ if return_negative:
421
+ image_embeds, image_embeds_neg = image_embeds.chunk(2)
422
+ pooled_image_embeds, pooled_image_embeds_neg = pooled_image_embeds.chunk(2)
423
+
424
+ else:
425
+ image_embeds_neg = None
426
+ pooled_image_embeds_neg = None
427
+
428
+ return image_embeds, image_embeds_neg, pooled_image_embeds, pooled_image_embeds_neg
429
+
430
+ def generate(self,
431
+ image_pil=None,
432
+ image_tensor=None,
433
+ image_embeds=None,
434
+ seed=42,
435
+ height=1024,
436
+ width=1024,
437
+ guidance_scale=7.5,
438
+ num_inference_steps=30,
439
+ input_image_size=448,
440
+ **kwargs):
441
+ if image_pil is not None:
442
+ assert isinstance(image_pil, Image.Image)
443
+
444
+ image_prompt_embeds, uncond_image_prompt_embeds, pooled_image_prompt_embeds, \
445
+ pooled_uncond_image_prompt_embeds = self.get_image_embeds(
446
+ image_pil=image_pil,
447
+ image_tensor=image_tensor,
448
+ image_embeds=image_embeds,
449
+ return_negative=True,
450
+ image_size=input_image_size,
451
+ )
452
+ # print(image_prompt_embeds.shape, pooled_image_prompt_embeds.shape)
453
+ generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
454
+
455
+ images = self.sdxl_pipe(
456
+ prompt_embeds=image_prompt_embeds,
457
+ negative_prompt_embeds=uncond_image_prompt_embeds,
458
+ pooled_prompt_embeds=pooled_image_prompt_embeds,
459
+ negative_pooled_prompt_embeds=pooled_uncond_image_prompt_embeds,
460
+ guidance_scale=guidance_scale,
461
+ num_inference_steps=num_inference_steps,
462
+ generator=generator,
463
+ height=height,
464
+ width=width,
465
+ **kwargs,
466
+ ).images
467
+
468
+ return images
469
+
470
+
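For orientation, here is a minimal, hypothetical inference sketch for the SDXLAdapter defined above. The component objects (unet, resampler, vae, scheduler, visual_encoder, image_transform) are assumed to have been built elsewhere, e.g. from the repository's hydra configs, and the checkpoint path is a placeholder rather than a real file in this commit.

import torch
from PIL import Image

# `unet`, `resampler`, `vae`, `scheduler`, `visual_encoder` and `image_transform`
# are assumed to exist already; 'pretrained/adapter.bin' is a placeholder path.
adapter = SDXLAdapter.from_pretrained(unet=unet, resampler=resampler,
                                      pretrained_model_path='pretrained/adapter.bin')
adapter = adapter.to('cuda', dtype=torch.float16).eval()
vae = vae.to('cuda', dtype=torch.float16)
adapter.init_pipe(vae=vae, scheduler=scheduler, visual_encoder=visual_encoder,
                  image_transform=image_transform, dtype=torch.float16, device='cuda')

# Generate an SDXL image conditioned on a reference image instead of a text prompt.
images = adapter.generate(image_pil=Image.open('reference.jpg').convert('RGB'),
                          seed=42, guidance_scale=7.5, num_inference_steps=30)
images[0].save('generated.jpg')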
471
+ class SDXLText2ImageAndEditAdapter(nn.Module):
472
+
473
+ def __init__(self, unet, resampler, lora_rank=16, fully_ft=False) -> None:
474
+ super().__init__()
475
+
476
+ self.unet = unet
477
+ self.resampler = resampler
478
+ self.lora_rank = lora_rank
479
+
480
+ if fully_ft:
481
+ self.set_fully_trainable()
482
+ else:
483
+ self.set_adapter()
484
+
485
+ def set_adapter(self):
486
+ self.unet.requires_grad_(False)
487
+ adapter_parameters = []
488
+
489
+ in_channels = 8
490
+ out_channels = self.unet.conv_in.out_channels
491
+ self.unet.register_to_config(in_channels=in_channels)
492
+
493
+ with torch.no_grad():
494
+ new_conv_in = nn.Conv2d(in_channels, out_channels, self.unet.conv_in.kernel_size, self.unet.conv_in.stride,
495
+ self.unet.conv_in.padding)
496
+
497
+ new_conv_in.weight.zero_()
498
+ new_conv_in.weight[:, :4, :, :].copy_(self.unet.conv_in.weight)
499
+ self.unet.conv_in = new_conv_in
500
+ self.unet.conv_in.requires_grad_(True)
501
+ print('Make conv_in trainable.')
502
+ adapter_parameters.extend(self.unet.conv_in.parameters())
503
+
504
+ for name, module in self.unet.named_modules():
505
+ if isinstance(module, DownBlock2D):
506
+ module.requires_grad_(True)
507
+ adapter_parameters.extend(module.parameters())
508
+ print('Make DownBlock2D trainable.')
509
+
510
+ for attn_processor_name, attn_processor in self.unet.attn_processors.items():
511
+ # Parse the attention module.
512
+ attn_module = self.unet
513
+ for n in attn_processor_name.split(".")[:-1]:
514
+ attn_module = getattr(attn_module, n)
515
+
516
+ # Set the `lora_layer` attribute of the attention-related matrices.
517
+ attn_module.to_q.set_lora_layer(
518
+ LoRALinearLayer(in_features=attn_module.to_q.in_features,
519
+ out_features=attn_module.to_q.out_features,
520
+ rank=self.lora_rank))
521
+ # attn_module.to_k.set_lora_layer(
522
+ # LoRALinearLayer(in_features=attn_module.to_k.in_features,
523
+ # out_features=attn_module.to_k.out_features,
524
+ # rank=self.lora_rank))
525
+ # attn_module.to_v.set_lora_layer(
526
+ # LoRALinearLayer(in_features=attn_module.to_v.in_features,
527
+ # out_features=attn_module.to_v.out_features,
528
+ # rank=self.lora_rank))
529
+ attn_module.to_out[0].set_lora_layer(
530
+ LoRALinearLayer(
531
+ in_features=attn_module.to_out[0].in_features,
532
+ out_features=attn_module.to_out[0].out_features,
533
+ rank=self.lora_rank,
534
+ ))
535
+
536
+ attn_module.to_k.requires_grad_(True)
537
+ attn_module.to_v.requires_grad_(True)
538
+
539
+ adapter_parameters.extend(attn_module.to_q.lora_layer.parameters())
540
+ adapter_parameters.extend(attn_module.to_k.parameters())
541
+ adapter_parameters.extend(attn_module.to_v.parameters())
542
+ adapter_parameters.extend(attn_module.to_out[0].lora_layer.parameters())
543
+
544
+ self.adapter_parameters = adapter_parameters
545
+
546
+ def set_fully_trainable(self):
547
+
548
+ in_channels = 8
549
+ out_channels = self.unet.conv_in.out_channels
550
+ self.unet.register_to_config(in_channels=in_channels)
551
+ with torch.no_grad():
552
+ new_conv_in = nn.Conv2d(in_channels, out_channels, self.unet.conv_in.kernel_size, self.unet.conv_in.stride,
553
+ self.unet.conv_in.padding)
554
+
555
+ new_conv_in.weight.zero_()
556
+ new_conv_in.weight[:, :4, :, :].copy_(self.unet.conv_in.weight)
557
+ self.unet.conv_in = new_conv_in
558
+
559
+ self.unet.requires_grad_(True)
560
+ self.adapter_parameters = self.unet.parameters()
561
+
562
+ def params_to_opt(self):
563
+ return itertools.chain(self.resampler.parameters(), self.adapter_parameters)
564
+
565
+ def forward(self, noisy_latents, timesteps, image_embeds, text_embeds, noise, time_ids, pooled_text_embeds=None):
566
+
567
+ text_embeds, pooled_text_embeds = self.resampler(text_embeds, pooled_text_embeds=pooled_text_embeds)
568
+ unet_added_conditions = {"time_ids": time_ids, 'text_embeds': pooled_text_embeds}
569
+
570
+ noise_pred = self.unet(noisy_latents, timesteps, text_embeds, added_cond_kwargs=unet_added_conditions).sample
571
+
572
+ loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
573
+ return {'total_loss': loss, 'noise_pred': noise_pred}
574
+
575
+ def encode_text_embeds(self, text_embeds, pooled_text_embeds=None):
576
+ text_embeds, pooled_text_embeds = self.resampler(text_embeds, pooled_text_embeds=pooled_text_embeds)
577
+
578
+ return text_embeds, pooled_text_embeds
579
+
580
+ @classmethod
581
+ def from_pretrained(cls, unet, resampler, pretrained_model_path=None, **kwargs):
582
+ model = cls(unet=unet, resampler=resampler, **kwargs)
583
+ if pretrained_model_path is not None:
584
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
585
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
586
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
587
+ return model
588
+
589
+ def init_pipe(self,
590
+ vae,
591
+ scheduler,
592
+ text_encoder,
593
+ text_encoder_2,
594
+ tokenizer,
595
+ tokenizer_2,
596
+ dtype=torch.float16,
597
+ device='cuda'):
598
+ self.device = device
599
+ self.dtype = dtype
600
+
601
+ sdxl_pipe = StableDiffusionXLText2ImageAndEditPipeline(
602
+ tokenizer=None,
603
+ tokenizer_2=None,
604
+ text_encoder=None,
605
+ text_encoder_2=None,
606
+ vae=vae,
607
+ unet=self.unet,
608
+ scheduler=scheduler,
609
+ )
610
+
611
+ self.sdxl_pipe = sdxl_pipe
612
+ self.sdxl_pipe.to(device, dtype=dtype)
613
+
614
+ self.tokenizer = tokenizer
615
+ self.tokenizer_2 = tokenizer_2
616
+ self.text_encoder = text_encoder
617
+ self.text_encoder_2 = text_encoder_2
618
+
619
+ @torch.inference_mode()
620
+ def get_text_embeds(self, prompt=None, negative_prompt='', text_embeds=None):
621
+ assert int(prompt is not None) + int(text_embeds is not None) == 1
622
+
623
+ if prompt is not None:
624
+ text_input_ids = self.tokenizer([prompt, negative_prompt],
625
+ max_length=self.tokenizer.model_max_length,
626
+ padding="max_length",
627
+ truncation=True,
628
+ return_tensors="pt").input_ids
629
+ text_input_ids_2 = self.tokenizer_2([prompt, negative_prompt],
630
+ max_length=self.tokenizer.model_max_length,
631
+ padding="max_length",
632
+ truncation=True,
633
+ return_tensors="pt").input_ids
634
+ encoder_output = self.text_encoder(text_input_ids.to(self.device), output_hidden_states=True)
635
+ text_embeds = encoder_output.hidden_states[-2]
636
+
637
+ encoder_output_2 = self.text_encoder_2(text_input_ids_2.to(self.device), output_hidden_states=True)
638
+ pooled_text_embeds = encoder_output_2[0]
639
+ text_embeds_2 = encoder_output_2.hidden_states[-2]
640
+
641
+ text_embeds = torch.cat([text_embeds, text_embeds_2], dim=-1)
642
+ else:
643
+ text_input_ids = self.tokenizer(negative_prompt,
644
+ max_length=self.tokenizer.model_max_length,
645
+ padding="max_length",
646
+ truncation=True,
647
+ return_tensors="pt").input_ids
648
+ text_input_ids_2 = self.tokenizer_2(negative_prompt,
649
+ max_length=self.tokenizer.model_max_length,
650
+ padding="max_length",
651
+ truncation=True,
652
+ return_tensors="pt").input_ids
653
+ encoder_output = self.text_encoder(text_input_ids.to(self.device), output_hidden_states=True)
654
+ text_embeds_neg = encoder_output.hidden_states[-2]
655
+
656
+ encoder_output_2 = self.text_encoder_2(text_input_ids_2.to(self.device), output_hidden_states=True)
657
+ text_embeds_neg_2 = encoder_output_2.hidden_states[-2]
658
+ pooled_text_embeds = encoder_output_2[0]
659
+
660
+ text_embeds_neg = torch.cat([text_embeds_neg, text_embeds_neg_2], dim=-1)
661
+
662
+ text_embeds = torch.cat([text_embeds, text_embeds_neg], dim=0)
663
+
664
+ text_embeds, pooled_text_embeds = self.encode_text_embeds(text_embeds, pooled_text_embeds=pooled_text_embeds)
665
+ text_embeds, text_embeds_neg = text_embeds.chunk(2)
666
+ pooled_text_embeds, pooled_text_embeds_neg = pooled_text_embeds.chunk(2)
667
+
668
+ return text_embeds, text_embeds_neg, pooled_text_embeds, pooled_text_embeds_neg
669
+
670
+ def generate(self,
671
+ prompt=None,
672
+ negative_prompt='',
673
+ image=None,
674
+ text_embeds=None,
675
+ seed=42,
676
+ height=1024,
677
+ width=1024,
678
+ guidance_scale=7.5,
679
+ num_inference_steps=30,
680
+ **kwargs):
681
+
682
+ text_embeds, text_embeds_neg, pooled_text_embeds, pooled_text_embeds_neg = self.get_text_embeds(
683
+ prompt=prompt, negative_prompt=negative_prompt, text_embeds=text_embeds)
684
+ generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
685
+
686
+ images = self.sdxl_pipe(
687
+ image=image,
688
+ prompt_embeds=text_embeds,
689
+ negative_prompt_embeds=text_embeds_neg,
690
+ pooled_prompt_embeds=pooled_text_embeds,
691
+ negative_pooled_prompt_embeds=pooled_text_embeds_neg,
692
+ guidance_scale=guidance_scale,
693
+ num_inference_steps=num_inference_steps,
694
+ generator=generator,
695
+ height=height,
696
+ width=width,
697
+ **kwargs,
698
+ ).images
699
+
700
+ return images
701
+
702
+
703
+ class SD21Text2ImageAndEditAdapter(SDXLText2ImageAndEditAdapter):
704
+
705
+ def forward(self, noisy_latents, timesteps, image_embeds, text_embeds, noise):
706
+
707
+ text_embeds, _ = self.resampler(text_embeds)
708
+ # unet_added_conditions = {"time_ids": time_ids, 'text_embeds': pooled_text_embeds}
709
+
710
+ noise_pred = self.unet(noisy_latents, timesteps, text_embeds).sample
711
+
712
+ loss = F.mse_loss(noise_pred.float(), noise.float(), reduction="mean")
713
+ return {'total_loss': loss, 'noise_pred': noise_pred}
714
+
715
+ def init_pipe(self,
716
+ vae,
717
+ scheduler,
718
+ text_encoder,
719
+ tokenizer,
720
+ feature_extractor,
721
+ dtype=torch.float16,
722
+ device='cuda'):
723
+ self.device = device
724
+ self.dtype = dtype
725
+
726
+ sd_pipe = StableDiffusionText2ImageAndEditPipeline(
727
+ tokenizer=tokenizer,
728
+ text_encoder=text_encoder,
729
+ vae=vae,
730
+ unet=self.unet,
731
+ feature_extractor=feature_extractor,
732
+ safety_checker=None,
733
+ requires_safety_checker=False,
734
+ scheduler=scheduler,
735
+ )
736
+
737
+ self.sd_pipe = sd_pipe
738
+ self.sd_pipe.to(device, dtype=dtype)
739
+
740
+ self.tokenizer = tokenizer
741
+ self.text_encoder = text_encoder
742
+
743
+ @torch.inference_mode()
744
+ def get_text_embeds(self, prompt=None, negative_prompt='', text_embeds=None):
745
+ assert int(prompt is not None) + int(text_embeds is not None) == 1
746
+
747
+ if prompt is not None:
748
+ text_input_ids = self.tokenizer([prompt, negative_prompt],
749
+ max_length=self.tokenizer.model_max_length,
750
+ padding="max_length",
751
+ truncation=True,
752
+ return_tensors="pt").input_ids
753
+ encoder_output = self.text_encoder(text_input_ids.to(self.device))
754
+ text_embeds = encoder_output[0]
755
+
756
+ else:
757
+ text_input_ids = self.tokenizer(negative_prompt,
758
+ max_length=self.tokenizer.model_max_length,
759
+ padding="max_length",
760
+ truncation=True,
761
+ return_tensors="pt").input_ids
762
+ encoder_output = self.text_encoder(text_input_ids.to(self.device))
763
+ text_embeds_neg = encoder_output[0]
764
+
765
+ text_embeds = torch.cat([text_embeds, text_embeds_neg], dim=0)
766
+
767
+ text_embeds, _ = self.encode_text_embeds(text_embeds)
768
+ text_embeds, text_embeds_neg = text_embeds.chunk(2)
769
+
770
+ return text_embeds, text_embeds_neg
771
+
772
+ def generate(self,
773
+ prompt=None,
774
+ negative_prompt='',
775
+ image=None,
776
+ text_embeds=None,
777
+ seed=42,
778
+ height=1024,
779
+ width=1024,
780
+ guidance_scale=7.5,
781
+ num_inference_steps=30,
782
+ **kwargs):
783
+
784
+ text_embeds, text_embeds_neg = self.get_text_embeds(
785
+ prompt=prompt, negative_prompt=negative_prompt, text_embeds=text_embeds)
786
+ generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
787
+
788
+ print(f'text_embeds: {text_embeds.shape}')
789
+ print(f'text_embeds_neg: {text_embeds_neg.shape}')
790
+ images = self.sd_pipe(
791
+ image=image,
792
+ prompt_embeds=text_embeds,
793
+ negative_prompt_embeds=text_embeds_neg,
794
+ guidance_scale=guidance_scale,
795
+ num_inference_steps=num_inference_steps,
796
+ generator=generator,
797
+ height=height,
798
+ width=width,
799
+ **kwargs,
800
+ ).images
801
+
802
+ return images
803
+
804
+
805
+ class SDXLAdapterWithLatentImage(SDXLAdapter):
806
+ def __init__(self, unet, resampler, full_ft=False, set_trainable_late=False) -> None:
807
+ nn.Module.__init__(self)
808
+ self.unet = unet
809
+ self.resampler = resampler
810
+ self.full_ft = full_ft
811
+ if not set_trainable_late:
812
+ self.set_trainable()
813
+
814
+ def set_trainable(self):
815
+ self.resampler.requires_grad_(True)
816
+ adapter_parameters = []
817
+
818
+ in_channels = 8
819
+ out_channels = self.unet.conv_in.out_channels
820
+ self.unet.register_to_config(in_channels=in_channels)
821
+ self.unet.requires_grad_(False)
822
+ with torch.no_grad():
823
+ new_conv_in = nn.Conv2d(in_channels, out_channels, self.unet.conv_in.kernel_size, self.unet.conv_in.stride,
824
+ self.unet.conv_in.padding)
825
+
826
+ new_conv_in.weight.zero_()
827
+ new_conv_in.weight[:, :4, :, :].copy_(self.unet.conv_in.weight)
828
+ self.unet.conv_in = new_conv_in
829
+ self.unet.conv_in.requires_grad_(True)
830
+
831
+ if self.full_ft:
832
+ self.unet.requires_grad_(True)
833
+ adapter_parameters.extend(self.unet.parameters())
834
+ else:
835
+ adapter_parameters.extend(self.unet.conv_in.parameters())
836
+ for name, module in self.unet.named_modules():
837
+ if name.endswith('to_k') or name.endswith('to_v'):
838
+ if module is not None:
839
+ adapter_parameters.extend(module.parameters())
840
+ self.adapter_parameters = adapter_parameters
841
+
842
+ @classmethod
843
+ def from_pretrained(cls, unet, resampler, pretrained_model_path=None, set_trainable_late=False, **kwargs):
844
+ model = cls(unet=unet, resampler=resampler, set_trainable_late=set_trainable_late, **kwargs)
845
+ if pretrained_model_path is not None:
846
+ ckpt = torch.load(pretrained_model_path, map_location='cpu')
847
+ missing, unexpected = model.load_state_dict(ckpt, strict=False)
848
+ print('missing keys: ', len(missing), 'unexpected keys:', len(unexpected))
849
+ if set_trainable_late:
850
+ model.set_trainable()
851
+ return model
852
+
853
+ def init_pipe(self,
854
+ vae,
855
+ scheduler,
856
+ visual_encoder,
857
+ image_transform,
858
+ dtype=torch.float16,
859
+ device='cuda'):
860
+ self.device = device
861
+ self.dtype = dtype
862
+
863
+ sdxl_pipe = StableDiffusionXLText2ImageAndEditPipeline(
864
+ tokenizer=None,
865
+ tokenizer_2=None,
866
+ text_encoder=None,
867
+ text_encoder_2=None,
868
+ vae=vae,
869
+ unet=self.unet,
870
+ scheduler=scheduler,
871
+ )
872
+
873
+ self.sdxl_pipe = sdxl_pipe
874
+ self.sdxl_pipe.to(device, dtype=dtype)
875
+ self.discrete_model = None
876
+
877
+ self.visual_encoder = visual_encoder.to(self.device, dtype=self.dtype)
878
+ self.image_transform = image_transform
879
+
880
+ def generate(self,
881
+ image_pil=None,
882
+ image_tensor=None,
883
+ image_embeds=None,
884
+ latent_image=None,
885
+ seed=42,
886
+ height=1024,
887
+ width=1024,
888
+ guidance_scale=7.5,
889
+ num_inference_steps=30,
890
+ input_image_size=448,
891
+ **kwargs):
892
+ if image_pil is not None:
893
+ assert isinstance(image_pil, Image.Image)
894
+
895
+ image_prompt_embeds, uncond_image_prompt_embeds, \
896
+ pooled_image_prompt_embeds, pooled_uncond_image_prompt_embeds = self.get_image_embeds(
897
+ image_pil=image_pil,
898
+ image_tensor=image_tensor,
899
+ image_embeds=image_embeds,
900
+ return_negative=True,
901
+ image_size=input_image_size,
902
+ )
903
+ # print(image_prompt_embeds.shape, pooled_image_prompt_embeds.shape)
904
+ generator = torch.Generator(self.device).manual_seed(seed) if seed is not None else None
905
+
906
+ images = self.sdxl_pipe(
907
+ image=latent_image,
908
+ prompt_embeds=image_prompt_embeds,
909
+ negative_prompt_embeds=uncond_image_prompt_embeds,
910
+ pooled_prompt_embeds=pooled_image_prompt_embeds,
911
+ negative_pooled_prompt_embeds=pooled_uncond_image_prompt_embeds,
912
+ guidance_scale=guidance_scale,
913
+ num_inference_steps=num_inference_steps,
914
+ generator=generator,
915
+ height=height,
916
+ width=width,
917
+ **kwargs,
918
+ ).images
919
+
920
+ return images
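A similarly hedged sketch for SDXLText2ImageAndEditAdapter: generation takes a text prompt and, optionally, a source image to edit. `edit_adapter` is assumed to be an instance whose init_pipe(...) has already been called with real tokenizers and text encoders; the exact form expected for `image` is determined by the custom StableDiffusionXLText2ImageAndEditPipeline defined elsewhere in this commit, so the resized PIL image below is only an assumption.

from PIL import Image

# Text-guided edit of an existing picture; drop `image=` for plain text-to-image.
source = Image.open('scene.jpg').convert('RGB').resize((1024, 1024))   # placeholder input
images = edit_adapter.generate(prompt='the same scene at sunset, golden light',
                               negative_prompt='blurry, low quality',
                               image=source,
                               seed=0, guidance_scale=7.5, num_inference_steps=30)
images[0].save('edited.jpg')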
src/models_ipa/attention_processor.py ADDED
@@ -0,0 +1,414 @@
1
+ # modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
+ class AttnProcessor(nn.Module):
8
+ r"""
9
+ Default processor for performing attention-related computations.
10
+ """
11
+
12
+ def __init__(
13
+ self,
14
+ hidden_size=None,
15
+ cross_attention_dim=None,
16
+ ):
17
+ super().__init__()
18
+
19
+ def __call__(
20
+ self,
21
+ attn,
22
+ hidden_states,
23
+ encoder_hidden_states=None,
24
+ attention_mask=None,
25
+ temb=None,
26
+ ):
27
+ residual = hidden_states
28
+
29
+ if attn.spatial_norm is not None:
30
+ hidden_states = attn.spatial_norm(hidden_states, temb)
31
+
32
+ input_ndim = hidden_states.ndim
33
+
34
+ if input_ndim == 4:
35
+ batch_size, channel, height, width = hidden_states.shape
36
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
37
+
38
+ batch_size, sequence_length, _ = (
39
+ hidden_states.shape
40
+ if encoder_hidden_states is None
41
+ else encoder_hidden_states.shape
42
+ )
43
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
44
+
45
+ if attn.group_norm is not None:
46
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
47
+
48
+ query = attn.to_q(hidden_states)
49
+
50
+ if encoder_hidden_states is None:
51
+ encoder_hidden_states = hidden_states
52
+ elif attn.norm_cross:
53
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
54
+
55
+ key = attn.to_k(encoder_hidden_states)
56
+ value = attn.to_v(encoder_hidden_states)
57
+
58
+ query = attn.head_to_batch_dim(query)
59
+ key = attn.head_to_batch_dim(key)
60
+ value = attn.head_to_batch_dim(value)
61
+
62
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
63
+ hidden_states = torch.bmm(attention_probs, value)
64
+ hidden_states = attn.batch_to_head_dim(hidden_states)
65
+
66
+ # linear proj
67
+ hidden_states = attn.to_out[0](hidden_states)
68
+ # dropout
69
+ hidden_states = attn.to_out[1](hidden_states)
70
+
71
+ if input_ndim == 4:
72
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
73
+
74
+ if attn.residual_connection:
75
+ hidden_states = hidden_states + residual
76
+
77
+ hidden_states = hidden_states / attn.rescale_output_factor
78
+
79
+ return hidden_states
80
+
81
+
82
+ class IPAttnProcessor(nn.Module):
83
+ r"""
84
+ Attention processor for IP-Adapter.
85
+ Args:
86
+ hidden_size (`int`):
87
+ The hidden size of the attention layer.
88
+ cross_attention_dim (`int`):
89
+ The number of channels in the `encoder_hidden_states`.
90
+ text_context_len (`int`, defaults to 77):
91
+ The context length of the text features.
92
+ scale (`float`, defaults to 1.0):
93
+ the weight scale of image prompt.
94
+ """
95
+
96
+ def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0):
97
+ super().__init__()
98
+
99
+ self.hidden_size = hidden_size
100
+ self.cross_attention_dim = cross_attention_dim
101
+ self.text_context_len = text_context_len
102
+ self.scale = scale
103
+
104
+ self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
105
+ self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
106
+
107
+ def __call__(
108
+ self,
109
+ attn,
110
+ hidden_states,
111
+ encoder_hidden_states=None,
112
+ attention_mask=None,
113
+ temb=None,
114
+ ):
115
+ residual = hidden_states
116
+
117
+ if attn.spatial_norm is not None:
118
+ hidden_states = attn.spatial_norm(hidden_states, temb)
119
+
120
+ input_ndim = hidden_states.ndim
121
+
122
+ if input_ndim == 4:
123
+ batch_size, channel, height, width = hidden_states.shape
124
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
125
+
126
+ batch_size, sequence_length, _ = (
127
+ hidden_states.shape
128
+ if encoder_hidden_states is None
129
+ else encoder_hidden_states.shape
130
+ )
131
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
132
+
133
+ if attn.group_norm is not None:
134
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
135
+
136
+ query = attn.to_q(hidden_states)
137
+
138
+ if encoder_hidden_states is None:
139
+ encoder_hidden_states = hidden_states
140
+ elif attn.norm_cross:
141
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
142
+
143
+ # split hidden states
144
+ encoder_hidden_states, \
145
+ ip_hidden_states = \
146
+ encoder_hidden_states[:, :self.text_context_len, :], \
147
+ encoder_hidden_states[:, self.text_context_len:, :]
148
+
149
+ key = attn.to_k(encoder_hidden_states)
150
+ value = attn.to_v(encoder_hidden_states)
151
+
152
+ query = attn.head_to_batch_dim(query)
153
+ key = attn.head_to_batch_dim(key)
154
+ value = attn.head_to_batch_dim(value)
155
+
156
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
157
+ hidden_states = torch.bmm(attention_probs, value)
158
+ hidden_states = attn.batch_to_head_dim(hidden_states)
159
+
160
+ # for ip-adapter
161
+ ip_key = self.to_k_ip(ip_hidden_states)
162
+ ip_value = self.to_v_ip(ip_hidden_states)
163
+
164
+ ip_key = attn.head_to_batch_dim(ip_key)
165
+ ip_value = attn.head_to_batch_dim(ip_value)
166
+
167
+ ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
168
+ ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
169
+ ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
170
+
171
+ hidden_states = hidden_states + self.scale * ip_hidden_states
172
+
173
+ # linear proj
174
+ hidden_states = attn.to_out[0](hidden_states)
175
+ # dropout
176
+ hidden_states = attn.to_out[1](hidden_states)
177
+
178
+ if input_ndim == 4:
179
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
180
+
181
+ if attn.residual_connection:
182
+ hidden_states = hidden_states + residual
183
+
184
+ hidden_states = hidden_states / attn.rescale_output_factor
185
+
186
+ return hidden_states
187
+
188
+
189
+ class AttnProcessor2_0(torch.nn.Module):
190
+ r"""
191
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
192
+ """
193
+
194
+ def __init__(
195
+ self,
196
+ hidden_size=None,
197
+ cross_attention_dim=None,
198
+ ):
199
+ super().__init__()
200
+ if not hasattr(F, "scaled_dot_product_attention"):
201
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0. Please upgrade PyTorch to 2.0 to use it.")
202
+
203
+ def __call__(
204
+ self,
205
+ attn,
206
+ hidden_states,
207
+ encoder_hidden_states=None,
208
+ attention_mask=None,
209
+ temb=None,
210
+ ):
211
+ residual = hidden_states
212
+
213
+ if attn.spatial_norm is not None:
214
+ hidden_states = attn.spatial_norm(hidden_states, temb)
215
+
216
+ input_ndim = hidden_states.ndim
217
+
218
+ if input_ndim == 4:
219
+ batch_size, channel, height, width = hidden_states.shape
220
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
221
+
222
+ batch_size, sequence_length, _ = (
223
+ hidden_states.shape
224
+ if encoder_hidden_states is None
225
+ else encoder_hidden_states.shape
226
+ )
227
+
228
+ if attention_mask is not None:
229
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
230
+ # scaled_dot_product_attention expects attention_mask shape to be
231
+ # (batch, heads, source_length, target_length)
232
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
233
+
234
+ if attn.group_norm is not None:
235
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
236
+
237
+ query = attn.to_q(hidden_states)
238
+
239
+ if encoder_hidden_states is None:
240
+ encoder_hidden_states = hidden_states
241
+ elif attn.norm_cross:
242
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
243
+
244
+ key = attn.to_k(encoder_hidden_states)
245
+ value = attn.to_v(encoder_hidden_states)
246
+
247
+ inner_dim = key.shape[-1]
248
+ head_dim = inner_dim // attn.heads
249
+
250
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
251
+
252
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
253
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
254
+
255
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
256
+ # TODO: add support for attn.scale when we move to Torch 2.1
257
+ hidden_states = F.scaled_dot_product_attention(query,
258
+ key,
259
+ value,
260
+ attn_mask=attention_mask,
261
+ dropout_p=0.0,
262
+ is_causal=False)
263
+
264
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
265
+ hidden_states = hidden_states.to(query.dtype)
266
+
267
+ # linear proj
268
+ hidden_states = attn.to_out[0](hidden_states)
269
+ # dropout
270
+ hidden_states = attn.to_out[1](hidden_states)
271
+
272
+ if input_ndim == 4:
273
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
274
+
275
+ if attn.residual_connection:
276
+ hidden_states = hidden_states + residual
277
+
278
+ hidden_states = hidden_states / attn.rescale_output_factor
279
+
280
+ return hidden_states
281
+
282
+
283
+ class IPAttnProcessor2_0(torch.nn.Module):
284
+ r"""
285
+ Attention processor for IP-Adapter for PyTorch 2.0.
286
+ Args:
287
+ hidden_size (`int`):
288
+ The hidden size of the attention layer.
289
+ cross_attention_dim (`int`):
290
+ The number of channels in the `encoder_hidden_states`.
291
+ text_context_len (`int`, defaults to 77):
292
+ The context length of the text features.
293
+ scale (`float`, defaults to 1.0):
294
+ the weight scale of image prompt.
295
+ """
296
+
297
+ def __init__(self, hidden_size, cross_attention_dim=None, text_context_len=77, scale=1.0):
298
+ super().__init__()
299
+
300
+ if not hasattr(F, "scaled_dot_product_attention"):
301
+ raise ImportError("AttnProcessor2_0 requires PyTorch 2.0. Please upgrade PyTorch to 2.0 to use it.")
302
+
303
+ self.hidden_size = hidden_size
304
+ self.cross_attention_dim = cross_attention_dim
305
+ self.text_context_len = text_context_len
306
+ self.scale = scale
307
+
308
+ self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
309
+ self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
310
+
311
+ def __call__(
312
+ self,
313
+ attn,
314
+ hidden_states,
315
+ encoder_hidden_states=None,
316
+ attention_mask=None,
317
+ temb=None,
318
+ ):
319
+ residual = hidden_states
320
+
321
+ if attn.spatial_norm is not None:
322
+ hidden_states = attn.spatial_norm(hidden_states, temb)
323
+
324
+ input_ndim = hidden_states.ndim
325
+
326
+ if input_ndim == 4:
327
+ batch_size, channel, height, width = hidden_states.shape
328
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
329
+
330
+ batch_size, sequence_length, _ = (
331
+ hidden_states.shape
332
+ if encoder_hidden_states is None
333
+ else encoder_hidden_states.shape
334
+ )
335
+ if attention_mask is not None:
336
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
337
+ # scaled_dot_product_attention expects attention_mask shape to be
338
+ # (batch, heads, source_length, target_length)
339
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
340
+
341
+ if attn.group_norm is not None:
342
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
343
+
344
+ query = attn.to_q(hidden_states)
345
+
346
+ if encoder_hidden_states is None:
347
+ encoder_hidden_states = hidden_states
348
+ elif attn.norm_cross:
349
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
350
+
351
+ # split hidden states
352
+ encoder_hidden_states, \
353
+ ip_hidden_states = \
354
+ encoder_hidden_states[:, :self.text_context_len, :], \
355
+ encoder_hidden_states[:, self.text_context_len:, :]
356
+
357
+ key = attn.to_k(encoder_hidden_states)
358
+ value = attn.to_v(encoder_hidden_states)
359
+
360
+ inner_dim = key.shape[-1]
361
+ head_dim = inner_dim // attn.heads
362
+
363
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
364
+
365
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
366
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
367
+
368
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
369
+ # TODO: add support for attn.scale when we move to Torch 2.1
370
+ hidden_states = F.scaled_dot_product_attention(query,
371
+ key,
372
+ value,
373
+ attn_mask=attention_mask,
374
+ dropout_p=0.0,
375
+ is_causal=False)
376
+
377
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
378
+ hidden_states = hidden_states.to(query.dtype)
379
+
380
+ # for ip-adapter
381
+ ip_key = self.to_k_ip(ip_hidden_states)
382
+ ip_value = self.to_v_ip(ip_hidden_states)
383
+
384
+ ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
385
+ ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
386
+
387
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
388
+ # TODO: add support for attn.scale when we move to Torch 2.1
389
+ ip_hidden_states = F.scaled_dot_product_attention(query,
390
+ ip_key,
391
+ ip_value,
392
+ attn_mask=None,
393
+ dropout_p=0.0,
394
+ is_causal=False)
395
+
396
+ ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
397
+ ip_hidden_states = ip_hidden_states.to(query.dtype)
398
+
399
+ hidden_states = hidden_states + self.scale * ip_hidden_states
400
+
401
+ # linear proj
402
+ hidden_states = attn.to_out[0](hidden_states)
403
+ # dropout
404
+ hidden_states = attn.to_out[1](hidden_states)
405
+
406
+ if input_ndim == 4:
407
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
408
+
409
+ if attn.residual_connection:
410
+ hidden_states = hidden_states + residual
411
+
412
+ hidden_states = hidden_states / attn.rescale_output_factor
413
+
414
+ return hidden_states
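For context, a hedged sketch of how these processors are typically installed on a diffusers UNet: every cross-attention layer gets an IPAttnProcessor2_0, so image tokens appended after the 77 text tokens flow through the separate to_k_ip / to_v_ip projections. The checkpoint id and the hidden-size lookup follow the usual IP-Adapter recipe and are assumptions, not something this file prescribes.

import torch
from diffusers import UNet2DConditionModel

from src.models_ipa.attention_processor import AttnProcessor2_0, IPAttnProcessor2_0

unet = UNet2DConditionModel.from_pretrained(
    'stabilityai/stable-diffusion-xl-base-1.0', subfolder='unet')  # illustrative checkpoint

attn_procs = {}
for name in unet.attn_processors.keys():
    if name.endswith('attn1.processor'):
        # self-attention: keep the plain processor
        attn_procs[name] = AttnProcessor2_0()
        continue
    # cross-attention: route the trailing image tokens through to_k_ip / to_v_ip
    if name.startswith('mid_block'):
        hidden_size = unet.config.block_out_channels[-1]
    elif name.startswith('up_blocks'):
        block_id = int(name[len('up_blocks.')])
        hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
    else:  # down_blocks
        block_id = int(name[len('down_blocks.')])
        hidden_size = unet.config.block_out_channels[block_id]
    attn_procs[name] = IPAttnProcessor2_0(hidden_size=hidden_size,
                                          cross_attention_dim=unet.config.cross_attention_dim,
                                          text_context_len=77, scale=1.0)

unet.set_attn_processor(attn_procs)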
src/models_ipa/ipa_utils.py ADDED
@@ -0,0 +1,5 @@
1
+ import torch.nn.functional as F
2
+
3
+
4
+ def is_torch2_available():
5
+ return hasattr(F, "scaled_dot_product_attention")
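The helper is presumably used to pick the SDPA-based processors on PyTorch 2.x and fall back to the bmm-based ones otherwise; a short sketch of that pattern (an assumption about intended use):

from src.models_ipa.ipa_utils import is_torch2_available

if is_torch2_available():
    # PyTorch >= 2.0: use the F.scaled_dot_product_attention implementations
    from src.models_ipa.attention_processor import AttnProcessor2_0 as AttnProcessor
    from src.models_ipa.attention_processor import IPAttnProcessor2_0 as IPAttnProcessor
else:
    from src.models_ipa.attention_processor import AttnProcessor, IPAttnProcessor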
src/models_ipa/resampler.py ADDED
@@ -0,0 +1,308 @@
1
+ # modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+
9
+ # FFN
10
+ def FeedForward(dim, mult=4):
11
+ inner_dim = int(dim * mult)
12
+ return nn.Sequential(
13
+ nn.LayerNorm(dim),
14
+ nn.Linear(dim, inner_dim, bias=False),
15
+ nn.GELU(),
16
+ nn.Linear(inner_dim, dim, bias=False),
17
+ )
18
+
19
+
20
+ def reshape_tensor(x, heads):
21
+ bs, length, width = x.shape
22
+ # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
23
+ x = x.view(bs, length, heads, -1)
24
+ # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
25
+ x = x.transpose(1, 2)
26
+ # (bs, n_heads, length, dim_per_head); heads stay in their own dimension for the batched matmul below
27
+ x = x.reshape(bs, heads, length, -1)
28
+ return x
29
+
30
+
31
+ class PerceiverAttention(nn.Module):
32
+
33
+ def __init__(self, *, dim, dim_head=64, heads=8):
34
+ super().__init__()
35
+ self.scale = dim_head ** -0.5
36
+ self.dim_head = dim_head
37
+ self.heads = heads
38
+ inner_dim = dim_head * heads
39
+
40
+ self.norm1 = nn.LayerNorm(dim)
41
+ self.norm2 = nn.LayerNorm(dim)
42
+
43
+ self.to_q = nn.Linear(dim, inner_dim, bias=False)
44
+ self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
45
+ self.to_out = nn.Linear(inner_dim, dim, bias=False)
46
+
47
+ def forward(self, x, latents):
48
+ """
49
+ Args:
50
+ x (torch.Tensor): image features
51
+ shape (b, n1, D)
52
+ latents (torch.Tensor): latent features
53
+ shape (b, n2, D)
54
+ """
55
+ x = self.norm1(x)
56
+ latents = self.norm2(latents)
57
+
58
+ b, l, _ = latents.shape
59
+
60
+ q = self.to_q(latents)
61
+ kv_input = torch.cat((x, latents), dim=-2)
62
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
63
+
64
+ q = reshape_tensor(q, self.heads)
65
+ k = reshape_tensor(k, self.heads)
66
+ v = reshape_tensor(v, self.heads)
67
+
68
+ # attention
69
+ scale = 1 / math.sqrt(math.sqrt(self.dim_head))
70
+ weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
71
+ weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
72
+ out = weight @ v
73
+
74
+ out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
75
+
76
+ return self.to_out(out)
77
+
78
+
79
+ class AttentionPool2d(nn.Module):
80
+
81
+ def __init__(self, seq_len: int, embed_dim: int, num_heads: int, output_dim: int = None):
82
+ super().__init__()
83
+ self.positional_embedding = nn.Parameter(torch.randn(seq_len + 1, embed_dim) / embed_dim ** 0.5)
84
+ self.k_proj = nn.Linear(embed_dim, embed_dim)
85
+ self.q_proj = nn.Linear(embed_dim, embed_dim)
86
+ self.v_proj = nn.Linear(embed_dim, embed_dim)
87
+ self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
88
+ self.num_heads = num_heads
89
+
90
+ def forward(self, x, return_all_tokens=False):
91
+ # x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
92
+ x = x.permute(1, 0, 2) # (N(HW)C) => (HW)NC
93
+ x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
94
+ x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC
95
+ x, _ = F.multi_head_attention_forward(query=x,
96
+ key=x,
97
+ value=x,
98
+ embed_dim_to_check=x.shape[-1],
99
+ num_heads=self.num_heads,
100
+ q_proj_weight=self.q_proj.weight,
101
+ k_proj_weight=self.k_proj.weight,
102
+ v_proj_weight=self.v_proj.weight,
103
+ in_proj_weight=None,
104
+ in_proj_bias=torch.cat(
105
+ [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
106
+ bias_k=None,
107
+ bias_v=None,
108
+ add_zero_attn=False,
109
+ dropout_p=0,
110
+ out_proj_weight=self.c_proj.weight,
111
+ out_proj_bias=self.c_proj.bias,
112
+ use_separate_proj_weight=True,
113
+ training=self.training,
114
+ need_weights=False)
115
+ if return_all_tokens:
116
+ return x
117
+ else:
118
+ return x[0]
119
+
120
+
121
+ class Resampler(nn.Module):
122
+
123
+ def __init__(
124
+ self,
125
+ dim=1024,
126
+ depth=8,
127
+ dim_head=64,
128
+ heads=16,
129
+ num_queries=8,
130
+ embedding_dim=768,
131
+ output_dim=1024,
132
+ ff_mult=4,
133
+ ):
134
+ super().__init__()
135
+
136
+ self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim ** 0.5)
137
+
138
+ self.proj_in = nn.Linear(embedding_dim, dim)
139
+
140
+ self.proj_out = nn.Linear(dim, output_dim)
141
+ self.norm_out = nn.LayerNorm(output_dim)
142
+
143
+ self.in_dim = dim
144
+ self.out_dim = output_dim
145
+
146
+ self.layers = nn.ModuleList([])
147
+ for _ in range(depth):
148
+ self.layers.append(
149
+ nn.ModuleList([
150
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
151
+ FeedForward(dim=dim, mult=ff_mult),
152
+ ]))
153
+
154
+ def forward(self, x):
155
+
156
+ latents = self.latents.repeat(x.size(0), 1, 1)
157
+
158
+ x = self.proj_in(x)
159
+
160
+ for attn, ff in self.layers:
161
+ latents = attn(x, latents) + latents
162
+ latents = ff(latents) + latents
163
+
164
+ latents = self.proj_out(latents)
165
+ output_embeds = self.norm_out(latents)
166
+
167
+ return output_embeds
168
+
169
+
170
+ class ResamplerXL(nn.Module):
171
+
172
+ def __init__(
173
+ self,
174
+ dim=1024,
175
+ depth=8,
176
+ dim_head=64,
177
+ heads=16,
178
+ num_queries=8,
179
+ embedding_dim=768,
180
+ output1_dim=768,
181
+ output2_dim=1280,
182
+ ff_mult=4,
183
+ ):
184
+ super().__init__()
185
+
186
+ self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim ** 0.5)
187
+
188
+ self.proj_in = nn.Linear(embedding_dim, dim)
189
+
190
+ # self.proj_out = nn.Linear(dim, output_dim)
191
+ self.norm_out = nn.LayerNorm(dim)
192
+
193
+ self.in_dim = dim
194
+ self.out_dim = output1_dim + output2_dim
195
+
196
+ self.layers = nn.ModuleList([])
197
+ for _ in range(depth):
198
+ self.layers.append(
199
+ nn.ModuleList([
200
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
201
+ FeedForward(dim=dim, mult=ff_mult),
202
+ ]))
203
+
204
+ self.unet_proj_1 = nn.Linear(self.in_dim, output1_dim)
205
+ self.unet_proj_2 = nn.Linear(self.in_dim, output2_dim)
206
+ self.unet_attnpool = AttentionPool2d(num_queries, self.in_dim, heads, output2_dim)
207
+
208
+ def forward(self, x):
209
+
210
+ latents = self.latents.repeat(x.size(0), 1, 1)
211
+
212
+ x = self.proj_in(x)
213
+
214
+ for attn, ff in self.layers:
215
+ latents = attn(x, latents) + latents
216
+ latents = ff(latents) + latents
217
+
218
+ hidden_embeds = self.norm_out(latents)
219
+
220
+ encoder_hidden_1 = self.unet_proj_1(hidden_embeds) # [bs, 256, 768]
221
+ encoder_hidden_2 = self.unet_proj_2(hidden_embeds) # [bs, 256, 1280]
222
+ prompt_embeds = torch.cat([encoder_hidden_1, encoder_hidden_2], dim=-1) # [bs, 256, 2048]
223
+ pooled_prompt_embeds = self.unet_attnpool(hidden_embeds) # [bs, 1280]
224
+
225
+ return prompt_embeds, pooled_prompt_embeds
226
+
227
+
228
+ class ResamplerXLV2(nn.Module):
229
+
230
+ def __init__(
231
+ self,
232
+ dim=1024,
233
+ depth=8,
234
+ dim_head=64,
235
+ heads=16,
236
+ num_queries=8,
237
+ embedding_dim=768,
238
+ output1_dim=768,
239
+ output2_dim=1280,
240
+ ff_mult=4,
241
+ ):
242
+ super().__init__()
243
+
244
+ self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim ** 0.5)
245
+
246
+ self.proj_in = nn.Linear(embedding_dim, dim)
247
+
248
+ # self.proj_out = nn.Linear(dim, output_dim)
249
+ self.norm_out = nn.LayerNorm(dim)
250
+
251
+ self.in_dim = dim
252
+ self.out_dim = output1_dim + output2_dim
253
+
254
+ self.layers = nn.ModuleList([])
255
+ for _ in range(depth):
256
+ self.layers.append(
257
+ nn.ModuleList([
258
+ PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
259
+ FeedForward(dim=dim, mult=ff_mult),
260
+ ]))
261
+
262
+ self.unet_proj_1 = nn.Linear(self.in_dim, output1_dim)
263
+ self.unet_proj_2 = nn.Linear(self.in_dim, output2_dim)
264
+ self.unet_attnpool = AttentionPool2d(num_queries, self.in_dim, heads, output2_dim)
265
+
266
+ def forward(self, x, pooled_text_embeds=None):
267
+
268
+ latents = self.latents.repeat(x.size(0), 1, 1)
269
+ x = F.normalize(x)
270
+
271
+ x = self.proj_in(x)
272
+
273
+ for attn, ff in self.layers:
274
+ latents = attn(x, latents) + latents
275
+ latents = ff(latents) + latents
276
+
277
+ hidden_embeds = self.norm_out(latents)
278
+
279
+ encoder_hidden_1 = self.unet_proj_1(hidden_embeds) # [bs, 256, 768]
280
+ encoder_hidden_2 = self.unet_proj_2(hidden_embeds) # [bs, 256, 1280]
281
+ prompt_embeds = torch.cat([encoder_hidden_1, encoder_hidden_2], dim=-1) # [bs, 256, 2048]
282
+ pooled_prompt_embeds = self.unet_attnpool(hidden_embeds) # [bs, 1280]
283
+
284
+ return prompt_embeds, pooled_prompt_embeds
285
+
286
+
287
+ class ResamplerXLIdentity(nn.Module):
288
+ def __init__(self) -> None:
289
+ super().__init__()
290
+
291
+ def forward(self, x, pooled_text_embeds=None):
292
+ return x, pooled_text_embeds
293
+
294
+
295
+ if __name__ == '__main__':
296
+ image_proj_model = Resampler(dim=1024,
297
+ depth=4,
298
+ dim_head=64,
299
+ heads=12,
300
+ num_queries=1024,
301
+ embedding_dim=1024,
302
+ output_dim=1024,
303
+ ff_mult=4)
304
+ numel = 0
305
+ for name, param in image_proj_model.named_parameters():
306
+ numel += param.numel()
307
+
308
+ print(f'Total params: {numel}')
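A quick, hedged shape check for ResamplerXL: the constructor values below are illustrative (the repository's configs set the real ones), but the outputs line up with what the SDXL UNet expects as prompt embeddings and pooled embeddings.

import torch
from src.models_ipa.resampler import ResamplerXL

resampler = ResamplerXL(dim=1024, depth=4, dim_head=64, heads=16,
                        num_queries=64, embedding_dim=4096,
                        output1_dim=768, output2_dim=1280)
image_embeds = torch.randn(2, 256, 4096)     # e.g. visual-encoder tokens (illustrative dims)
prompt_embeds, pooled_embeds = resampler(image_embeds)
print(prompt_embeds.shape)   # torch.Size([2, 64, 2048]) -> SDXL prompt_embeds
print(pooled_embeds.shape)   # torch.Size([2, 1280])     -> SDXL pooled embeds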
src/processer/tokenizer.py ADDED
@@ -0,0 +1,8 @@
1
+ from transformers import BertTokenizer
2
+
3
+
4
+ def bert_tokenizer(pretrained_model_name_or_path):
5
+ tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path,
6
+ truncation_side='right')
7
+ tokenizer.add_special_tokens({"bos_token": "[DEC]"})
8
+ return tokenizer
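A small, hedged usage example; the checkpoint name is illustrative, not one pinned by this commit.

from src.processer.tokenizer import bert_tokenizer

tokenizer = bert_tokenizer('bert-base-uncased')   # illustrative checkpoint
print(tokenizer.bos_token)                        # [DEC]
batch = tokenizer(['a photo of a corgi'], padding=True, return_tensors='pt')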
src/processer/transforms.py ADDED
@@ -0,0 +1,47 @@
1
+ from torchvision import transforms
2
+
3
+
4
+ def get_transform(type='clip', keep_ratio=True, image_size=224):
5
+ if type == 'clip':
6
+ transform = []
7
+ if keep_ratio:
8
+ transform.extend([
9
+ transforms.Resize(image_size),
10
+ transforms.CenterCrop(image_size),
11
+ ])
12
+ else:
13
+ transform.append(transforms.Resize((image_size, image_size)))
14
+ transform.extend([
15
+ transforms.ToTensor(),
16
+ transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
17
+ ])
18
+
19
+ return transforms.Compose(transform)
20
+ elif type == 'clipa':
21
+ transform = []
22
+ if keep_ratio:
23
+ transform.extend([
24
+ transforms.Resize(image_size),
25
+ transforms.CenterCrop(image_size),
26
+ ])
27
+ else:
28
+ transform.append(transforms.Resize((image_size, image_size)))
29
+ transform.extend(
30
+ [transforms.ToTensor(), transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))])
31
+
32
+ return transforms.Compose(transform)
33
+ elif type == 'sd':
34
+ transform = []
35
+ if keep_ratio:
36
+ transform.extend([
37
+ transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
38
+ transforms.CenterCrop(image_size),
39
+ ])
40
+ else:
41
+ transform.append(
42
+ transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC))
43
+ transform.extend([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
44
+
45
+ return transforms.Compose(transform)
46
+ else:
47
+ raise NotImplementedError
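A hedged usage example for the 'clip' branch; 448 matches the base_resolution used by the app, and the image path is a placeholder.

from PIL import Image
from src.processer.transforms import get_transform

transform = get_transform(type='clip', keep_ratio=True, image_size=448)
image = Image.open('example.jpg').convert('RGB')   # placeholder path
tensor = transform(image)                          # (3, 448, 448), CLIP-normalized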
src/tools/reload_qwen_vit.py ADDED
@@ -0,0 +1,14 @@
1
+ import torch
2
+ from transformers import AutoModelForCausalLM
3
+
4
+ torch.manual_seed(1234)
5
+
6
+ qwen_model_path = 'pretrained/Qwen-VL-Chat'
7
+ save_path = 'pretrained/QwenViT/qwen_vit_G.pt'
8
+
9
+ model = AutoModelForCausalLM.from_pretrained(qwen_model_path, device_map="cpu", trust_remote_code=True).eval()
10
+
11
+ visual_encoder = model.transformer.visual
12
+ print(visual_encoder)
13
+
14
+ torch.save(visual_encoder.state_dict(), save_path)
src/train/dist_utils.py ADDED
@@ -0,0 +1,34 @@
1
+ import torch
2
+ import torch.distributed as dist
3
+
4
+
5
+ def all_gather(tensor):
6
+ world_size = dist.get_world_size()
7
+ tensor_list = [torch.zeros_like(tensor) for _ in range(world_size)]
8
+ dist.all_gather(tensor_list, tensor)
9
+ return tensor_list
10
+
11
+
12
+ def is_dist_avail_and_initialized():
13
+ if not dist.is_available():
14
+ return False
15
+ if not dist.is_initialized():
16
+ return False
17
+ return True
18
+
19
+
20
+ @torch.no_grad()
21
+ def concat_all_gather(tensor):
22
+ """
23
+ Performs all_gather operation on the provided tensors.
24
+ *** Warning ***: torch.distributed.all_gather has no gradient.
25
+ """
26
+ # if use distributed training
27
+ if not is_dist_avail_and_initialized():
28
+ return tensor
29
+
30
+ tensors_gather = [torch.ones_like(tensor) for _ in range(torch.distributed.get_world_size())]
31
+ torch.distributed.all_gather(tensors_gather, tensor, async_op=False)
32
+
33
+ output = torch.cat(tensors_gather, dim=0)
34
+ return output
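A hedged sketch of concat_all_gather in use; it assumes an initialized torch.distributed process group (e.g. launched via torchrun or accelerate) and, as the docstring warns, no gradients flow through the gathered copies.

import torch
from src.train.dist_utils import concat_all_gather

local_feats = torch.randn(8, 1024, device='cuda')   # per-rank batch of embeddings
global_feats = concat_all_gather(local_feats)        # (8 * world_size, 1024)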
src/train/schedular.py ADDED
@@ -0,0 +1,130 @@
1
+ import math
2
+ import warnings
3
+ from functools import partial
4
+ from typing import Callable, Iterable, Optional, Tuple, Union
5
+
6
+ import torch
7
+ from torch import nn
8
+ from torch.optim import Optimizer
9
+ from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau
10
+ from transformers.trainer_utils import SchedulerType
11
+ from transformers.utils import logging
12
+
13
+ from transformers.optimization import get_linear_schedule_with_warmup, \
14
+ get_cosine_with_hard_restarts_schedule_with_warmup, get_polynomial_decay_schedule_with_warmup, \
15
+ get_constant_schedule, get_constant_schedule_with_warmup, get_inverse_sqrt_schedule, get_reduce_on_plateau_schedule
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
+
20
+ def _get_cosine_schedule_with_warmup_lr_lambda(current_step: int,
21
+ *,
22
+ num_warmup_steps: int,
23
+ num_training_steps: int,
24
+ num_cycles: float,
25
+ min_lr_ratio: float = 0.0):
26
+ if current_step < num_warmup_steps:
27
+ return float(current_step) / float(max(1, num_warmup_steps))
28
+ progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
29
+ # return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
30
+ return max(0.0,
31
+ 0.5 * ((1.0 + min_lr_ratio) + (1.0 - min_lr_ratio) * math.cos(
32
+ math.pi * float(num_cycles) * 2.0 * progress)))
33
+
34
+
35
+ def get_cosine_schedule_with_warmup(optimizer: Optimizer,
36
+ num_warmup_steps: int,
37
+ num_training_steps: int,
38
+ num_cycles: float = 0.5,
39
+ last_epoch: int = -1,
40
+ min_lr_ratio: float = 0.0):
41
+ """
42
+ Create a schedule with a learning rate that decreases following the values of the cosine function from the
43
+ initial lr set in the optimizer down to `min_lr_ratio` times that value, after a warmup period during which it
44
+ increases linearly between 0 and the initial lr set in the optimizer.
45
+
46
+ Args:
47
+ optimizer ([`~torch.optim.Optimizer`]):
48
+ The optimizer for which to schedule the learning rate.
49
+ num_warmup_steps (`int`):
50
+ The number of steps for the warmup phase.
51
+ num_training_steps (`int`):
52
+ The total number of training steps.
53
+ num_cycles (`float`, *optional*, defaults to 0.5):
54
+ The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
55
+ following a half-cosine).
56
+ last_epoch (`int`, *optional*, defaults to -1):
57
+ The index of the last epoch when resuming training.
58
+
59
+ Return:
60
+ `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
61
+ """
62
+
63
+ lr_lambda = partial(
64
+ _get_cosine_schedule_with_warmup_lr_lambda,
65
+ num_warmup_steps=num_warmup_steps,
66
+ num_training_steps=num_training_steps,
67
+ num_cycles=num_cycles,
68
+ min_lr_ratio=min_lr_ratio,
69
+ )
70
+ return LambdaLR(optimizer, lr_lambda, last_epoch)
71
+
72
+
73
+ TYPE_TO_SCHEDULER_FUNCTION = {
74
+ SchedulerType.LINEAR: get_linear_schedule_with_warmup,
75
+ SchedulerType.COSINE: get_cosine_schedule_with_warmup,
76
+ SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup,
77
+ SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup,
78
+ SchedulerType.CONSTANT: get_constant_schedule,
79
+ SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup,
80
+ SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
81
+ SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
82
+ }
83
+
84
+
85
+ def get_scheduler(
86
+ name: Union[str, SchedulerType],
87
+ optimizer: Optimizer,
88
+ num_warmup_steps: Optional[int] = None,
89
+ num_training_steps: Optional[int] = None,
90
+ min_lr_ratio: Optional[float] = 0.0,
91
+ ):
92
+ """
93
+ Unified API to get any scheduler from its name.
94
+
95
+ Args:
96
+ name (`str` or `SchedulerType`):
97
+ The name of the scheduler to use.
98
+ optimizer (`torch.optim.Optimizer`):
99
+ The optimizer that will be used during training.
100
+ num_warmup_steps (`int`, *optional*):
101
+ The number of warmup steps to do. This is not required by all schedulers (hence the argument being
102
+ optional), the function will raise an error if it's unset and the scheduler type requires it.
103
+ num_training_steps (`int``, *optional*):
104
+ The number of training steps to do. This is not required by all schedulers (hence the argument being
105
+ optional), the function will raise an error if it's unset and the scheduler type requires it.
106
+ """
107
+ name = SchedulerType(name)
108
+ schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
109
+ if name == SchedulerType.CONSTANT or name == SchedulerType.REDUCE_ON_PLATEAU:
110
+ return schedule_func(optimizer)
111
+
112
+ # All other schedulers require `num_warmup_steps`
113
+ if num_warmup_steps is None:
114
+ raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
115
+
116
+ if name == SchedulerType.CONSTANT_WITH_WARMUP:
117
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
118
+
119
+ if name == SchedulerType.INVERSE_SQRT:
120
+ return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
121
+
122
+ # All other schedulers require `num_training_steps`
123
+ if num_training_steps is None:
124
+ raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
125
+
126
+ logger.info(f'Initialize lr scheduler with min_lr_ratio: {min_lr_ratio}')
127
+ return schedule_func(optimizer,
128
+ num_warmup_steps=num_warmup_steps,
129
+ num_training_steps=num_training_steps,
130
+ min_lr_ratio=min_lr_ratio)
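A hedged usage sketch for get_scheduler; the step counts and learning rate are illustrative. Note that only the locally defined cosine schedule accepts min_lr_ratio; the other entries in TYPE_TO_SCHEDULER_FUNCTION come straight from transformers.

import torch
from src.train.schedular import get_scheduler

model = torch.nn.Linear(16, 16)                       # stand-in for the real model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
lr_scheduler = get_scheduler(name='cosine',
                             optimizer=optimizer,
                             num_warmup_steps=500,
                             num_training_steps=10000,
                             min_lr_ratio=0.01)       # LR floors at 1% of the peak
optimizer.step()
lr_scheduler.step()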
src/train/train.py ADDED
@@ -0,0 +1,291 @@
+ # flake8: noqa
+ import hydra
+
+ import pyrootutils
+ import os
+ import torch
+ from accelerate import Accelerator
+ from accelerate.logging import get_logger
+ from accelerate.utils import ProjectConfiguration
+
+ from tqdm.auto import tqdm
+ from omegaconf import OmegaConf
+ from omegaconf.dictconfig import DictConfig
+ import argparse
+ from flask import Flask, request
+ from typing import List, Union
+ import json
+ from typing import Optional
+ import transformers
+ from dataclasses import dataclass, field, asdict, is_dataclass
+ from torchdata.dataloader2 import DataLoader2, MultiProcessingReadingService, DistributedReadingService, \
+     SequentialReadingService
+ import logging
+
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
+ from src.train.schedular import get_scheduler
+ from src.train.dist_utils import all_gather
+
+ # logger = get_logger(__name__, log_level='info')
+ log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_format)
+
+ logger = logging.getLogger(__name__)
+ os.environ["WANDB_MODE"] = "offline"
+
+
+ @dataclass
+ class ConfigPathArguments:
+     image_transform: Optional[str] = field(default=None, metadata={"help": "config path of image transform"})
+     tokenizer: Optional[str] = field(default=None,
+                                      metadata={"help": "config path of tokenizer used to initialize tokenizer"})
+     # model: Optional[str] = field(default=None, metadata={"help": "config path of llm"})
+     visual_encoder: Optional[str] = field(default=None, metadata={"help": "config path of visual encoder"})
+     text_encoder: Optional[str] = field(default=None, metadata={"help": "config path of text encoder"})
+     discrete_model: Optional[str] = field(default=None, metadata={"help": "config path of discrete model"})
+     train_dataset: Optional[str] = field(default=None, metadata={"help": "config path of training dataset"})
+
+
+ @dataclass
+ class TrainingArguments:
+     output_dir: str = field(
+         metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, )
+     resume_from_checkpoint: Optional[str] = field(
+         default=None, metadata={"help": "The path to a folder with a valid checkpoint for your model."})
+     resume_steps: Optional[int] = field(default=None, metadata={"help": "The training steps of the saved checkpoint"})
+     learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+     weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+     adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+     adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+     adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+     max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
+     gradient_accumulation_steps: int = field(
+         default=1, metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."})
+     mixed_precision: Optional[str] = field(
+         default='no',
+         metadata={
+             "help":
+             "Whether to use mixed precision. \
+             Choose between fp16 and bf16 (bfloat16). \
+             Bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU."
+         })
+     num_train_epochs: int = field(default=3, metadata={"help": "Total number of training epochs to perform."})
+     max_steps: int = field(default=-1, metadata={"help": "Total number of training steps to perform."})
+     save_steps: int = field(default=10000, metadata={"help": "Number of updates steps before two checkpoint saves."})
+     lr_scheduler_type: str = field(default="cosine", metadata={"help": "The scheduler type to use."})
+     warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+     min_lr_ratio: float = field(default=0.01, metadata={"help": "Minimal learning rate ratio."})
+     dataloader_num_workers: int = field(default=8, metadata={"help": "The number of workers to use for data loading."})
+     project_name: str = field(default="DiscreteLearning", metadata={"help": "The name of the project"})
+     expr_name: str = field(default="", metadata={"help": "The name of the experiment"})
+
+
+ def build_dataloader(dataset_cfg, image_transform, tokenizer, dataloader_num_workers=4):
+     dataset = hydra.utils.instantiate(dataset_cfg, image_transform=image_transform, tokenizer=tokenizer)
+     mp_service = MultiProcessingReadingService(num_workers=dataloader_num_workers)
+     dist_service = DistributedReadingService()
+     reading_service = SequentialReadingService(dist_service, mp_service)
+     dataloader = DataLoader2(dataset, reading_service=reading_service)
+     return dataloader
+
+
+ def get_metric(output):
+     metric = {}
+     for key, value in output.items():
+         if 'loss' in key:
+             metric[key] = value.item()
+     return metric
+
+
+ def get_code_usage(indices):
+     indices_list = all_gather(indices)
+     indices = torch.cat(indices_list, dim=0)
+     code_usage = indices.unique().numel()
+     return code_usage
+
+
+ def merge_config(**kwargs):
+     config = {}
+     for key, value in kwargs.items():
+         if isinstance(value, argparse.Namespace):
+             config[key] = vars(value)
+         elif isinstance(value, DictConfig):
+             config[key] = OmegaConf.to_object(value)
+         elif is_dataclass(value):
+             config[key] = asdict(value)
+         elif isinstance(value, dict):
+             config[key] = value
+         else:
+             logger.error(f'key: {key}, value: {value} will not be merged.')
+     return config
+
+
+ def trainable_params(model):
+     count = 0
+     for name, param in model.named_parameters():
+         count += param.numel()
+     return count
+
+
+ def train():
+     parser = transformers.HfArgumentParser((ConfigPathArguments, TrainingArguments))
+     cfg_path, args = parser.parse_args_into_dataclasses()
+
+     project_config = ProjectConfiguration(project_dir=args.output_dir,
+                                           logging_dir=os.path.join(args.output_dir, 'logs'))
+
+     accelerator = Accelerator(
+         mixed_precision=args.mixed_precision,
+         log_with=['tensorboard', 'wandb'],
+         project_config=project_config,
+         gradient_accumulation_steps=args.gradient_accumulation_steps,
+         step_scheduler_with_optimizer=False,
+     )
+     logger.info('Init accelerator done.')
+
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     visual_encoder_cfg = OmegaConf.load(cfg_path.visual_encoder)
+     visual_encoder = hydra.utils.instantiate(visual_encoder_cfg)
+     logger.info('Load visual encoder done.')
+
+     discrete_model_cfg = OmegaConf.load(cfg_path.discrete_model)
+     discrete_model = hydra.utils.instantiate(discrete_model_cfg)
+     logger.info('Load discrete model done.')
+
+     train_dataset_cfg = OmegaConf.load(cfg_path.train_dataset)
+
+     if cfg_path.text_encoder is not None:
+         text_encoder_cfg = OmegaConf.load(cfg_path.text_encoder)
+         text_encoder = hydra.utils.instantiate(text_encoder_cfg)
+     else:
+         text_encoder_cfg = None
+         text_encoder = None
+
+     if cfg_path.image_transform is not None:
+         image_transform_cfg = OmegaConf.load(cfg_path.image_transform)
+         image_transform = hydra.utils.instantiate(image_transform_cfg)
+     else:
+         image_transform_cfg = None
+         image_transform = None
+
+     if cfg_path.tokenizer is not None:
+         tokenizer_cfg = OmegaConf.load(cfg_path.tokenizer)
+         tokenizer = hydra.utils.instantiate(tokenizer_cfg)
+     else:
+         tokenizer_cfg = None
+         tokenizer = None
+
+     weight_dtype = torch.float32
+     if accelerator.mixed_precision == "fp16":
+         weight_dtype = torch.float16
+     elif accelerator.mixed_precision == "bf16":
+         weight_dtype = torch.bfloat16
+
+     visual_encoder.to(accelerator.device, dtype=weight_dtype)
+     logger.info('Freeze visual encoder...')
+     visual_encoder.requires_grad_(False)
+     if text_encoder is not None:
+         logger.info('Freeze text encoder...')
+         text_encoder.requires_grad_(False)
+         text_encoder.to(accelerator.device, dtype=weight_dtype)
+     discrete_model.to(accelerator.device, dtype=weight_dtype)
+
+     discrete_model = accelerator.prepare(discrete_model)
+     optimizer = torch.optim.AdamW(discrete_model.parameters(),
+                                   lr=args.learning_rate,
+                                   betas=[args.adam_beta1, args.adam_beta2],
+                                   eps=args.adam_epsilon,
+                                   weight_decay=args.weight_decay)
+     logger.info('Init optimizer done.')
+     scheduler = get_scheduler(name=args.lr_scheduler_type,
+                               optimizer=optimizer,
+                               num_warmup_steps=args.warmup_steps,
+                               num_training_steps=args.max_steps,
+                               min_lr_ratio=args.min_lr_ratio)
+     # accelerator.register_for_checkpointing(scheduler)
+
+     optimizer, scheduler = accelerator.prepare(optimizer, scheduler)
+     logger.info('Prepare accelerator done.')
+
+     config_record = merge_config(discrete_model=discrete_model_cfg,
+                                  visual_encoder=visual_encoder_cfg,
+                                  text_encoder=text_encoder_cfg,
+                                  image_transform=image_transform_cfg,
+                                  tokenizer=tokenizer_cfg,
+                                  train_dataset=train_dataset_cfg,
+                                  train_args=args)
+     accelerator.init_trackers(project_name=args.project_name,
+                               init_kwargs={"wandb": {
+                                   "config": config_record,
+                                   "name": args.expr_name,
+                                   "dir": args.output_dir
+                               }})
+     if args.resume_from_checkpoint is not None:
+         logger.info(f'Load checkpoint from {args.resume_from_checkpoint}')
+         accelerator.load_state(args.resume_from_checkpoint)
+
+     num_params = trainable_params(discrete_model)
+     logger.info("***** Running training *****")
+     logger.info(f"  Total optimization steps = {args.max_steps}")
+     logger.info(f"  Total trainable params = {num_params}")
+     # Only show the progress bar once on each machine.
+     progress_bar = tqdm(range(args.max_steps), disable=not accelerator.is_main_process)
+     progress_bar.set_description("Steps")
+     global_step = 0
+     if args.resume_steps is not None:
+         global_step = args.resume_steps
+         progress_bar.update(args.resume_steps)
+
+     train_dataloader = build_dataloader(dataset_cfg=train_dataset_cfg,
+                                         image_transform=image_transform,
+                                         tokenizer=tokenizer,
+                                         dataloader_num_workers=args.dataloader_num_workers)
+     for epoch in range(args.num_train_epochs):
+         discrete_model.train()
+         logger.info('Start new epoch')
+
+         for step, batch in enumerate(train_dataloader):
+             with accelerator.accumulate(discrete_model):
+                 with torch.no_grad():
+                     image_embeds = visual_encoder(batch['images'].to(accelerator.device, dtype=weight_dtype))
+                     if text_encoder is not None:
+                         text_embeds = text_encoder(batch['text_input_ids'].to(accelerator.device))
+                     else:
+                         text_embeds = None
+
+                 output = discrete_model(image_embeds=image_embeds, text_embeds=text_embeds)
+
+                 loss = output['total_loss']
+                 accelerator.backward(loss)
+                 if accelerator.sync_gradients:
+                     accelerator.clip_grad_norm_(discrete_model.parameters(), max_norm=args.max_grad_norm)
+                 optimizer.step()
+                 scheduler.step()
+                 optimizer.zero_grad()
+
+             if accelerator.sync_gradients:
+                 progress_bar.update(1)
+                 global_step += 1
+
+                 if global_step % args.save_steps == 0:
+                     save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                     accelerator.save_state(save_path)
+
+                 metric = get_metric(output)
+                 metric['lr'] = optimizer.param_groups[0]['lr']
+                 metric['code_usage'] = get_code_usage(output['indices'])
+                 metric = {key: (format(value, ".6f") if isinstance(value, float) else value) for key, value in
+                           metric.items()}
+                 accelerator.log(metric, step=global_step)
+                 if accelerator.is_main_process:
+                     tqdm.write(str(metric))
+                     # print(metric)
+                 if global_step >= args.max_steps:
+                     break
+
+     accelerator.end_training()
+
+
+ if __name__ == '__main__':
+     train()
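The script is driven entirely by the two dataclasses above via `transformers.HfArgumentParser`. Below is a minimal sketch of that parsing step with trimmed-down copies of the dataclasses; the `.yaml` paths and values are placeholders, not files shipped in this commit.

```python
# Sketch only: parse an illustrative argv list the same way train.py does.
import transformers
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ConfigPathArguments:
    train_dataset: Optional[str] = field(default=None, metadata={"help": "config path of training dataset"})


@dataclass
class TrainingArguments:
    output_dir: str = field(metadata={"help": "Where checkpoints are written."})
    learning_rate: float = field(default=5e-5, metadata={"help": "Initial AdamW learning rate."})


parser = transformers.HfArgumentParser((ConfigPathArguments, TrainingArguments))
cfg_path, args = parser.parse_args_into_dataclasses(args=[
    '--train_dataset', 'configs/data/story_train.yaml',   # placeholder path
    '--output_dir', './output/demo',
    '--learning_rate', '1e-4',
])
print(cfg_path.train_dataset, args.output_dir, args.learning_rate)
```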
src/train/train_clm_sft.py ADDED
@@ -0,0 +1,347 @@
+ # flake8: noqa
+ import hydra
+
+ import pyrootutils
+ import os
+ import torch
+ from accelerate import Accelerator
+ from accelerate.logging import get_logger
+ from accelerate.utils import ProjectConfiguration
+ from torch.utils.data import DataLoader
+
+ from deepspeed.runtime.engine import DummyOptim
+ from tqdm.auto import tqdm
+ from omegaconf import OmegaConf
+ from omegaconf.dictconfig import DictConfig
+ import argparse
+ from flask import Flask, request
+ from typing import List, Union
+ import json
+ from typing import Optional
+ import transformers
+ from dataclasses import dataclass, field, asdict, is_dataclass
+ from torchdata.dataloader2 import DataLoader2, MultiProcessingReadingService, DistributedReadingService, \
+     SequentialReadingService
+ import gc
+ import logging
+ from accelerate import FullyShardedDataParallelPlugin, DistributedDataParallelKwargs
+ from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
+
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
+ from src.train.schedular import get_scheduler
+ from src.train.dist_utils import all_gather
+
+ # logger = get_logger(__name__, log_level='info')
+ log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_format)
+
+ logger = logging.getLogger(__name__)
+ os.environ["WANDB_MODE"] = "offline"
+
+
+ @dataclass
+ class ConfigPathArguments:
+     image_transform: Optional[str] = field(default=None, metadata={"help": "config path of image transform"})
+     tokenizer: Optional[str] = field(default=None,
+                                      metadata={"help": "config path of tokenizer used to initialize tokenizer"})
+     # model: Optional[str] = field(default=None, metadata={"help": "config path of llm"})
+     visual_encoder: Optional[str] = field(default=None, metadata={"help": "config path of visual encoder"})
+     llm_model: Optional[str] = field(default=None, metadata={"help": "config path of llm"})
+     agent_model: Optional[str] = field(default=None, metadata={"help": "config path of agent"})
+     train_dataset: Optional[str] = field(default=None, metadata={"help": "config path of training dataset"})
+     fsdp_plugin: Optional[str] = field(default=None, metadata={"help": "config path of fsdp plugin"})
+     deepspeed_plugin: Optional[str] = field(default=None, metadata={"help": "config path of deepspeed plugin"})
+
+
+ @dataclass
+ class TrainingArguments:
+     output_dir: str = field(
+         metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, )
+     resume_from_checkpoint: Optional[str] = field(
+         default=None, metadata={"help": "The path to a folder with a valid checkpoint for your model."})
+     resume_steps: Optional[int] = field(default=None, metadata={"help": "The training steps of the saved checkpoint"})
+     batch_size: Optional[int] = field(default=60, metadata={"help": "The training batch size"})
+     learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+     weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+     adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+     adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+     adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+     max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
+     gradient_accumulation_steps: int = field(
+         default=1, metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."})
+     mixed_precision: Optional[str] = field(
+         default='no',
+         metadata={
+             "help":
+             "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU."
+         })
+     num_train_epochs: int = field(default=3, metadata={"help": "Total number of training epochs to perform."})
+     max_steps: int = field(default=-1, metadata={"help": "Total number of training steps to perform."})
+     save_steps: int = field(default=10000, metadata={"help": "Number of updates steps before two checkpoint saves."})
+     lr_scheduler_type: str = field(default="cosine", metadata={"help": "The scheduler type to use."})
+     warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+     min_lr_ratio: float = field(default=0.01, metadata={"help": "Minimal learning rate ratio."})
+     dataloader_num_workers: int = field(default=8, metadata={"help": "The number of workers to use for data loading."})
+     project_name: str = field(default="ContinuousVLM", metadata={"help": "The name of the project"})
+     expr_name: str = field(default="", metadata={"help": "The name of the experiment"})
+
+
+ def build_dataloader(dataset_cfg, image_transform, tokenizer, batch_size, dataloader_num_workers=4):
+     dataset = hydra.utils.instantiate(dataset_cfg, image_transform=image_transform, tokenizer=tokenizer)
+     mp_service = MultiProcessingReadingService(num_workers=dataloader_num_workers)
+     dist_service = DistributedReadingService()
+     reading_service = SequentialReadingService(dist_service, mp_service)
+     dataloader = DataLoader2(dataset, reading_service=reading_service)
+     # dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=dataloader_num_workers)
+     return dataloader
+
+
+ def get_metric(output):
+     metric = {}
+     for key, value in output.items():
+         if 'loss' in key:
+             gathered_metric = torch.stack(all_gather(value)).mean()
+             # metric[key] = value.item()
+             metric[key] = gathered_metric.item()
+         if 'acc' in key:
+             metric[key] = value.item()
+     return metric
+
+
+ def merge_config(**kwargs):
+     config = {}
+     for key, value in kwargs.items():
+         if isinstance(value, argparse.Namespace):
+             config[key] = vars(value)
+         elif isinstance(value, DictConfig):
+             config[key] = OmegaConf.to_object(value)
+         elif is_dataclass(value):
+             config[key] = asdict(value)
+         elif isinstance(value, (int, str, float, dict)) or value is None:
+             config[key] = value
+         else:
+             logger.error(f'key: {key}, value: {value} will not be merged.')
+     return config
+
+
+ def trainable_params(model):
+     count = 0
+     for name, param in model.named_parameters():
+         if param.requires_grad:
+             count += param.numel()
+     return count
+
+
+ def train():
+     parser = transformers.HfArgumentParser((ConfigPathArguments, TrainingArguments))
+     cfg_path, args = parser.parse_args_into_dataclasses()
+
+     project_config = ProjectConfiguration(project_dir=args.output_dir,
+                                           logging_dir=os.path.join(args.output_dir, 'logs'))
+
+     assert int(cfg_path.fsdp_plugin is not None) + int(cfg_path.deepspeed_plugin is not None) <= 1
+     if cfg_path.fsdp_plugin is not None:
+         fsdp_plugin_cfg = OmegaConf.load(cfg_path.fsdp_plugin)
+         fsdp_plugin = hydra.utils.instantiate(fsdp_plugin_cfg)
+         logger.info('Use FSDP plugin')
+     else:
+         fsdp_plugin = None
+
+     if cfg_path.deepspeed_plugin is not None:
+         deepspeed_plugin_cfg = OmegaConf.load(cfg_path.deepspeed_plugin)
+         deepspeed_plugin = hydra.utils.instantiate(deepspeed_plugin_cfg)
+         logger.info('Use deepspeed plugin')
+     else:
+         deepspeed_plugin = None
+
+     # ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
+     accelerator = Accelerator(
+         mixed_precision=args.mixed_precision,
+         log_with=['tensorboard', 'wandb'],
+         project_config=project_config,
+         gradient_accumulation_steps=args.gradient_accumulation_steps,
+         step_scheduler_with_optimizer=False,
+         fsdp_plugin=fsdp_plugin,
+         deepspeed_plugin=deepspeed_plugin,
+         # kwargs_handlers=[ddp_kwargs],
+     )
+     accelerator.wait_for_everyone()
+     logger.info('Init accelerator done.')
+
+     if cfg_path.deepspeed_plugin is not None:
+         accelerator.state.deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = 8
+
+     # print('deepspeed config: ', accelerator.state.deepspeed_plugin.deepspeed_config)
+
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     # if cfg_path.image_transform is not None:
+     image_transform_cfg = OmegaConf.load(cfg_path.image_transform)
+     image_transform = hydra.utils.instantiate(image_transform_cfg)
+     # else:
+     #     image_transform_cfg = None
+     #     image_transform = None
+
+     # if cfg_path.tokenizer is not None:
+     tokenizer_cfg = OmegaConf.load(cfg_path.tokenizer)
+     tokenizer = hydra.utils.instantiate(tokenizer_cfg)
+     # else:
+     #     tokenizer_cfg = None
+     #     tokenizer = None
+     train_dataset_cfg = OmegaConf.load(cfg_path.train_dataset)
+
+     visual_encoder_cfg = OmegaConf.load(cfg_path.visual_encoder)
+     visual_encoder = hydra.utils.instantiate(visual_encoder_cfg)
+     logger.info('Load visual encoder done.')
+
+     llm_model_cfg = OmegaConf.load(cfg_path.llm_model)
+     llm_model = hydra.utils.instantiate(llm_model_cfg)
+     llm_model.gradient_checkpointing_enable()
+     llm_model.config.use_cache = False
+     logger.info('Load llm model done.')
+
+     agent_model_cfg = OmegaConf.load(cfg_path.agent_model)
+     agent_model = hydra.utils.instantiate(agent_model_cfg, llm=llm_model)
+     logger.info('Load agent model done.')
+
+     weight_dtype = torch.float32
+     if accelerator.mixed_precision == "fp16":
+         weight_dtype = torch.float16
+     elif accelerator.mixed_precision == "bf16":
+         weight_dtype = torch.bfloat16
+
+     visual_encoder.to(accelerator.device, dtype=weight_dtype)
+     logger.info('Freeze visual encoder...')
+     visual_encoder.requires_grad_(False)
+
+     if cfg_path.fsdp_plugin is not None:
+         agent_model = accelerator.prepare(agent_model)
+
+     optimizer = torch.optim.AdamW(agent_model.parameters(),
+                                   lr=args.learning_rate,
+                                   betas=[args.adam_beta1, args.adam_beta2],
+                                   eps=args.adam_epsilon,
+                                   weight_decay=args.weight_decay)
+     logger.info('Init optimizer done.')
+     scheduler = get_scheduler(name=args.lr_scheduler_type,
+                               optimizer=optimizer,
+                               num_warmup_steps=args.warmup_steps,
+                               num_training_steps=args.max_steps,
+                               min_lr_ratio=args.min_lr_ratio)
+     # accelerator.register_for_checkpointing(scheduler)
+     train_dataloader = build_dataloader(dataset_cfg=train_dataset_cfg,
+                                         image_transform=image_transform,
+                                         tokenizer=tokenizer,
+                                         batch_size=args.batch_size,
+                                         dataloader_num_workers=args.dataloader_num_workers)
+     if cfg_path.fsdp_plugin is not None:
+         optimizer, scheduler = accelerator.prepare(optimizer, scheduler)
+     else:
+         agent_model, optimizer, scheduler = accelerator.prepare(agent_model, optimizer, scheduler)
+     logger.info('Prepare accelerator done.')
+
+     config_record = merge_config(agent_model=agent_model_cfg,
+                                  llm_model=llm_model,
+                                  visual_encoder=visual_encoder_cfg,
+                                  image_transform=image_transform_cfg,
+                                  tokenizer=tokenizer_cfg,
+                                  train_dataset=train_dataset_cfg,
+                                  train_args=args)
+     accelerator.init_trackers(project_name=args.project_name,
+                               init_kwargs={"wandb": {
+                                   "config": config_record,
+                                   "name": args.expr_name,
+                                   "dir": args.output_dir
+                               }})
+     if args.resume_from_checkpoint is not None:
+         logger.info(f'Load checkpoint from {args.resume_from_checkpoint}')
+         accelerator.load_state(args.resume_from_checkpoint)
+         torch.cuda.empty_cache()
+         gc.collect()
+
+     num_params = trainable_params(agent_model)
+     logger.info("***** Running training *****")
+     logger.info(f"  Total optimization steps = {args.max_steps}")
+     logger.info(f"  Total trainable params = {num_params}")
+     # Only show the progress bar once on each machine.
+     progress_bar = tqdm(range(args.max_steps), disable=not accelerator.is_main_process)
+     progress_bar.set_description("Steps")
+     global_step = 0
+     if args.resume_steps is not None:
+         global_step = args.resume_steps
+         progress_bar.update(args.resume_steps)
+
+     for epoch in range(args.num_train_epochs):
+         agent_model.train()
+         logger.info('Start new epoch')
+
+         for step, batch in enumerate(train_dataloader):
+             with accelerator.accumulate(agent_model):
+                 # accelerator.wait_for_everyone()
+                 # print('1')
+                 with torch.no_grad():
+                     if batch['images'] is not None:
+                         image_embeds = visual_encoder(batch['images'].to(accelerator.device, dtype=weight_dtype))
+                         # image_embeds = visual_encoder(batch['images'])
+                     else:
+                         image_embeds = None
+                 # accelerator.wait_for_everyone()
+                 # print('2')
+                 output = agent_model(input_ids=batch['input_ids'].to(accelerator.device),
+                                      attention_mask=batch['attention_mask'].to(accelerator.device),
+                                      labels=batch['labels'].to(accelerator.device),
+                                      image_embeds=image_embeds,
+                                      embeds_gen_mask=batch['embeds_gen_mask'].to(accelerator.device)
+                                      if batch['embeds_gen_mask'] is not None else None,
+                                      embeds_cmp_mask=batch['embeds_cmp_mask'].to(accelerator.device)
+                                      if batch['embeds_cmp_mask'] is not None else None,
+                                      ids_gen_mask=batch['ids_gen_mask'].to(accelerator.device),
+                                      ids_cmp_mask=batch['ids_cmp_mask'].to(accelerator.device))
+                 # output = agent_model(
+                 #     input_ids=batch['input_ids'],  # .squeeze(0),
+                 #     attention_mask=batch['attention_mask'],  # .squeeze(0),
+                 #     labels=batch['labels'],  # .squeeze(0),
+                 #     image_embeds=image_embeds,
+                 #     embeds_gen_mask=batch['embeds_gen_mask'],  # .squeeze(0),
+                 #     embeds_cmp_mask=batch['embeds_cmp_mask'],  # .squeeze(0),
+                 #     ids_gen_mask=batch['ids_gen_mask'],  # .squeeze(0),
+                 #     ids_cmp_mask=batch['ids_cmp_mask'])  # .squeeze(0))
+                 loss = output['total_loss']
+                 # accelerator.wait_for_everyone()
+                 # print('3')
+                 accelerator.backward(loss)
+                 # accelerator.wait_for_everyone()
+                 # print('4')
+                 if accelerator.sync_gradients:
+                     accelerator.clip_grad_norm_(agent_model.parameters(), max_norm=args.max_grad_norm)
+
+                 optimizer.step()
+                 scheduler.step()
+                 optimizer.zero_grad()
+                 # accelerator.wait_for_everyone()
+                 # print('5')
+
+             if accelerator.sync_gradients:
+                 progress_bar.update(1)
+                 global_step += 1
+
+                 if global_step % args.save_steps == 0:
+                     save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                     accelerator.save_state(save_path)
+
+                 metric = get_metric(output)
+                 metric['lr'] = optimizer.param_groups[0]['lr']
+                 accelerator.log(metric, step=global_step)
+                 metric = {key: (format(value, ".6f") if isinstance(value, float) else value) for key, value in
+                           metric.items()}
+                 if accelerator.is_main_process:
+                     tqdm.write(str(metric))
+                     # print(metric)
+                 if global_step >= args.max_steps:
+                     break
+
+     accelerator.end_training()
+
+
+ if __name__ == '__main__':
+     train()
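The inner loop above follows the standard `accelerate` gradient-accumulation pattern. Below is a self-contained sketch of just that pattern on a toy regression model; the model, random data, and accumulation factor are made up and nothing here reflects the real agent model or dataset.

```python
# Sketch of the accumulate / sync_gradients pattern used above, on a toy model.
import torch
from accelerate import Accelerator

accelerator = Accelerator(gradient_accumulation_steps=4)
model = torch.nn.Linear(16, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

global_step = 0
for step in range(32):
    x = torch.randn(8, 16, device=accelerator.device)
    y = torch.randn(8, 1, device=accelerator.device)
    with accelerator.accumulate(model):
        loss = torch.nn.functional.mse_loss(model(x), y)
        accelerator.backward(loss)
        if accelerator.sync_gradients:
            # Gradients are only clipped on the steps where they are synced,
            # i.e. once per accumulation window.
            accelerator.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    if accelerator.sync_gradients:
        global_step += 1  # count one optimizer update per accumulation window

print('optimizer updates:', global_step)
```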
src/train/train_sdxl_img2img_llm.py ADDED
@@ -0,0 +1,428 @@
+ # flake8: noqa
+ import hydra
+
+ import pyrootutils
+ import os
+ import torch
+ from accelerate import Accelerator
+ from accelerate.logging import get_logger
+ from accelerate.utils import ProjectConfiguration
+
+ from tqdm.auto import tqdm
+ from omegaconf import OmegaConf
+ from omegaconf.dictconfig import DictConfig
+ from diffusers import AutoencoderKL, DDPMScheduler, UNet2DConditionModel, DPMSolverMultistepScheduler, \
+     Transformer2DModel
+
+ from transformers import CLIPTextModel, CLIPTokenizer
+ import argparse
+ from flask import Flask, request
+ from typing import List, Union
+ import json
+ from typing import Optional
+ import transformers
+ from dataclasses import dataclass, field, asdict, is_dataclass
+ from torchdata.dataloader2 import DataLoader2, MultiProcessingReadingService, DistributedReadingService, \
+     SequentialReadingService
+ import logging
+
+ pyrootutils.setup_root(__file__, indicator='.project-root', pythonpath=True)
+ from src.train.schedular import get_scheduler
+ from src.train.dist_utils import all_gather
+
+ # logger = get_logger(__name__, log_level='info')
+ log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ logging.basicConfig(level=logging.INFO, format=log_format)
+
+ logger = logging.getLogger(__name__)
+
+
+ # os.environ["WANDB_MODE"] = "offline"
+
+
+ @dataclass
+ class ConfigPathArguments:
+     image_transform: Optional[str] = field(default=None, metadata={"help": "config path of image transform"})
+     sd_image_transform: Optional[str] = field(default=None,
+                                               metadata={"help": "config path of stable diffusion image transform"})
+     # tokenizer: Optional[str] = field(default=None, metadata={"help": "config path of tokenizer used to initialize tokenizer"})
+     visual_encoder: Optional[str] = field(default=None, metadata={"help": "config path of visual encoder"})
+     # text_encoder: Optional[str] = field(default=None, metadata={"help": "config path of text encoder"})
+     discrete_model: Optional[str] = field(default=None, metadata={"help": "config path of discrete model"})
+     # noise_scheduler: Optional[str] = field(default=None, metadata={"help": "config path of noise scheduler"})
+     # vae: Optional[str] = field(default=None, metadata={"help": "config path of vae"})
+     adapter: Optional[str] = field(default=None, metadata={"help": "config path of adapter"})
+     train_dataset: Optional[str] = field(default=None, metadata={"help": "config path of training dataset"})
+     fsdp_plugin: Optional[str] = field(default=None, metadata={"help": "config path of fsdp plugin"})
+     deepspeed_plugin: Optional[str] = field(default=None, metadata={"help": "config path of deepspeed plugin"})
+     tokenizer: Optional[str] = field(default=None,
+                                      metadata={"help": "config path of tokenizer used to initialize tokenizer"})
+     llm_model: Optional[str] = field(default=None, metadata={"help": "config path of llm"})
+     agent_model: Optional[str] = field(default=None, metadata={"help": "config path of agent"})
+
+
+ @dataclass
+ class TrainingArguments:
+     output_dir: str = field(
+         metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, )
+     diffusion_model_path: Optional[str] = field(default=None, metadata={"help": "path of the pretrained diffusion model"})
+     resume_from_checkpoint: Optional[str] = field(
+         default=None, metadata={"help": "The path to a folder with a valid checkpoint for your model."})
+     resume_steps: Optional[int] = field(default=None, metadata={"help": "The training steps of the saved checkpoint"})
+     learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
+     weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
+     # adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
+     # adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
+     # adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
+     max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."})
+     gradient_accumulation_steps: int = field(
+         default=1, metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."})
+     mixed_precision: Optional[str] = field(
+         default='no',
+         metadata={
+             "help":
+             "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU."
+         })
+     num_train_epochs: int = field(default=3, metadata={"help": "Total number of training epochs to perform."})
+     max_steps: int = field(default=-1, metadata={"help": "Total number of training steps to perform."})
+     save_steps: int = field(default=10000, metadata={"help": "Number of updates steps before two checkpoint saves."})
+     lr_scheduler_type: str = field(default="cosine", metadata={"help": "The scheduler type to use."})
+     warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
+     min_lr_ratio: float = field(default=0.01, metadata={"help": "Minimal learning rate ratio."})
+     dataloader_num_workers: int = field(default=8, metadata={"help": "The number of workers to use for data loading."})
+     project_name: str = field(default="IPAdapter", metadata={"help": "The name of the project"})
+     expr_name: str = field(default="", metadata={"help": "The name of the experiment"})
+
+
+ def build_dataloader(dataset_cfg, image_transform, sd_image_transform, tokenizer, dataloader_num_workers=4):
+     dataset = hydra.utils.instantiate(dataset_cfg,
+                                       image_transform=image_transform,
+                                       sd_image_transform=sd_image_transform,
+                                       tokenizer=tokenizer)
+     mp_service = MultiProcessingReadingService(num_workers=dataloader_num_workers)
+     dist_service = DistributedReadingService()
+     reading_service = SequentialReadingService(dist_service, mp_service)
+     dataloader = DataLoader2(dataset, reading_service=reading_service)
+     return dataloader
+
+
+ def get_metric(output):
+     metric = {}
+     for key, value in output.items():
+         if 'loss' in key:
+             metric[key] = value.item()
+     return metric
+
+
+ def merge_config(**kwargs):
+     config = {}
+     for key, value in kwargs.items():
+         if isinstance(value, argparse.Namespace):
+             config[key] = vars(value)
+         elif isinstance(value, DictConfig):
+             config[key] = OmegaConf.to_object(value)
+         elif is_dataclass(value):
+             config[key] = asdict(value)
+         elif isinstance(value, dict):
+             config[key] = value
+         else:
+             logger.error(f'key: {key}, value: {value} will not be merged.')
+     return config
+
+
+ def trainable_params(model):
+     count = 0
+     for name, param in model.named_parameters():
+         if param.requires_grad:
+             count += param.numel()
+     return count
+
+
+ def train():
+     parser = transformers.HfArgumentParser((ConfigPathArguments, TrainingArguments))
+     cfg_path, args = parser.parse_args_into_dataclasses()
+
+     project_config = ProjectConfiguration(project_dir=args.output_dir,
+                                           logging_dir=os.path.join(args.output_dir, 'logs'))
+
+     assert int(cfg_path.fsdp_plugin is not None) + int(cfg_path.deepspeed_plugin is not None) <= 1
+     if cfg_path.fsdp_plugin is not None:
+         fsdp_plugin_cfg = OmegaConf.load(cfg_path.fsdp_plugin)
+         fsdp_plugin = hydra.utils.instantiate(fsdp_plugin_cfg)
+         logger.info('Use FSDP plugin')
+     else:
+         fsdp_plugin = None
+
+     if cfg_path.deepspeed_plugin is not None:
+         deepspeed_plugin_cfg = OmegaConf.load(cfg_path.deepspeed_plugin)
+         deepspeed_plugin = hydra.utils.instantiate(deepspeed_plugin_cfg)
+         logger.info('Use deepspeed plugin')
+     else:
+         deepspeed_plugin = None
+
+     accelerator = Accelerator(
+         mixed_precision=args.mixed_precision,
+         log_with=['tensorboard', 'wandb'],
+         project_config=project_config,
+         gradient_accumulation_steps=args.gradient_accumulation_steps,
+         step_scheduler_with_optimizer=False,
+         fsdp_plugin=fsdp_plugin,
+         deepspeed_plugin=deepspeed_plugin,
+     )
+     logger.info('Init accelerator done.')
+
+     if cfg_path.deepspeed_plugin is not None:
+         accelerator.state.deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = 100
+
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     image_transform_cfg = OmegaConf.load(cfg_path.image_transform)
+     image_transform = hydra.utils.instantiate(image_transform_cfg)
+     sd_image_transform_cfg = OmegaConf.load(cfg_path.sd_image_transform)
+     sd_image_transform = hydra.utils.instantiate(sd_image_transform_cfg)
+
+     tokenizer_cfg = OmegaConf.load(cfg_path.tokenizer)
+     tokenizer = hydra.utils.instantiate(tokenizer_cfg)
+
+     visual_encoder_cfg = OmegaConf.load(cfg_path.visual_encoder)
+     visual_encoder = hydra.utils.instantiate(visual_encoder_cfg)
+     logger.info('Load visual encoder done.')
+
+     discrete_model_cfg = OmegaConf.load(cfg_path.discrete_model)
+     discrete_model = hydra.utils.instantiate(discrete_model_cfg)
+     logger.info('Load discrete model done.')
+
+     # noise_scheduler_cfg = OmegaConf.load(cfg_path.noise_scheduler)
+     # noise_scheduler = hydra.utils.instantiate(noise_scheduler_cfg)
+
+     # if cfg_path.tokenizer is not None:
+     #     tokenizer_cfg = OmegaConf.load(cfg_path.tokenizer)
+     #     tokenizer = hydra.utils.instantiate(tokenizer_cfg)
+     # else:
+     #     tokenizer_cfg = None
+     #     tokenizer = None
+
+     # if cfg_path.text_encoder is not None:
+     #     text_encoder_cfg = OmegaConf.load(cfg_path.text_encoder)
+     #     text_encoder = hydra.utils.instantiate(text_encoder_cfg)
+     #     logger.info('Load text encoder done.')
+     # else:
+     #     text_encoder_cfg = None
+     #     text_encoder = None
+
+     # vae_cfg = OmegaConf.load(cfg_path.vae)
+     # vae = hydra.utils.instantiate(vae_cfg)
+     # logger.info('Load vae done.')
+
+     # noise_scheduler = DDPMScheduler.from_pretrained(args.diffusion_model_path, subfolder="scheduler")
+     # tokenizer = CLIPTokenizer.from_pretrained(args.diffusion_model_path, subfolder="tokenizer")
+     # text_encoder = CLIPTextModel.from_pretrained(args.diffusion_model_path, subfolder="text_encoder")
+     # vae = AutoencoderKL.from_pretrained(args.diffusion_model_path, subfolder="vae")
+     # unet = UNet2DConditionModel.from_pretrained(args.diffusion_model_path, subfolder="unet")
+     # print('load diffusion model done')
+
+     # noise_scheduler = DPMSolverMultistepScheduler.from_pretrained(args.diffusion_model_path, subfolder="scheduler")
+     noise_scheduler = DDPMScheduler.from_pretrained(args.diffusion_model_path, subfolder="scheduler")
+     text_encoder = None
+     vae = AutoencoderKL.from_pretrained(args.diffusion_model_path, subfolder="vae")
+     unet = UNet2DConditionModel.from_pretrained(args.diffusion_model_path, subfolder="unet")
+
+     unet.enable_xformers_memory_efficient_attention()
+     unet.enable_gradient_checkpointing()
+
+     vae.requires_grad_(False)
+     visual_encoder.requires_grad_(False)
+     discrete_model.requires_grad_(False)
+
+     adapter_cfg = OmegaConf.load(cfg_path.adapter)
+     adapter = hydra.utils.instantiate(adapter_cfg, unet=unet)
+     logger.info('Load adapter done.')
+
+     weight_dtype = torch.float32
+     if accelerator.mixed_precision == "fp16":
+         weight_dtype = torch.float16
+     elif accelerator.mixed_precision == "bf16":
+         weight_dtype = torch.bfloat16
+
+     vae.to(accelerator.device, dtype=weight_dtype)
+     visual_encoder.to(accelerator.device, dtype=weight_dtype)
+     discrete_model.to(accelerator.device, dtype=weight_dtype)
+     if text_encoder is not None:
+         text_encoder.to(accelerator.device, dtype=weight_dtype)
+
+     train_dataset_cfg = OmegaConf.load(cfg_path.train_dataset)
+     train_dataloader = build_dataloader(dataset_cfg=train_dataset_cfg,
+                                         image_transform=image_transform,
+                                         sd_image_transform=sd_image_transform,
+                                         tokenizer=tokenizer,
+                                         dataloader_num_workers=args.dataloader_num_workers)
+
+     llm_model_cfg = OmegaConf.load(cfg_path.llm_model)
+     llm_model = hydra.utils.instantiate(llm_model_cfg)
+     llm_model.gradient_checkpointing_enable()
+     llm_model.config.use_cache = False
+     logger.info('Load llm model done.')
+
+     agent_model_cfg = OmegaConf.load(cfg_path.agent_model)
+     agent_model = hydra.utils.instantiate(agent_model_cfg, llm=llm_model).to(accelerator.device, dtype=weight_dtype)
+     agent_model.requires_grad_(False)
+     agent_model.llm.base_model.model.use_kv_cache_head = False
+     logger.info('Load agent model done.')
+
+     if cfg_path.fsdp_plugin is not None:
+         adapter = accelerator.prepare(adapter)
+
+     optimizer = torch.optim.AdamW(adapter.params_to_opt(), lr=args.learning_rate, weight_decay=args.weight_decay)
+     logger.info('Init optimizer done.')
+     scheduler = get_scheduler(name=args.lr_scheduler_type,
+                               optimizer=optimizer,
+                               num_warmup_steps=args.warmup_steps,
+                               num_training_steps=args.max_steps,
+                               min_lr_ratio=args.min_lr_ratio)
+     # accelerator.register_for_checkpointing(scheduler)
+
+     # adapter.adapter, adapter.resampler, optimizer, scheduler = accelerator.prepare(
+     #     adapter.adapter,
+     #     adapter.resampler,
+     #     optimizer,
+     #     scheduler,
+     # )
+
+     # adapter, optimizer, scheduler = accelerator.prepare(
+     #     adapter,
+     #     optimizer,
+     #     scheduler,
+     # )
+     if cfg_path.fsdp_plugin is not None:
+         optimizer, scheduler = accelerator.prepare(optimizer, scheduler)
+     else:
+         adapter, optimizer, scheduler = accelerator.prepare(adapter, optimizer, scheduler)
+     logger.info('Prepare accelerator done.')
+
+     # config_record = merge_config(discrete_model=discrete_model_cfg,
+     #                              visual_encoder=visual_encoder_cfg,
+     #                              text_encoder=text_encoder_cfg,
+     #                              image_transform=image_transform_cfg,
+     #                              sd_image_transform=sd_image_transform_cfg,
+     #                              tokenizer=tokenizer_cfg,
+     #                              train_dataset=train_dataset_cfg,
+     #                              vae=vae_cfg,
+     #                              adapter=adapter_cfg,
+     #                              train_args=args)
+     config_record = merge_config(discrete_model=discrete_model_cfg,
+                                  visual_encoder=visual_encoder_cfg,
+                                  image_transform=image_transform_cfg,
+                                  sd_image_transform=sd_image_transform_cfg,
+                                  train_dataset=train_dataset_cfg,
+                                  adapter=adapter_cfg,
+                                  train_args=args,
+                                  agent_model=agent_model_cfg,
+                                  llm_model=llm_model,
+                                  tokenizer=tokenizer_cfg)
+     accelerator.init_trackers(project_name=args.project_name,
+                               init_kwargs={"wandb": {
+                                   "config": config_record,
+                                   "name": args.expr_name,
+                                   "dir": args.output_dir
+                               }})
+     if args.resume_from_checkpoint is not None:
+         logger.info(f'Load checkpoint from {args.resume_from_checkpoint}')
+         accelerator.load_state(args.resume_from_checkpoint)
+
+     num_params = trainable_params(adapter)
+     logger.info("***** Running training *****")
+     logger.info(f"  Total optimization steps = {args.max_steps}")
+     logger.info(f"  Total trainable params = {num_params}")
+     for name, param in adapter.named_parameters():
+         if param.requires_grad:
+             print(name)
+     # print(f'adapter: {trainable_params(adapter.adapter)}')
+     # print(f'resampler: {trainable_params(adapter.resampler)}')
+     # Only show the progress bar once on each machine.
+     progress_bar = tqdm(range(args.max_steps), disable=not accelerator.is_main_process)
+     progress_bar.set_description("Steps")
+     global_step = 0
+     if args.resume_steps is not None:
+         global_step = args.resume_steps
+         progress_bar.update(args.resume_steps)
+
+     for epoch in range(args.num_train_epochs):
+         logger.info('Start new epoch')
+         for step, batch in enumerate(train_dataloader):
+             with accelerator.accumulate(adapter):
+                 with torch.no_grad():
+                     image_embeds = visual_encoder(batch['images'].to(accelerator.device, dtype=weight_dtype))
+                     image_embeds = discrete_model.encode_image_embeds(image_embeds)
+                     if text_encoder is not None:
+                         text_embeds = text_encoder(batch['text_input_ids'].to(accelerator.device))[0]
+                     else:
+                         text_embeds = None
+                     latents = vae.encode(
+                         batch["sd_images"].to(accelerator.device, dtype=weight_dtype)).latent_dist.sample()
+                     latents = latents * vae.config.scaling_factor
+                     llm_output = agent_model(input_ids=batch['input_ids'].to(accelerator.device),
+                                              attention_mask=batch['attention_mask'].to(accelerator.device),
+                                              labels=batch['labels'].to(accelerator.device),
+                                              image_embeds=image_embeds,
+                                              embeds_gen_mask=batch['embeds_gen_mask'].to(accelerator.device)
+                                              if batch['embeds_gen_mask'] is not None else None,
+                                              embeds_cmp_mask=batch['embeds_cmp_mask'].to(accelerator.device)
+                                              if batch['embeds_cmp_mask'] is not None else None,
+                                              ids_gen_mask=batch['ids_gen_mask'].to(accelerator.device),
+                                              ids_cmp_mask=batch['ids_cmp_mask'].to(accelerator.device),
+                                              return_recon_image_embeds=True)
+
+                 time_ids = batch['time_ids'].to(accelerator.device)
+
+                 # Sample noise that we'll add to the latents
+                 noise = torch.randn_like(latents)
+                 bsz = latents.shape[0]
+                 # Sample a random timestep for each image
+                 timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
+                 timesteps = timesteps.long()
+
+                 # Add noise to the latents according to the noise magnitude at each timestep
+                 # (this is the forward diffusion process)
+                 noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
+
+                 output = adapter(noisy_latents=noisy_latents,
+                                  timesteps=timesteps,
+                                  image_embeds=llm_output['recon_image_embeds'],
+                                  text_embeds=None,
+                                  noise=noise,
+                                  time_ids=time_ids)
+
+                 loss = output['total_loss']
+                 accelerator.backward(loss)
+                 if accelerator.sync_gradients:
+                     accelerator.clip_grad_norm_(adapter.parameters(), max_norm=args.max_grad_norm)
+                 optimizer.step()
+                 scheduler.step()
+                 optimizer.zero_grad()
+
+             if accelerator.sync_gradients:
+                 progress_bar.update(1)
+                 global_step += 1
+
+                 if global_step % args.save_steps == 0:
+                     save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                     accelerator.save_state(save_path)
+
+                 metric = get_metric(output)
+                 metric['lr'] = optimizer.param_groups[0]['lr']
+                 accelerator.log(metric, step=global_step)
+                 metric = {key: (format(value, ".6f") if isinstance(value, float) else value) for key, value in
+                           metric.items()}
+
+                 # if accelerator.is_local_main_process:
+                 if accelerator.is_main_process:
+                     tqdm.write(str(metric))
+                     # print(metric)
+                 if global_step >= args.max_steps:
+                     break
+
+     accelerator.end_training()
+
+
+ if __name__ == '__main__':
+     train()
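The noising step in the loop above is the standard forward-diffusion process from `diffusers`. Below is a small self-contained sketch with random tensors standing in for the VAE latents and a default `DDPMScheduler` config; the script itself loads the scheduler from `--diffusion_model_path`, and the tensor shapes here are illustrative only.

```python
# Sketch of the forward-diffusion step used above, with random "latents"
# standing in for VAE encodings and a default DDPMScheduler config.
import torch
from diffusers import DDPMScheduler

noise_scheduler = DDPMScheduler()  # the training script loads this from a checkpoint instead

latents = torch.randn(4, 4, 64, 64)  # (batch, channels, height, width) stand-in
noise = torch.randn_like(latents)
timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (latents.shape[0],)).long()

# Mix noise into the latents according to each sample's timestep; the adapter
# is then trained to recover the added noise from these noisy latents.
noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
print(noisy_latents.shape)
```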
utils.py ADDED
@@ -0,0 +1,83 @@
+ import datetime
+ import logging
+ import logging.handlers
+ import os
+ import sys
+
+ handler = None
+
+
+ def build_logger(logger_name, logger_dir):
+     global handler
+
+     formatter = logging.Formatter(
+         fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+         datefmt="%Y-%m-%d %H:%M:%S",
+     )
+
+     # Set the format of root handlers
+     if not logging.getLogger().handlers:
+         logging.basicConfig(level=logging.INFO)
+     logging.getLogger().handlers[0].setFormatter(formatter)
+
+     # Redirect stdout and stderr to loggers
+     stdout_logger = logging.getLogger("stdout")
+     stdout_logger.setLevel(logging.INFO)
+     sl = StreamToLogger(stdout_logger, logging.INFO)
+     sys.stdout = sl
+
+     stderr_logger = logging.getLogger("stderr")
+     stderr_logger.setLevel(logging.ERROR)
+     sl = StreamToLogger(stderr_logger, logging.ERROR)
+     sys.stderr = sl
+
+     # Get logger
+     logger = logging.getLogger(logger_name)
+     logger.setLevel(logging.INFO)
+
+     # Add a file handler for all loggers
+     if handler is None:
+         os.makedirs(logger_dir, exist_ok=True)
+         filename = os.path.join(logger_dir, logger_name + '.log')
+         handler = logging.handlers.TimedRotatingFileHandler(filename, when='D', utc=True)
+         handler.setFormatter(formatter)
+
+         for name, item in logging.root.manager.loggerDict.items():
+             if isinstance(item, logging.Logger):
+                 item.addHandler(handler)
+
+     return logger
+
+
+ class StreamToLogger(object):
+     """
+     Fake file-like stream object that redirects writes to a logger instance.
+     """
+
+     def __init__(self, logger, log_level=logging.INFO):
+         self.terminal = sys.stdout
+         self.logger = logger
+         self.log_level = log_level
+         self.linebuf = ''
+
+     def __getattr__(self, attr):
+         return getattr(self.terminal, attr)
+
+     def write(self, buf):
+         temp_linebuf = self.linebuf + buf
+         self.linebuf = ''
+         for line in temp_linebuf.splitlines(True):
+             # From the io.TextIOWrapper docs:
+             #   On output, if newline is None, any '\n' characters written
+             #   are translated to the system default line separator.
+             # By default sys.stdout.write() expects '\n' newlines and then
+             # translates them so this is still cross platform.
+             if line[-1] == '\n':
+                 self.logger.log(self.log_level, line.rstrip())
+             else:
+                 self.linebuf += line
+
+     def flush(self):
+         if self.linebuf != '':
+             self.logger.log(self.log_level, self.linebuf.rstrip())
+         self.linebuf = ''
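A short usage sketch for `build_logger`; the logger name and directory below are illustrative, not values used elsewhere in this commit.

```python
# Sketch only: set up the file logger and stdout/stderr redirection from utils.py.
from utils import build_logger

logger = build_logger('demo', './logs')   # writes to ./logs/demo.log (rotated daily)
logger.info('logger initialised')
print('plain print() output is captured by the "stdout" logger as well')
```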