File size: 10,265 Bytes
925c081
ac1256d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
925c081
ac1256d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import spaces
from PIL import Image
import gradio as gr
from huggingface_hub import hf_hub_download, snapshot_download
import torch
# Global numeric settings: permit TF32 in CUDA matmuls and cuDNN ops and select
# the 'high' float32 matmul precision mode.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision('high')
# Replace `reset_parameters` on Linear and LayerNorm with a no-op for the whole
# process. NOTE(review): presumably this skips random initialisation because
# every weight is overwritten from a checkpoint afterwards -- confirm before
# reusing this module anywhere that trains layers from scratch.
setattr(torch.nn.Linear, 'reset_parameters', lambda self: None)
setattr(torch.nn.LayerNorm, 'reset_parameters', lambda self: None)
import os
import time
import argparse
from tokenizer_image.vq_model import VQ_models
from models.gpt import GPT_models
from models.generate import generate
from t5 import T5Embedder
# Set TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): this variable is read by the Hugging Face tokenizers library;
# presumably set here to avoid its fork/parallelism warning -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Device string used for every model placement in this script.
device = "cuda"

# Maps a GPT model name (the value of --gpt-model) to the tuple that
# load_model unpacks as (vq_ckpt, gpt_ckpt, image_size): the VQ tokenizer
# checkpoint filename, the GPT checkpoint filename, and the image size that is
# divided by --downsample-size to obtain the latent grid size.
model2ckpt = {
    "GPT-XL": ("vq_ds16_t2i.pt", "t2i_XL_stage2_512.pt", 512),
}

def load_model(args):
    """Download every checkpoint and build the models used by the demo.

    Files are fetched with ``hf_hub_download`` into the current directory
    (LlamaGen weights) and into ``./flan-t5-xl`` (T5 encoder files).

    Args:
        args: Parsed command-line namespace. Reads ``gpt_model``, ``vq_model``,
            ``codebook_size``, ``codebook_embed_dim``, ``precision``,
            ``downsample_size``, ``num_classes``, ``cls_token_num``,
            ``gpt_type``, ``from_fsdp``, ``compile`` and
            ``t5_feature_max_len``.

    Returns:
        A tuple ``(t5_model, vq_model, gpt_model, image_size)`` where
        ``image_size`` is the third entry of ``model2ckpt[args.gpt_model]``.

    Raises:
        Exception: If the GPT checkpoint is not flagged as FSDP and contains
            none of the keys ``"model"``, ``"module"`` or ``"state_dict"``.
    """
    ckpt_folder = './'
    t5_folder = os.path.join(ckpt_folder, "flan-t5-xl")
    if not os.path.exists(t5_folder):
        os.makedirs(t5_folder, exist_ok=True)
    vq_ckpt, gpt_ckpt, image_size = model2ckpt[args.gpt_model]

    # LlamaGen weights first (tokenizer, then GPT), followed by each file that
    # makes up the flan-t5-xl text encoder.
    for llamagen_file in (vq_ckpt, gpt_ckpt):
        hf_hub_download(repo_id="peizesun/llamagen_t2i", filename=llamagen_file, local_dir=ckpt_folder)
    t5_files = (
        "config.json",
        "pytorch_model-00001-of-00002.bin",
        "pytorch_model-00002-of-00002.bin",
        "pytorch_model.bin.index.json",
        "special_tokens_map.json",
        "spiece.model",
        "tokenizer_config.json",
    )
    for t5_file in t5_files:
        hf_hub_download(repo_id="google/flan-t5-xl", filename=t5_file, local_dir=t5_folder)

    # --- image tokenizer (VQ model) ---
    build_vq = VQ_models[args.vq_model]
    vq_model = build_vq(
        codebook_size=args.codebook_size,
        codebook_embed_dim=args.codebook_embed_dim,
    )
    vq_model.to(device)
    vq_model.eval()
    vq_state = torch.load(ckpt_folder + vq_ckpt, map_location="cpu")
    vq_model.load_state_dict(vq_state["model"])
    del vq_state
    print("image tokenizer is loaded")

    # --- GPT model ---
    dtype_by_flag = {'none': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16}
    precision = dtype_by_flag[args.precision]
    latent_size = image_size // args.downsample_size
    build_gpt = GPT_models[args.gpt_model]
    gpt_model = build_gpt(
        vocab_size=args.codebook_size,
        block_size=latent_size ** 2,
        num_classes=args.num_classes,
        cls_token_num=args.cls_token_num,
        model_type=args.gpt_type,
    )
    gpt_model = gpt_model.to(device=device, dtype=precision)

    gpt_state = torch.load(ckpt_folder + gpt_ckpt, map_location="cpu")
    if args.from_fsdp:
        # FSDP checkpoints are the bare state dict.
        model_weight = gpt_state
    else:
        # Wrapper keys tried in order: ddp, deepspeed, generic.
        for wrapper_key in ("model", "module", "state_dict"):
            if wrapper_key in gpt_state:
                model_weight = gpt_state[wrapper_key]
                break
        else:
            raise Exception("please check model weight")
    # strict=False: key mismatches between checkpoint and model are not fatal.
    gpt_model.load_state_dict(model_weight, strict=False)
    gpt_model.eval()
    del gpt_state
    print("gpt model is loaded")

    if not args.compile:
        print("no need to compile model in demo")
    else:
        print("compiling the model...")
        # torch.compile requires PyTorch 2.0 (optional).
        gpt_model = torch.compile(gpt_model, mode="reduce-overhead", fullgraph=True)

    # --- T5 text encoder, loaded from the files downloaded above ---
    t5_model = T5Embedder(
        device=device,
        local_cache=True,
        cache_dir=ckpt_folder,
        dir_or_name="flan-t5-xl",
        torch_dtype=precision,
        model_max_length=args.t5_feature_max_len,
    )

    return t5_model, vq_model, gpt_model, image_size


@spaces.GPU
def infer(cfg_scale, top_k, top_p, temperature, prompt, seed):
    """Generate four images for a single text prompt.

    Uses the module-level ``t5_model``, ``gpt_model``, ``vq_model``, ``args``
    and ``latent_size``.

    Args:
        cfg_scale: Classifier-free guidance scale forwarded to ``generate``.
        top_k: Top-k sampling cutoff forwarded to ``generate``.
        top_p: Top-p sampling cutoff forwarded to ``generate``.
        temperature: Sampling temperature forwarded to ``generate``.
        prompt: Text prompt; it is repeated to form a batch of four.
        seed: Value passed to ``torch.manual_seed`` right before sampling.

    Returns:
        A list of ``PIL.Image`` objects, one per batch element.
    """
    prompts = [prompt] * 4

    # Inference only: without no_grad the T5 encoder and VQ decoder would
    # record autograd state for a backward pass that never happens.
    with torch.no_grad():
        caption_embs, emb_masks = t5_model.get_text_embeddings(prompts)

        if not args.no_left_padding:
            print(f"processing left-padding...")
            # A naive way to implement left-padding: rotate each embedding so
            # its first `valid_num` rows move to the end, and flip the mask.
            # NOTE(review): assumes the incoming mask marks valid tokens first
            # (right-padded) -- confirm against T5Embedder.
            new_emb_masks = torch.flip(emb_masks, dims=[-1])
            new_caption_embs = []
            for idx, (caption_emb, emb_mask) in enumerate(zip(caption_embs, emb_masks)):
                valid_num = int(emb_mask.sum().item())
                print(f'  prompt {idx} token len: {valid_num}')
                new_caption_embs.append(
                    torch.cat([caption_emb[valid_num:], caption_emb[:valid_num]])
                )
            new_caption_embs = torch.stack(new_caption_embs)
        else:
            new_caption_embs, new_emb_masks = caption_embs, emb_masks

        # Zero out embedding rows at masked positions.
        c_indices = new_caption_embs * new_emb_masks[:, :, None]
        c_emb_masks = new_emb_masks
        qzshape = [len(c_indices), args.codebook_embed_dim, latent_size, latent_size]

        t1 = time.time()
        torch.manual_seed(seed)
        index_sample = generate(
            gpt_model, c_indices, latent_size ** 2,
            c_emb_masks,
            cfg_scale=cfg_scale, cfg_interval=args.cfg_interval,
            temperature=temperature, top_k=top_k,
            top_p=top_p, sample_logits=True,
        )
        sampling_time = time.time() - t1
        print(f"gpt sampling takes about {sampling_time:.2f} seconds.")

        t2 = time.time()
        samples = vq_model.decode_code(index_sample, qzshape)  # output value is between [-1, 1]
        decoder_time = time.time() - t2
        print(f"decoder takes about {decoder_time:.2f} seconds.")

        # Convert to PIL.Image format: rescale to [0, 255], move the channel
        # axis last, and hand uint8 arrays on the CPU to PIL.
        samples = samples.mul(127.5).add_(128.0).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()

    return [Image.fromarray(sample) for sample in samples]


# Command-line options for the demo. Parsed once at import time; the resulting
# `args` namespace is read by load_model and infer.
parser = argparse.ArgumentParser()

# Text encoder (T5) options.
parser.add_argument("--t5-path", type=str, default=".")
parser.add_argument("--t5-feature-max-len", type=int, default=120)
parser.add_argument("--t5-feature-dim", type=int, default=2048)
parser.add_argument("--no-left-padding", action="store_true", default=False)

# GPT model options.
parser.add_argument(
    "--gpt-model",
    type=str,
    choices=list(GPT_models.keys()),
    default="GPT-XL",
)
parser.add_argument(
    "--gpt-type",
    type=str,
    choices=["c2i", "t2i"],
    default="t2i",
    help="class-conditional or text-conditional",
)
parser.add_argument("--from-fsdp", action="store_true")
parser.add_argument(
    "--cls-token-num",
    type=int,
    default=120,
    help="max token number of condition input",
)
parser.add_argument(
    "--precision",
    type=str,
    default="bf16",
    choices=["none", "fp16", "bf16"],
)
parser.add_argument("--compile", action="store_true", default=False)

# Image tokenizer (VQ) options.
parser.add_argument(
    "--vq-model",
    type=str,
    choices=list(VQ_models.keys()),
    default="VQ-16",
)
parser.add_argument(
    "--codebook-size",
    type=int,
    default=16384,
    help="codebook size for vector quantization",
)
parser.add_argument(
    "--codebook-embed-dim",
    type=int,
    default=8,
    help="codebook dimension for vector quantization",
)
parser.add_argument("--downsample-size", type=int, choices=[8, 16], default=16)
parser.add_argument("--num-classes", type=int, default=1000)

# Sampling options. In the visible code only --cfg-interval is read from
# `args` during inference; the other values reach infer as call arguments.
parser.add_argument("--cfg-scale", type=float, default=7.5)
parser.add_argument("--cfg-interval", type=float, default=-1)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument(
    "--top-k",
    type=int,
    default=2000,
    help="top-k value to sample with",
)
parser.add_argument(
    "--temperature",
    type=float,
    default=1.0,
    help="temperature value to sample with",
)
parser.add_argument(
    "--top-p",
    type=float,
    default=1.0,
    help="top-p value to sample with",
)

args = parser.parse_args()

# Build all models once at start-up; infer reads these module-level names.
t5_model, vq_model, gpt_model, image_size = load_model(args)
# Side length of the latent token grid (image size divided by the downsample factor).
latent_size = image_size // args.downsample_size

# Sample prompts offered in the UI via gr.Examples.
examples = [
    "A fluffy golden retriever puppy with big, soulful eyes sits in a sunlit garden, surrounded by colorful flowers and butterflies fluttering around its wagging tail.",
    "A steaming bowl of Pho, filled with translucent rice noodles and thin slices of savory beef, topped with a heaping of fresh bean sprouts, a wedge of lime on the side, and a sprinkle of chopped green onions and cilantro.",
    "An ethereal black and white landscape, where a solitary, sinuous black tree stands stark against a stark white snowy backdrop. Its branches twist intricately towards the sky, casting dramatic shadows on the untouched snow below.",        
]

# Gradio UI: a single "Generate" tab with sampling controls and the prompt box
# in the left column and the image gallery in the right column.
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align: center'>Autoregressive Model Beats Diffusion: Llama for Scalable Image Generation</h1>")

    with gr.Tabs():
        with gr.TabItem('Generate'):
            with gr.Row():
                with gr.Column():
                    # Sampling controls; their values are passed to infer on click.
                    cfg_scale = gr.Slider(minimum=1, maximum=25, step=0.1, value=7.5, label='Classifier-free Guidance Scale')
                    # Upper bound 16384 equals the default --codebook-size.
                    top_k = gr.Slider(minimum=1, maximum=16384, step=1, value=4000, label='Top-K')
                    top_p = gr.Slider(minimum=0., maximum=1.0, step=0.1, value=1.0, label="Top-P")
                    temperature = gr.Slider(minimum=0., maximum=1.0, step=0.1, value=1.0, label='Temperature')
                    seed = gr.Slider(minimum=0, maximum=1000, step=1, value=0, label='Seed')
                    with gr.Row():
                        text_prompt = gr.Textbox(
                            label="Enter your prompt",
                            show_label=False,
                            max_lines=1,
                            placeholder="Enter your prompt",
                        )
                    button = gr.Button("Generate", variant="primary")
                    # Clicking an example only fills the prompt box; no fn/outputs
                    # are attached, so generation still requires the button.
                    gr.Examples(
                        label="Examples (select one example, and click Generate button)",
                        examples=examples,
                        inputs=text_prompt,
                        # outputs=[result],
                        # fn=generate,
                    )
                with gr.Column():
                    output = gr.Gallery(label='Generated Images', height=700)
                    # The input order here must match infer's parameter order.
                    button.click(infer, inputs=[cfg_scale, top_k, top_p, temperature, text_prompt, seed], outputs=[output])
    # Enable the request queue, then start the server. Both calls are made
    # while still inside the Blocks context.
    demo.queue()
    demo.launch(debug=True, share=True)