Spaces:

gabrielchua
/

janus-demo

Running on Zero

App Files Files Community

gabrielchua committed 12 days ago

Commit

4e757f3

•

1 Parent(s): 9f78eb1

Create app.py

Browse files

Files changed (1) hide show

app.py +197 -0

app.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import os
+import torch
+import gradio as gr
+import numpy as np
+import spaces
+from PIL import Image
+from transformers import AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from janus.utils.io import load_pil_images
+# Hugging Face Hub identifier of the checkpoint shared by both demo tabs
+model_path = "deepseek-ai/Janus-1.3B"
+# Processor used to format conversations/images for the model, plus its tokenizer
+vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+# Load the multimodal causal LM (the checkpoint ships custom code, hence trust_remote_code)
+vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+)
+# Cast to bfloat16, move to the GPU, then switch to evaluation mode
+vl_gpt = vl_gpt.to(torch.bfloat16)
+vl_gpt = vl_gpt.cuda()
+vl_gpt = vl_gpt.eval()
+@spaces.GPU(duration=120)
+@torch.inference_mode()
+def image_to_latex(image: Image.Image) -> str:
+    """
+    Convert an uploaded image of a formula into LaTeX code.
+
+    Args:
+        image: PIL image delivered by the Gradio image component, or ``None``
+            when the user submits without uploading anything.
+
+    Returns:
+        The model's decoded answer (special tokens stripped).
+
+    Raises:
+        gr.Error: If no image was provided.
+    """
+    # Gradio passes None when the image component is empty; fail with a
+    # readable UI message instead of an AttributeError deep in the processor.
+    if image is None:
+        raise gr.Error("Please upload an image of a formula first.")
+    # Single-turn conversation: the placeholder marks where the image goes
+    conversation = [
+        {
+            "role": "User",
+            "content": "<image_placeholder>\nConvert the formula into latex code.",
+            "images": [image],
+        },
+        {"role": "Assistant", "content": ""},
+    ]
+    # The upload is already a PIL image, so use it directly.
+    # NOTE(review): janus.utils.io.load_pil_images appears to open each
+    # "images" entry as a file path, which fails for PIL objects - confirm.
+    pil_images = [image.convert("RGB")]
+    # Tokenize the conversation and batch it together with the image
+    prepare_inputs = vl_chat_processor(
+        conversations=conversation, images=pil_images, force_batchify=True
+    ).to(vl_gpt.device)
+    # Build the input embeddings from the prepared text and image inputs
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    # Greedy decoding (do_sample=False), capped at 512 new tokens
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False,
+        use_cache=True,
+    )
+    # Decode the first (only) sequence of the batch back to text
+    return tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+@spaces.GPU(duration=120)
+@torch.inference_mode()
+def text_to_image(prompt: str) -> Image.Image:
+    """
+    Generate an image based on the input text prompt.
+
+    Image tokens are sampled one at a time with classifier-free guidance:
+    row 0 of the batch carries the real prompt (conditional) and row 1 a
+    padded-out prompt (unconditional); their logits are mixed at each step.
+
+    Args:
+        prompt: Text description of the image to generate.
+
+    Returns:
+        The decoded image as a PIL image (img_size x img_size pixels).
+    """
+    # Define the conversation with the user prompt
+    conversation = [
+        {"role": "User", "content": prompt},
+        {"role": "Assistant", "content": ""},
+    ]
+    # Apply the SFT template, then append the tag that opens image generation
+    sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
+        conversations=conversation,
+        sft_format=vl_chat_processor.sft_format,
+        system_prompt="",
+    )
+    prompt_text = sft_format + vl_chat_processor.image_start_tag
+    input_ids = torch.LongTensor(vl_chat_processor.tokenizer.encode(prompt_text))
+    # Allocate on the model's device (same convention as image_to_latex)
+    device = vl_gpt.device
+    # Row 0: conditional prompt. Row 1: unconditional prompt, where only the
+    # interior is replaced by padding so the first token and the trailing
+    # image-start tag are preserved (matches the Janus reference script;
+    # padding the whole row drops those two tokens).
+    tokens = torch.zeros((2, len(input_ids)), dtype=torch.int, device=device)
+    tokens[0, :] = input_ids
+    tokens[1, :] = input_ids
+    tokens[1, 1:-1] = vl_chat_processor.pad_id
+    # Get input embeddings
+    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
+    # Generation parameters
+    image_token_num_per_image = 576
+    img_size = 384
+    patch_size = 16
+    cfg_weight = 5
+    temperature = 1
+    # Buffer for the sampled image tokens (one image)
+    generated_tokens = torch.zeros(
+        (1, image_token_num_per_image), dtype=torch.int, device=device
+    )
+    # No KV cache on the first step; afterwards only the newest token is fed
+    past_key_values = None
+    for i in range(image_token_num_per_image):
+        outputs = vl_gpt.language_model.model(
+            inputs_embeds=inputs_embeds,
+            use_cache=True,
+            past_key_values=past_key_values,
+        )
+        past_key_values = outputs.past_key_values
+        hidden_states = outputs.last_hidden_state
+        # Logits for the last position, then classifier-free guidance:
+        # even rows are conditional, odd rows unconditional
+        logits = vl_gpt.gen_head(hidden_states[:, -1, :])
+        logit_cond = logits[0::2, :]
+        logit_uncond = logits[1::2, :]
+        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
+        # Sample the next image token
+        probs = torch.softmax(logits / temperature, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+        generated_tokens[:, i] = next_token.squeeze(dim=-1)
+        # Feed the same sampled token to both the conditional and the
+        # unconditional row on the next step
+        next_token_combined = torch.cat(
+            [next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1
+        ).view(-1)
+        img_embeds = vl_gpt.prepare_gen_img_embeds(next_token_combined)
+        inputs_embeds = img_embeds.unsqueeze(dim=1)
+    # Decode the generated tokens to get the image
+    dec = vl_gpt.gen_vision_model.decode_code(
+        generated_tokens.to(dtype=torch.int),
+        shape=[1, 8, img_size // patch_size, img_size // patch_size],
+    )
+    # Channels-last float array, then map from [-1, 1] to [0, 255] uint8
+    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
+    dec = np.clip((dec + 1) / 2 * 255, 0, 255).astype(np.uint8)
+    # Convert the single generated image to PIL
+    return Image.fromarray(dec[0])
+# Create the Gradio interface: one tab per model capability
+with gr.Blocks() as demo:
+    gr.Markdown(
+        """
+        # Janus-1.3B Gradio Demo
+        This demo showcases two functionalities using the Janus-1.3B model:
+        1. **Image to LaTeX**: Upload an image of a mathematical formula to convert it into LaTeX code.
+        2. **Text to Image**: Enter a descriptive text prompt to generate a corresponding image.
+        """
+    )
+    with gr.Tab("Image to LaTeX"):
+        gr.Markdown("### Convert Formula Image to LaTeX Code")
+        with gr.Row():
+            with gr.Column():
+                # NOTE(review): the `tool` argument is not accepted by newer
+                # Gradio releases - confirm against the pinned Gradio version.
+                image_input = gr.Image(
+                    type="pil",
+                    label="Upload Formula Image",
+                    tool="editor",
+                )
+                submit_btn = gr.Button("Convert to LaTeX")
+            with gr.Column():
+                latex_output = gr.Textbox(
+                    label="LaTeX Code",
+                    lines=10,
+                )
+        # Run the understanding path when the button is clicked
+        submit_btn.click(fn=image_to_latex, inputs=image_input, outputs=latex_output)
+    with gr.Tab("Text to Image"):
+        gr.Markdown("### Generate Image from Text Prompt")
+        with gr.Row():
+            with gr.Column():
+                prompt_input = gr.Textbox(
+                    lines=2,
+                    placeholder="Enter your image description here...",
+                    label="Text Prompt",
+                )
+                generate_btn = gr.Button("Generate Image")
+            with gr.Column():
+                image_output = gr.Image(
+                    label="Generated Image",
+                )
+        # Run the generation path when the button is clicked
+        generate_btn.click(fn=text_to_image, inputs=prompt_input, outputs=image_output)
+# Launch the Gradio app only when executed as a script
+if __name__ == "__main__":
+    demo.launch()