Spaces: Running on Zero

gabrielchua committed
Commit • 574151f
Parent(s): a18d449
Update app.py

app.py CHANGED
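Note: the diff below begins at line 12 of app.py, so the file's import block is not part of the commit view. Judging from the names used in the hunks (gr, np, spaces, torch, Image, AutoModelForCausalLM, VLChatProcessor, MultiModalityCausalLM, load_pil_images), it presumably looks roughly like the following sketch; this is an assumption for readability, not part of the commit:

import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.utils.io import load_pil_images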
@@ -12,26 +12,32 @@ from janus.utils.io import load_pil_images
 model_path = "deepseek-ai/Janus-1.3B"

 # Load the VLChatProcessor and tokenizer
+print("Loading VLChatProcessor and tokenizer...")
 vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)
 tokenizer = vl_chat_processor.tokenizer

 # Load the MultiModalityCausalLM model
+print("Loading MultiModalityCausalLM model...")
 vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(
     model_path, trust_remote_code=True
 )
-
+# Move the model to GPU with bfloat16 precision for efficiency
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+vl_gpt = vl_gpt.to(torch.bfloat16 if device.type == "cuda" else torch.float32).to(device).eval()

 @spaces.GPU(duration=120)
-def image_to_latex(image: Image.Image) -> str:
+def text_image_to_text(user_text: str, user_image: Image.Image) -> str:
     """
-
+    Generate a textual response based on user-provided text and image.
+    This can be used for tasks like converting an image of a formula to LaTeX code
+    or generating descriptive captions.
     """
-    # Define the conversation with
+    # Define the conversation with user-provided text and image
     conversation = [
         {
             "role": "User",
-            "content":
-            "images": [
+            "content": user_text,
+            "images": [user_image],
         },
         {"role": "Assistant", "content": ""},
     ]
@@ -42,22 +48,23 @@ def image_to_latex(image: Image.Image) -> str:
     # Prepare the inputs for the model
     prepare_inputs = vl_chat_processor(
         conversations=conversation, images=pil_images, force_batchify=True
-    ).to(
+    ).to(device)

     # Prepare input embeddings
     inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

     # Generate the response from the model
-
-
-
-
-
-
-
-
-
-
+    with torch.no_grad():
+        outputs = vl_gpt.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
+            pad_token_id=tokenizer.eos_token_id,
+            bos_token_id=tokenizer.bos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            max_new_tokens=512,
+            do_sample=False,
+            use_cache=True,
+        )

     # Decode the generated tokens to get the answer
     answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
@@ -88,12 +95,15 @@ def text_to_image(prompt: str) -> Image.Image:

     # Encode the prompt
     input_ids = vl_chat_processor.tokenizer.encode(prompt_text)
-    input_ids = torch.LongTensor(input_ids)
+    input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)

     # Prepare tokens for generation
-
-    tokens[0,
-
+    parallel_size = 1  # Adjust based on GPU memory
+    tokens = torch.zeros((parallel_size*2, len(input_ids[0])), dtype=torch.int).to(device)
+    for i in range(parallel_size*2):
+        tokens[i, :] = input_ids
+        if i % 2 != 0:
+            tokens[i, 1:-1] = vl_chat_processor.pad_id

     # Get input embeddings
     inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
@@ -106,7 +116,7 @@ def text_to_image(prompt: str) -> Image.Image:
     temperature = 1

     # Initialize tensor to store generated tokens
-    generated_tokens = torch.zeros((
+    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(device)

     for i in range(image_token_num_per_image):
         if i == 0:
@@ -128,14 +138,14 @@ def text_to_image(prompt: str) -> Image.Image:
         generated_tokens[:, i] = next_token.squeeze(dim=-1)

         # Prepare for the next step
-        next_token_combined = torch.cat([next_token
+        next_token_combined = torch.cat([next_token, next_token], dim=0).view(-1)
         img_embeds = vl_gpt.prepare_gen_img_embeds(next_token_combined)
         inputs_embeds = img_embeds.unsqueeze(dim=1)

     # Decode the generated tokens to get the image
     dec = vl_gpt.gen_vision_model.decode_code(
         generated_tokens.to(dtype=torch.int),
-        shape=[
+        shape=[parallel_size, 8, img_size//patch_size, img_size//patch_size]
     )
     dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
     dec = np.clip((dec + 1) / 2 * 255, 0, 255).astype(np.uint8)
@@ -152,30 +162,36 @@ with gr.Blocks() as demo:
         """
         # Janus-1.3B Gradio Demo
         This demo showcases two functionalities using the Janus-1.3B model:
-        1. **Image to
+        1. **Text + Image to Text**: Input both text and an image to generate a textual response.
         2. **Text to Image**: Enter a descriptive text prompt to generate a corresponding image.
         """
     )

-    with gr.Tab("Image to
-        gr.Markdown("###
+    with gr.Tab("Text + Image to Text"):
+        gr.Markdown("### Generate Text Based on Input Text and Image")
         with gr.Row():
             with gr.Column():
-
+                user_text_input = gr.Textbox(
+                    lines=2,
+                    placeholder="Enter your instructions or description here...",
+                    label="Input Text",
+                )
+                user_image_input = gr.Image(
                     type="pil",
-                    label="Upload
+                    label="Upload Image",
                     tool="editor",
                 )
-                submit_btn = gr.Button("
+                submit_btn = gr.Button("Generate Text")
             with gr.Column():
-
-                    label="
-                    lines=
+                text_output = gr.Textbox(
+                    label="Generated Text",
+                    lines=15,
+                    interactive=False,
                 )
-        submit_btn.click(fn=
+        submit_btn.click(fn=text_image_to_text, inputs=[user_text_input, user_image_input], outputs=text_output)

     with gr.Tab("Text to Image"):
-        gr.Markdown("### Generate Image
+        gr.Markdown("### Generate Image Based on Text Prompt")
         with gr.Row():
             with gr.Column():
                 prompt_input = gr.Textbox(
@@ -189,9 +205,7 @@ with gr.Blocks() as demo:
                     label="Generated Image",
                 )
         generate_btn.click(fn=text_to_image, inputs=prompt_input, outputs=image_output)
-
-
+
 # Launch the Gradio app
 if __name__ == "__main__":
     demo.launch()
-
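For reference, a minimal sketch of exercising the two functions outside the Gradio UI, assuming app.py is importable as a module and a suitable GPU (or CPU fallback) is available; "formula.png" and the prompt text are placeholder examples, not part of the commit:

# Minimal usage sketch; paths and prompts below are hypothetical.
from PIL import Image
from app import text_image_to_text, text_to_image

answer = text_image_to_text("Convert this formula to LaTeX.", Image.open("formula.png"))
print(answer)

image = text_to_image("A watercolor painting of a lighthouse at dawn")
image.save("generated.png")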