"""Gradio demo for Florence-2-DocVQA: document visual question answering.

Loads the HuggingFaceM4/Florence-2-DocVQA checkpoint onto the GPU and exposes
an image + question -> answer interface (Hugging Face Spaces app).
"""
import copy
import io
import os
import random
import subprocess

# Install flash-attn at startup (standard HF Spaces pattern for Florence-2).
# NOTE(review): the original passed env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
# which REPLACED the whole environment (dropping PATH, HOME, ...); merging with
# os.environ keeps `pip` resolvable by the shell while still setting the flag.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

import gradio as gr
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import requests
import spaces
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoModelForCausalLM, AutoProcessor

MODEL_ID = 'HuggingFaceM4/Florence-2-DocVQA'

# Model and processor are loaded once at import time; the model lives on the GPU
# in eval mode for the lifetime of the app.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

DESCRIPTION = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"

# Palette kept for parity with the other Florence-2 demos; the DocVQA path
# does not draw boxes, so it is currently unused.
colormap = ['blue', 'orange', 'green', 'purple', 'brown', 'pink', 'gray', 'olive',
            'cyan', 'red', 'lime', 'indigo', 'violet', 'aqua', 'magenta', 'coral',
            'gold', 'tan', 'skyblue']


@spaces.GPU
def run_example(task_prompt, image, text_input=None):
    """Run Florence-2 on *image* with *task_prompt* (plus optional *text_input*).

    Args:
        task_prompt: Florence-2 task token, e.g. '<DocVQA>'.
        image: a PIL image.
        text_input: optional question text appended to the task token.

    Returns:
        The parsed-answer dict from ``processor.post_process_generation``,
        keyed by *task_prompt*.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,  # deterministic: pure beam search
        num_beams=3,
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    return processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )


def process_image(image, text_input=None):
    """Gradio callback: answer *text_input* about *image*.

    Returns ``(answer_text, None)``; the second slot feeds the output-image
    component, which this task never populates.
    """
    image = Image.fromarray(image)  # Gradio delivers images as NumPy arrays
    # NOTE(review): the source had task_prompt = '' and .replace("", "") — the
    # angle-bracket task token and the </s> sentinel were evidently stripped by
    # an HTML pass.  Restored to the standard Florence-2 DocVQA task token.
    task_prompt = '<DocVQA>'
    results = run_example(task_prompt, image, text_input)[task_prompt].replace("</s>", "")
    return results, None


css = """
#output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
}
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 Image Captioning"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_img = gr.Image(label="Output Image")
        gr.Examples(
            examples=[
                ["idefics2_architecture.png", 'How many tokens per image does it use?'],
                ["idefics2_architecture.png", 'How large can the input images be?'],
                ["idefics2_architecture.png", 'Up to which size can the images be?'],
                ["image.jpg", "What's the share of Industry Switchers Gained?"],
            ],
            inputs=[input_img, text_input],
            outputs=[output_text, output_img],
            fn=process_image,
            cache_examples=True,
            label='Try examples',
        )
        submit_btn.click(process_image, [input_img, text_input], [output_text, output_img])

demo.launch(debug=True)