File size: 3,272 Bytes
ade70cf 1d51385 d69fd19 d7f29ce ade70cf 1d51385 d7f29ce d9cf2fe d7f29ce ade70cf d7f29ce ade70cf 1d51385 d7f29ce 39ae23a ade70cf 50fae8a d502400 50fae8a ade70cf d256f3b 50fae8a ca16909 d256f3b beec895 50fae8a 1d51385 69958d1 ade70cf 1d51385 ade70cf c8f76e0 6172e67 c8f76e0 6172e67 1d51385 6172e67 8b2d7f4 6172e67 8b2d7f4 6172e67 c8f76e0 6172e67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import copy
import io
import os
import random
import subprocess
import sys

import gradio as gr
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import requests
import spaces
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoProcessor, AutoModelForCausalLM
# Install flash-attn at startup (best effort).
# `env=` replaces the child's environment wholesale, so the flag is merged into
# a copy of the current environment; passing it alone would drop PATH, HOME,
# proxy settings and everything else pip relies on.
# The command is an argument list run through the current interpreter: no shell
# is involved, and pip installs into the environment this app is running in.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "flash-attn", "--no-build-isolation",
    ],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,  # the return code is deliberately ignored: a failed install is not fatal here
)

# Model and processor are loaded once at import time; run_example uses both.
# The model is moved to the "cuda" device and put in eval mode.
model = AutoModelForCausalLM.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True)

# Markdown heading rendered at the top of the demo page.
DESCRIPTION = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"

# Named colors; not referenced by any code visible in this file.
colormap = ['blue', 'orange', 'green', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan', 'red',
            'lime', 'indigo', 'violet', 'aqua', 'magenta', 'coral', 'gold', 'tan', 'skyblue']
@spaces.GPU
def run_example(task_prompt, image, text_input=None):
    """Run one generation pass and return the post-processed result.

    Args:
        task_prompt: Task token string. It starts the prompt and is also
            passed as ``task`` to the post-processing step.
        image: Image given to the processor; its ``width`` and ``height``
            attributes are read for post-processing.
        text_input: Optional text appended directly after ``task_prompt``.
            When ``None``, the prompt is ``task_prompt`` alone.

    Returns:
        The value produced by ``processor.post_process_generation``.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Tokenize text and preprocess the image, then move the tensors to "cuda".
    model_inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")

    # Deterministic beam search: 3 beams, no sampling.
    output_ids = model.generate(
        input_ids=model_inputs["input_ids"],
        pixel_values=model_inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept in the decoded text; only the first sequence
    # of the batch is post-processed.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
    return processor.post_process_generation(
        decoded[0],
        task=task_prompt,
        image_size=(image.width, image.height),
    )
def process_image(image, text_input=None):
    """Run the '<DocVQA>' task on an uploaded image and return the answer text.

    Args:
        image: NumPy array of the uploaded picture, or ``None`` when no
            picture was supplied.
        text_input: Optional text appended to the task prompt.

    Returns:
        A 2-tuple ``(answer, None)``: the model's answer with every "<pad>"
        marker removed, and ``None`` for the second output slot.

    Raises:
        gr.Error: If ``image`` is ``None``.
    """
    if image is None:
        # Without this guard Image.fromarray(None) fails with an opaque
        # AttributeError; gr.Error carries a readable message instead.
        raise gr.Error("Please provide an input image.")

    pil_image = Image.fromarray(image)  # Convert NumPy array to PIL Image
    task_prompt = '<DocVQA>'
    parsed = run_example(task_prompt, pil_image, text_input)
    answer = parsed[task_prompt].replace("<pad>", "")
    return answer, None
# Stylesheet handed to gr.Blocks below.
# NOTE(review): the rule targets #output, but no component in this block sets
# elem_id="output", so it appears to be unused -- confirm before wiring it up
# or removing it.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""

# Page layout: a Markdown heading, then one tab with the inputs in the left
# column and the outputs in the right column, followed by the example gallery.
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    # NOTE(review): the tab says "Image Captioning" while process_image runs
    # the '<DocVQA>' task -- the label may be a leftover; confirm.
    with gr.Tab(label="Florence-2 Image Captioning"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_img = gr.Image(label="Output Image")

        # Each example row is [image path, question], matching `inputs`.
        gr.Examples(
            examples=[
                ["idefics2_architecture.png", 'How many tokens per image does it use?'],
                ["idefics2_architecture.png", 'How large can the input images be?'],
                ["idefics2_architecture.png", 'Up to which size can the images be?'],
                ["image.jpg", "What's the share of Industry Switchers Gained?"],
            ],
            inputs=[input_img, text_input],
            outputs=[output_text, output_img],
            fn=process_image,
            cache_examples=True,
            label='Try examples',
        )

        # The Submit button runs the same function as the examples.
        submit_btn.click(process_image, [input_img, text_input], [output_text, output_img])

demo.launch(debug=True)