# Florence / app.py
import subprocess

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Install FlashAttention at startup; the env flag skips the CUDA build so the
# prebuilt wheel is used (the usual pattern on GPU Spaces).
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
# Load the Florence-2 checkpoint fine-tuned for DocVQA, plus its processor.
model = AutoModelForCausalLM.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained('HuggingFaceM4/Florence-2-DocVQA', trust_remote_code=True)

DESCRIPTION = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"
@spaces.GPU
def run_example(task_prompt, image, text_input=None):
    """Run one Florence-2 generation for the given task prompt and PIL image."""
    # Florence-2 task tokens (e.g. '<DocVQA>') are concatenated with the user question.
    if text_input is None:
        prompt = task_prompt
    else:
        prompt = task_prompt + text_input
    inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    # Keep special tokens: post_process_generation needs them to parse the task output.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer
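
# Usage (a minimal sketch; "invoice.png" is a hypothetical local file, not part
# of this Space):
#   img = Image.open("invoice.png")
#   answer = run_example('<DocVQA>', img, text_input="What is the invoice total?")
#   # -> {'<DocVQA>': '...answer text...'}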
def process_image(image, text_input=None):
    image = Image.fromarray(image)  # gr.Image passes a NumPy array; convert to PIL
    task_prompt = '<DocVQA>'
    # post_process_generation returns {task_prompt: answer}; strip padding tokens.
    results = run_example(task_prompt, image, text_input)[task_prompt].replace("<pad>", "")
    # The second return value fills the (unused) output image component.
    return results, None
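
# Calling the handler outside Gradio (a sketch; "receipt.jpg" is a hypothetical file):
#   import numpy as np
#   arr = np.array(Image.open("receipt.jpg"))
#   text, _ = process_image(arr, "What is the date on the receipt?")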
css = """
#output {
  height: 500px;
  overflow: auto;
  border: 1px solid #ccc;
}
"""
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Florence-2 DocVQA"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")
                output_img = gr.Image(label="Output Image")

        gr.Examples(
            examples=[
                ["image1.jpg", 'Object Detection'],
                ["image2.jpg", 'OCR with Region']
            ],
            # Each example row carries two values, so both input components are listed.
            inputs=[input_img, text_input],
            outputs=[output_text, output_img],
            fn=process_image,
            cache_examples=True,
            label='Try examples'
        )

        submit_btn.click(process_image, [input_img, text_input], [output_text, output_img])

demo.launch(debug=True)