File size: 3,312 Bytes
ade70cf 1d51385 d69fd19 d7f29ce ef3da92 d7f29ce 39ae23a ade70cf 50fae8a d502400 50fae8a ade70cf d256f3b 833928a ca16909 d256f3b beec895 50fae8a 1d51385 69958d1 ade70cf 1d51385 ade70cf c8f76e0 6172e67 c8f76e0 ccd3ca3 c8f76e0 6172e67 833928a 6172e67 1d51385 6172e67 8b2d7f4 ef3da92 8b2d7f4 6172e67 8b2d7f4 ccd3ca3 6172e67 833928a 6172e67 ccd3ca3 6172e67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import os
import subprocess

import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces
from PIL import Image
# Install flash-attn with pip when the module is loaded.
# The flag is merged into a copy of the current environment on purpose:
# a bare dict passed as ``env`` replaces the child's environment entirely,
# so pip would run without PATH, HOME, proxy settings, etc.
# FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn build to
# skip compiling its CUDA kernels -- confirm against the flash-attn docs.
# The return code is not checked, so a failed install does not abort startup.
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)
# Hub repository that provides both the model weights and the processor.
MODEL_ID = 'HuggingFaceM4/Florence-2-DocVQA'

# Loaded once at import time; the model is moved to CUDA and put in eval mode.
# trust_remote_code=True lets transformers execute the modelling/processing
# code shipped inside the Hub repository.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True).to("cuda").eval()
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Markdown rendered at the top of the page.
TITLE = "# [Florence-2-DocVQA Demo](https://huggingface.co/HuggingFaceM4/Florence-2-DocVQA)"
# The fine-tuning link uses an absolute URL: the previous bare relative target
# ("finetune-florence2") did not resolve from the demo page.
DESCRIPTION = (
    "The demo for Florence-2 fine-tuned on DocVQA dataset. "
    "You can find the notebook [here](https://colab.research.google.com/drive/1hKDrJ5AH_o7I95PtZ9__VlCTNAo1Gjpf?usp=sharing). "
    "Read more about Florence-2 fine-tuning [here](https://huggingface.co/blog/finetune-florence2)."
)

# Colour names; not referenced by the rest of the visible file.
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
@spaces.GPU
def run_example(task_prompt, image, text_input=None):
    """Run one generation pass of the module-level model and post-process it.

    Args:
        task_prompt: Task token string; used as the prompt prefix and passed
            as ``task`` to the processor's post-processing step.
        image: Image object exposing ``width`` and ``height`` (the caller in
            this file passes a PIL image).
        text_input: Optional text appended to ``task_prompt``. When ``None``
            the prompt is the task token alone.

    Returns:
        Whatever ``processor.post_process_generation`` returns; the caller in
        this file indexes it with ``task_prompt``.
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Tokenise the prompt and preprocess the image, then move both to the GPU.
    encoded = processor(text=prompt, images=image, return_tensors="pt").to("cuda")

    # Deterministic beam search (no sampling) over three beams.
    generation_kwargs = {
        "input_ids": encoded["input_ids"],
        "pixel_values": encoded["pixel_values"],
        "max_new_tokens": 1024,
        "early_stopping": False,
        "do_sample": False,
        "num_beams": 3,
    }
    output_ids = model.generate(**generation_kwargs)

    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)
    first_sequence = decoded[0]

    return processor.post_process_generation(
        first_sequence,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
def process_image(image, text_input=None):
    """Answer a question about an uploaded document image.

    Args:
        image: The uploaded image as a NumPy array (it is converted with
            ``Image.fromarray``), or ``None`` when nothing was uploaded.
        text_input: Question text appended to the ``<DocVQA>`` task prompt.
            ``None`` sends the task prompt on its own.

    Returns:
        The ``<DocVQA>`` entry of the parsed model output with every
        ``<pad>`` marker removed.

    Raises:
        gr.Error: If ``image`` is ``None``, so the UI shows a readable message
            instead of the failure ``Image.fromarray(None)`` would produce.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    task_prompt = '<DocVQA>'
    pil_image = Image.fromarray(image)  # NumPy array -> PIL image
    parsed = run_example(task_prompt, pil_image, text_input)
    return parsed[task_prompt].replace("<pad>", "")
# Custom CSS handed to gr.Blocks. NOTE(review): no component in this file sets
# elem_id="output", so the rule below currently matches nothing.
css = """
#output {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""

# Page layout: title and description on top, then a single tab holding the
# input column (image, question, submit button) next to the output column.
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)
    # The tab is labelled after the task this app actually runs (<DocVQA>),
    # not image captioning.
    with gr.Tab(label="Florence-2 DocVQA"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                text_input = gr.Textbox(label="Text Input (optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Example (image path, question) pairs. cache_examples=True asks
        # Gradio to cache the outputs of process_image for these examples.
        gr.Examples(
            examples=[
                ["idefics2_architecture.png", 'How many tokens per image does it use?'],
                ["idefics2_architecture.png", "What type of encoder does the model use?"],
                ["idefics2_architecture.png", 'Up to which size can the images be?'],
                ["image.jpg", "What's the share of Industry Switchers Gained?"]
            ],
            inputs=[input_img, text_input],
            outputs=[output_text],
            fn=process_image,
            cache_examples=True,
            label='Try the examples below'
        )

        # Submit sends the image and the question to process_image and writes
        # the returned string into the output textbox.
        submit_btn.click(process_image, [input_img, text_input], [output_text])

demo.launch(debug=True)