import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor

# Load the model and processor once at startup
model_id = "microsoft/Phi-3-vision-128k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="cuda", trust_remote_code=True,
    torch_dtype="auto", _attn_implementation="flash_attention_2",  # use "eager" if flash-attn is not installed
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Define the function to generate text from an image and a prompt
def generate_text(image, prompt):
    # Build a chat-formatted prompt; <|image_1|> marks where the image is inserted
    messages = [{"role": "user", "content": f"<|image_1|>\n{prompt}"}]
    chat_prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Process the input
    inputs = processor(chat_prompt, [image], return_tensors="pt").to("cuda")
    # Generate the text (greedy decoding; temperature is ignored when do_sample is False)
    generation_args = {
        "max_new_tokens": 500,
        "temperature": 0.0,
        "do_sample": False,
    }
    generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
    # Drop the prompt tokens and decode only the newly generated ones
    generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]
    return processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
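
# A quick way to smoke-test generate_text outside the UI (a sketch; the file
# name "example.jpg" is a placeholder for any local image):
#
#     from PIL import Image
#     print(generate_text(Image.open("example.jpg"), "Describe this image."))
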
# Create and launch the Gradio application
gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Image(type="pil"),
        gr.Textbox(label="Prompt"),
    ],
    outputs=gr.Textbox(),
    title="Phi-3-Vision Model",
    description="Generate text based on an image and a prompt using the Phi-3-Vision model.",
).launch(share=True, show_error=True)
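
# A minimal sketch of querying the running app from another process with the
# gradio_client package (assumes gradio_client is installed; the URL below is
# the default local address Gradio prints at startup, and "example.jpg" is a
# placeholder path):
#
#     from gradio_client import Client, handle_file
#
#     client = Client("http://127.0.0.1:7860/")
#     result = client.predict(
#         handle_file("example.jpg"),
#         "Describe this image.",
#         api_name="/predict",
#     )
#     print(result)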