"""Gradio demo that generates a caption for an image with a JAT model.

The list of selectable models is fetched from the Hugging Face Hub when the
script starts; models are loaded lazily on first use and kept in memory.
"""

from typing import Iterator

import gradio as gr
import torch
from gradio.components import Dropdown, Image, Textbox
from huggingface_hub import HfApi, ModelFilter
from transformers import AutoModelForCausalLM, AutoProcessor

# Get the list of models from the Hugging Face Hub
api = HfApi()
models_infos = api.list_models(author="jat-project", filter=ModelFilter(tags="text-generation"))
models_names = [model.modelId for model in models_infos]

# Cache of already-loaded models: model name -> (model, processor)
models = {}

# Model pre-selected in the dropdown (nothing is loaded until first use)
default_model_name = "jat-project/jat2-small-untrained"


def generate_text(model_name: str, input_image) -> Iterator[str]:
    """Yield status messages followed by the caption generated for an image.

    This is a generator so the interface can show progress: it yields
    "Loading model..." when the model has to be loaded first, then
    "Generating caption...", and finally the generated text.

    Args:
        model_name: Hub identifier of the model to use. The model and its
            processor are loaded on first use and cached in ``models``.
        input_image: The image as delivered by the Gradio ``Image`` input,
            or ``None`` when the user submitted without providing one.

    Yields:
        Status strings and, last, the decoded caption.
    """
    # Nothing to caption: report it instead of failing inside the processor.
    if input_image is None:
        yield "Please provide an input image."
        return

    # Check if the selected model is already loaded
    if model_name not in models:
        # Inform the user that the model is loading
        yield "Loading model..."

        # Load the model and its processor, then cache them for later calls
        processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding_side='left')
        model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
        models[model_name] = model, processor

    # Get the model and processor for the selected model
    model, processor = models[model_name]

    # Inform the user that the text is being generated
    yield "Generating caption..."

    # Convert the input image to a tensor
    pixel_values = processor(images=input_image, return_tensors="pt").pixel_values

    # Generate text. Gradients are never needed for inference; disabling them
    # explicitly is harmless if generate() already does so internally.
    with torch.no_grad():
        generated_ids = model.generate(pixel_values=pixel_values, max_length=100, early_stopping=True)
    generated_text = processor.decode(generated_ids[0], skip_special_tokens=True)

    # Return the generated text
    yield generated_text


# Define the Gradio interface
iface = gr.Interface(
    fn=generate_text,  # Function to be called on user input
    inputs=[
        Dropdown(models_names, label="Select Model", value=default_model_name),  # Select model
        Image(label="Input Image"),  # Image input
    ],
    outputs=Textbox(label="Generated Caption"),  # Textbox to display the generated text
    title="JAT Image Captioning",  # Title of the interface
)

# Enable the request queue through queue(): the `enable_queue` argument of
# launch() is deprecated and no longer accepted by newer Gradio releases.
iface.queue()

# Launch the Gradio interface
iface.launch()