import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gradio as gr

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "qresearch/llama-3.1-8B-vision-378",
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to("cuda")

tokenizer = AutoTokenizer.from_pretrained("qresearch/llama-3.1-8B-vision-378", use_fast=True)

# Define the function to process the image and instruction
def describe_image(image, instruction):
    description = model.answer_question(
        image, instruction, tokenizer, max_new_tokens=1000, do_sample=True, temperature=0.3
    )
    return description

# Create the Gradio interface
interface = gr.Interface(
    fn=describe_image,
    inputs=[
        gr.Image(type="pil"),  # Input for the image
        gr.Textbox(
            placeholder="Enter your instruction here...",
            label="Instruction",
            lines=10,      # Number of visible lines for instruction input
            max_lines=20,  # Maximum number of lines before scrolling
        ),
    ],
    outputs=gr.Textbox(
        label="Description",
        lines=10,      # Number of visible lines for the output
        max_lines=30,  # Maximum number of lines before scrolling
    ),
    title="LLaMA 3.1 with Vision",
    description="Upload an image and enter an instruction to generate a description based on the provided instruction.",
)

# Launch the Gradio app
interface.launch()
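
For quick testing without the web UI, the same describe_image function can be called directly. The sketch below is a minimal, optional addition: it assumes a local image file named "example.jpg" (a hypothetical path) and that the model's remote-code answer_question method accepts a PIL image, exactly as in the Gradio handler above.

# Minimal sketch: call describe_image directly, bypassing the Gradio UI.
# "example.jpg" is a hypothetical local file; replace it with your own image.
from PIL import Image

image = Image.open("example.jpg").convert("RGB")
print(describe_image(image, "Describe this image in detail."))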