import gradio as gr
import torch
import numpy as np
from PIL import Image

# Load the pretrained YOLOv5s model from the Ultralytics hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)


# Function to run inference on an image
def run_inference(image):
    # Convert the PIL image to a NumPy array (RGB) for the model
    image = np.array(image)
    # Run YOLOv5 inference
    results = model(image)
    # Draw the detections onto the image; the array is already RGB,
    # so no color-space conversion is needed before display
    annotated_image = results.render()[0]
    return Image.fromarray(annotated_image)


# Function to generate a summary of the detected objects
def generate_summary(image):
    results = model(image)
    detected_objects = results.pandas().xyxy[0]
    summary = "Detected objects:\n\n"
    for _, obj in detected_objects.iterrows():
        summary += f"- {obj['name']} with confidence {obj['confidence']:.2f}\n"
    return summary


# Function to generate a scene description based on the summary
def generate_scene_description(summary):
    if "person" in summary.lower():
        return "This scene might involve people interacting or a social gathering."
    elif "car" in summary.lower() or "truck" in summary.lower():
        return "This could be a street scene or a transportation-related scenario."
    elif "dog" in summary.lower() or "cat" in summary.lower():
        return "This appears to involve pets or animals, possibly in a domestic or outdoor setting."
    else:
        return "This scene involves various objects. It could be a dynamic or static environment."


# Create the Gradio interface with improved UI
with gr.Blocks(css="""
    body {
        font-family: 'Poppins', sans-serif;
        margin: 0;
        background: linear-gradient(135deg, #3D52A0, #7091E6, #8697C4, #ADBBDA, #EDE8F5);
        background-size: 400% 400%;
        animation: gradient-animation 15s ease infinite;
        color: #FFFFFF;
    }
    @keyframes gradient-animation {
        0% { background-position: 0% 50%; }
        50% { background-position: 100% 50%; }
        100% { background-position: 0% 50%; }
    }
    h1 {
        text-align: center;
        color: #FFFFFF;
        font-size: 2.5em;
        font-weight: bold;
        margin-bottom: 0.5em;
        text-shadow: 2px 2px 5px rgba(0, 0, 0, 0.3);
    }
    footer {
        text-align: center;
        margin-top: 20px;
        padding: 10px;
        font-size: 1em;
        color: #FFFFFF;
        background: rgba(61, 82, 160, 0.8);
        border-radius: 8px;
    }
    .gr-button {
        font-size: 1em;
        padding: 12px 24px;
        background-color: #7091E6;
        color: #FFFFFF;
        border: none;
        border-radius: 5px;
        transition: all 0.3s ease-in-out;
    }
    .gr-button:hover {
        background-color: #8697C4;
        transform: scale(1.05);
        box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2);
    }
    .gr-box {
        background: rgba(255, 255, 255, 0.1);
        border: 1px solid rgba(255, 255, 255, 0.3);
        border-radius: 10px;
        padding: 15px;
        box-shadow: 0 4px 10px rgba(0, 0, 0, 0.3);
        color: #FFFFFF;
    }
""") as demo:
    with gr.Row():
        gr.Markdown("<h1>✨ InsightVision: Detect, Analyze, Summarize ✨</h1>")

    with gr.Row():
        with gr.Column(scale=2):
            image_input = gr.Image(label="Upload Image", type="pil", elem_classes="gr-box")
            detect_button = gr.Button("Run Detection", elem_classes="gr-button")
        with gr.Column(scale=3):
            annotated_image_output = gr.Image(label="Detected Image", type="pil", elem_classes="gr-box")
            summary_output = gr.Textbox(label="Detection Summary", lines=10, interactive=False, elem_classes="gr-box")
            scene_description_output = gr.Textbox(label="Scene Description", lines=5, interactive=False, elem_classes="gr-box")

    # Actions for buttons
    def detect_and_process(image):
        annotated_image = run_inference(image)
        summary = generate_summary(np.array(image))  # runs inference again on the raw array
        scene_description = generate_scene_description(summary)
        return annotated_image, summary, scene_description

    detect_button.click(
        fn=detect_and_process,
        inputs=[image_input],
        outputs=[annotated_image_output, summary_output, scene_description_output]
    )

    gr.Markdown("")  # footer placeholder

# Launch the interface
demo.launch()