"""Minimal Gradio app that captions images via a Hugging Face image-to-text pipeline."""

from PIL import Image
import requests
import torch
import gradio as gr
from transformers import pipeline

# Hub model IDs available for captioning; 'blip-base' is the one loaded below.
CAPTION_MODELS = {
    'blip-base': 'Salesforce/blip-image-captioning-base',
    'blip-large': 'Salesforce/blip-image-captioning-large',
    'vit-gpt2-coco-en': 'ydshieh/vit-gpt2-coco-en',
}

# Load the captioning pipeline once at startup (CPU only, caption capped
# at 30 new tokens) so each request reuses the same model instance.
captioner = pipeline(task="image-to-text",
                     model=CAPTION_MODELS['blip-base'],
                     max_new_tokens=30,
                     device_map="cpu",
                     use_fast=True
                     )


# Simple caption creation
def caption_image(captioner, image_path):
    """Return the caption for *image_path*, stripped of surrounding whitespace.

    Args:
        captioner: A transformers image-to-text pipeline (or any callable with
            the same ``[{'generated_text': ...}]`` result shape).
        image_path: Whatever the pipeline accepts — presumably a local file
            path, URL, or PIL image. TODO(review): confirm accepted forms.

    Returns:
        str: The first generated caption, whitespace-stripped.
    """
    caption = captioner(image_path)[0]['generated_text']
    return str(caption).strip()


def launch(image_source):
    """Gradio callback: caption the image referenced by the text input.

    Renamed the parameter from ``input`` to avoid shadowing the builtin.
    """
    return caption_image(captioner, image_source)


# NOTE(review): the input is a plain text box, so users must type an image
# path or URL; a gr.Image component would allow direct uploads — confirm
# which was intended.
iface = gr.Interface(launch, inputs="text", outputs="text")

if __name__ == "__main__":
    # Guard the server start so importing this module does not launch the app.
    iface.launch()