"""Generate a text caption for an image with a pretrained image-to-text model."""
import functools
import os

from PIL import Image
from transformers import pipeline

pretrained_img_model = "nlpconnect/vit-gpt2-image-captioning"


@functools.lru_cache(maxsize=1)
def _get_image_to_text():
    """Build the image-to-text pipeline once and reuse it on later calls."""
    return pipeline("image-to-text", model=pretrained_img_model, framework="pt")


def _to_pil_image(img_path):
    """Turn a path, a PIL image, or an array-like object into a PIL image."""
    if isinstance(img_path, Image.Image):
        return img_path
    if isinstance(img_path, (str, os.PathLike)):
        # Image.open is lazy; convert() forces the pixel data to be read and
        # returns an independent image, so the file handle can be closed here.
        with Image.open(img_path) as opened:
            return opened.convert("RGB")
    # Anything else is treated as array data, as the original code did.
    return Image.fromarray(img_path)


def load_image_pipeline(img_path):
    """Return the caption generated for an image.

    Args:
        img_path: The image to caption. May be a filesystem path (``str`` or
            ``os.PathLike``), an already-loaded ``PIL.Image.Image``, or an
            array-like object accepted by ``Image.fromarray``.

    Returns:
        The ``"generated_text"`` entry of the first result produced by the
        image-to-text pipeline.

    Raises:
        OSError: If a path is given and the file cannot be opened as an image
            (``FileNotFoundError`` when it does not exist).
    """
    image = _to_pil_image(img_path)
    # The pipeline accepts PIL images directly, so no temporary file is
    # written to (or left behind in) the current working directory.
    image_to_text = _get_image_to_text()
    generated_text = image_to_text(image)[0]["generated_text"]
    return generated_text


if __name__ == "__main__":
    imgpath = r"C:\Users\Shringar\Pictures\ar.jpg"
    img_text_generated = load_image_pipeline(imgpath)
    print(img_text_generated)