# Visual question answering
# from
# https://learn.deeplearning.ai/courses/open-source-models-hugging-face/lesson/13/multimodal-visual-question-answering

from transformers import BlipForQuestionAnswering

# Load the BLIP model fine-tuned for visual question answering
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

from transformers import AutoProcessor

# The processor handles both image preprocessing and question tokenization
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

from PIL import Image
import gradio as gr


def answering(image, question):
    # Prepare the image/question pair as model inputs
    inputs = processor(image, question, return_tensors="pt")
    # Generate the answer tokens and decode them back into text
    out = model.generate(**inputs)
    output = processor.decode(out[0], skip_special_tokens=True)
    return output


gr.close_all()
app = gr.Interface(fn=answering,
                   inputs=[gr.Image(label="Picture here", type="pil"),
                           gr.Textbox(label="Question about picture here")],
                   outputs=[gr.Textbox(label="Answer")],
                   title="Harza's application for answering questions about a picture",
                   description="Harza's miracle application that can answer questions about a given picture!",
                   allow_flagging="never")
app.launch()
gr.close_all()
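
# A minimal sketch (not part of the original lesson) of calling answering()
# directly, without the Gradio UI; "example.jpg" is a hypothetical local file:
#
# if __name__ == "__main__":
#     sample_image = Image.open("example.jpg")
#     print(answering(sample_image, "How many people are in the picture?"))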