# image_to_text/app.py
import streamlit as st
import torch
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel
from PIL import Image
# Load the pretrained image-captioning model, image processor, and tokenizer.
# nlpconnect/vit-gpt2-image-captioning is a VisionEncoderDecoderModel (ViT encoder + GPT-2 decoder),
# not a sequence-classification model, so it must be loaded with VisionEncoderDecoderModel.
model_name = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
image_processor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
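# Note: Streamlit reruns this script on every interaction, so the weights above are
# reloaded each time. One possible optimization (assuming a Streamlit version that
# provides st.cache_resource, 1.18+) is to move this loading into a helper function
# decorated with @st.cache_resource so it only runs once.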
# Define a function that generates a caption for a PIL image
def generate_caption(image):
    # The ViT image processor expects RGB pixel values
    if image.mode != "RGB":
        image = image.convert("RGB")
    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
    # Generate token IDs with beam search, then decode them into text
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()
def main():
    st.title("Image to Text Captioning")
    with st.form("my_form"):
        uploaded_file = st.file_uploader("Choose an image file", type=["jpg", "jpeg", "png"])
        if uploaded_file is not None:
            # Display the uploaded image
            image = Image.open(uploaded_file)
            st.image(image, caption="Uploaded Image", use_column_width=True)
        clicked = st.form_submit_button("Generate Caption")
    if clicked:
        if uploaded_file is not None:
            caption = generate_caption(image)
            st.subheader("Generated Caption:")
            st.write(caption)
        else:
            st.warning("Please upload an image before generating a caption.")
if __name__ == "__main__":
    main()
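# To run this app locally (assuming streamlit, torch, transformers, and Pillow are installed):
#   streamlit run app.py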