Spaces:

JerryAnto
/

captiongenerator

Runtime error

App Files Files Community

captiongenerator / app.py

JerryAnto

Update app.py

6c29be2 over 1 year ago

raw

history blame contribute delete

2.51 kB

	# -*- coding: utf-8 -*-
	"""Image Captioning with ViT+GPT2
	Automatically generated by Colaboratory.
	Original file is located at
	https://colab.research.google.com/drive/1P3O0gO5AUqSmM8rE9dxy2tXJ-9jkhxHz
	"""

	#! pip install transformers -q

	#! pip install gradio -q

	from PIL import Image
	from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast
	import requests
	from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
	import torch
	from PIL import Image

	# Hub checkpoint shared by the captioning model, its image preprocessor and its tokenizer.
	_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

	# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	feature_extractor = ViTFeatureExtractor.from_pretrained(_CHECKPOINT)
	tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
	model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
	model.to(device)



	# Generation settings: maximum caption length and beam-search width,
	# bundled into keyword arguments for the caption generation call.
	max_length, num_beams = 16, 4
	gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
	def predict_step(image_paths):
	    """Generate captions for a single image or for a batch of images.

	    Args:
	        image_paths: Either one item or an iterable of items. An item is an
	            already-loaded ``PIL.Image.Image`` or anything ``Image.open``
	            accepts (a filesystem path or a binary file object).

	    Returns:
	        For an iterable input, a list of whitespace-stripped captions, one
	        per item and in the same order (an empty list for an empty
	        iterable). For a single item, that item's caption as a string.
	    """
	    # A lone image/path/file object is wrapped so the rest of the function
	    # can always work on a batch; anything else is treated as an iterable
	    # of items, which keeps the original list-of-paths call style working.
	    is_single = (
	        isinstance(image_paths, (Image.Image, str, bytes))
	        or hasattr(image_paths, "__fspath__")
	        or hasattr(image_paths, "read")
	    )
	    items = [image_paths] if is_single else list(image_paths)
	    if not items:
	        # Nothing to caption; skip preprocessing and generation entirely.
	        return []

	    images = []
	    for item in items:
	        if isinstance(item, Image.Image):
	            # Already decoded (e.g. handed over by the UI); only fix the mode.
	            if item.mode != "RGB":
	                item = item.convert(mode="RGB")
	            images.append(item)
	        else:
	            # convert() loads the pixel data and returns a new image, so the
	            # underlying file can be closed as soon as the block exits.
	            with Image.open(item) as opened:
	                images.append(opened.convert(mode="RGB"))

	    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
	    pixel_values = pixel_values.to(device)

	    output_ids = model.generate(pixel_values, **gen_kwargs)

	    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
	    preds = [caption.strip() for caption in decoded]
	    return preds[0] if is_single else preds

	#predict_step(['/content/drive/MyDrive/caption generator/horses.png'])


	import gradio as gr

	# Components are taken from the top-level ``gr`` namespace; the legacy
	# ``gr.inputs`` / ``gr.outputs`` modules are deprecated and absent from
	# newer Gradio releases.
	inputs = [
	    gr.Image(type="pil", label="Original Image"),
	]

	outputs = [
	    gr.Textbox(label="Caption"),
	]

	title = "Image Captioning using ViT + GPT2"
	description = "ViT and GPT2 are used to generate Image Caption for the uploaded image. COCO Dataset was used for training. This image captioning model might have some biases that we couldn't figure during our stress testing, so if you find any bias (gender, race and so on) please use `Flag` button to flag the image with bias"
	# Link to the checkpoint this app actually loads (nlpconnect/vit-gpt2-image-captioning).
	article = " <a href='https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'>Model Repo on Hugging Face Model Hub</a>"
	# Example images offered in the UI; one single-element row per example.
	examples = [
	    ["horses.png"],
	    ["persons.png"],
	    ["football_player.png"],
	]

	demo = gr.Interface(
	    fn=predict_step,
	    inputs=inputs,
	    outputs=outputs,
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	    theme="huggingface",
	)
	# Queueing is enabled via ``queue()`` rather than the deprecated
	# ``enable_queue`` argument of ``launch()``.
	demo.queue().launch(debug=True)