import gradio as gr
import clip
from model import ClipCaptionModel
from transformers import GPT2Tokenizer
import numpy as np
import torch
import PIL.Image
from predict import generate2, generate_beam
from huggingface_hub import hf_hub_download
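# Gradio demo: caption an input image with CapDec models trained at
# different noise-injection levels (see the interface description below).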
CPU = torch.device('cpu')
device = "cpu"
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Pre-download one checkpoint per noise level so switching models in the UI is fast.
checkpoint_paths = {
    "0.0": hf_hub_download('johko/capdec_0', 'model.pt'),
    "0.001": hf_hub_download('johko/capdec_001', 'model.pt'),
    "0.005": hf_hub_download('johko/capdec_005', 'model.pt'),
    "0.015": hf_hub_download('johko/capdec_015', 'model.pt'),
    "0.025": hf_hub_download('johko/capdec_025', 'model.pt'),
    "0.05": hf_hub_download('johko/capdec_05', 'model.pt'),
}
def load_noise_level_model(noise_level: str) -> ClipCaptionModel:
    """Load the CapDec model trained with the selected noise level."""
    if noise_level not in checkpoint_paths:
        raise ValueError(f"Unknown noise level: {noise_level}")
    model = ClipCaptionModel()
    model.load_state_dict(torch.load(checkpoint_paths[noise_level], map_location=CPU))
    model = model.eval()
    return model.to(device)
def infer(input_image: np.ndarray, noise_level: str):
    use_beam_search = True
    model = load_noise_level_model(noise_level)
    pil_image = PIL.Image.fromarray(input_image)
    image = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        # Encode the image with CLIP and project the embedding into a
        # 40-token prefix that conditions the GPT-2 decoder.
        prefix = clip_model.encode_image(image).to(device, dtype=torch.float32)
        prefix_embed = model.clip_project(prefix).reshape(1, 40, -1)
    if use_beam_search:
        generated_text_prefix = generate_beam(model, tokenizer, embed=prefix_embed)[0]
    else:
        generated_text_prefix = generate2(model, tokenizer, embed=prefix_embed)
    return input_image, generated_text_prefix
description="""This space is a demo for the paper [*Text-Only Training for Image Captioning using Noise-Injected CLIP*](https://arxiv.org/pdf/2211.00575.pdf)
by David Nukrai, Ron Mokady and Amir Globerson.
The paper is about training an Image Captioning model by only using text. It leverages the usage of noise injections at different Noise Levels,
with which you can experiment as well in this demo. The text caption will change depending on the Noise Level you choose."""
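# For reference, a minimal sketch of the paper's core trick (illustrative only,
# not used by this demo; the function name is an assumption, not part of the
# CapDec codebase). During training, captions are decoded from their CLIP text
# embeddings after adding zero-mean Gaussian noise, which stands in for the
# text-to-image modality gap that appears at inference time:
def add_embedding_noise(text_embed: torch.Tensor, variance: float) -> torch.Tensor:
    # Gaussian noise with the configured variance (std = sqrt(variance)).
    return text_embed + (variance ** 0.5) * torch.randn_like(text_embed)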
dropdown = gr.components.Dropdown(["0.0", "0.001", "0.005", "0.015", "0.025", "0.05"], value="0.015", label="Noise Level")
input_image = gr.components.Image(label="Input Image")
output_image = gr.components.Image(label="Image")
output_text = gr.components.Textbox(label="Generated Caption")
iface = gr.Interface(
    title="CapDec Image Captioning",
    description=description,
    fn=infer,
    inputs=[input_image, dropdown],
    outputs=[output_image, output_text],
    examples=[["examples/flickr_ex2.jpg", "0.015"], ["examples/web_ex3.jpeg", "0.015"]])
iface.launch()