image-caption-server

Paused

App Files Files Community

image-caption-server / app.py

jbilcke-hf HF staff

Update app.py

ee9e2f0 about 1 year ago

raw

history blame

4.97 kB

	#!/usr/bin/env python

	from __future__ import annotations

	import os

	import gradio as gr
	import PIL.Image
	import spaces
	import torch
	from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor

	# Markdown heading rendered at the top of the demo page.
	DESCRIPTION = "# InstructBLIP"

	# Longest image side, in pixels, allowed before `run` downscales the input.
	# Overridable through the MAX_IMAGE_SIZE environment variable.
	MAX_IMAGE_SIZE = int(os.environ.get("MAX_IMAGE_SIZE", "1024"))

	# Shared secret that callers must present to `run`.
	# NOTE(review): when SECRET_TOKEN is unset the fallback below is a fixed,
	# guessable value - confirm the deployment always sets the variable.
	SECRET_TOKEN = os.environ.get('SECRET_TOKEN', 'default_secret')

	# Device the processed inputs are moved to: CUDA when available, else CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")

	# The processor and the model are loaded once, at import time, from the same
	# checkpoint; the model weights are requested in half precision.
	model_id = "Salesforce/instructblip-vicuna-7b"
	processor = InstructBlipProcessor.from_pretrained(model_id)
	model = InstructBlipForConditionalGeneration.from_pretrained(
	    model_id,
	    torch_dtype=torch.float16,
	    device_map="auto",
	)


	@spaces.GPU
	def run(
	    secret_token: str,
	    image: PIL.Image.Image,
	    prompt: str,
	    text_decoding_method: str = "Nucleus sampling",
	    num_beams: int = 5,
	    max_length: int = 256,
	    min_length: int = 1,
	    top_p: float = 0.9,
	    repetition_penalty: float = 1.5,
	    length_penalty: float = 1.0,
	    temperature: float = 1.0,
	) -> str:
	    """Generate text for ``image`` conditioned on ``prompt``.

	    The image is downscaled (aspect ratio preserved) so that its longest side
	    does not exceed ``MAX_IMAGE_SIZE``, then passed with the prompt through
	    the module-level ``processor`` and ``model``.

	    Args:
	        secret_token: Must equal the module-level ``SECRET_TOKEN``.
	        image: Input picture as a PIL image.
	        prompt: Instruction text given to the processor alongside the image.
	        text_decoding_method: Sampling is enabled only when this equals
	            ``"Nucleus sampling"``; any other value disables it.
	        num_beams: Forwarded to ``model.generate``.
	        max_length: Forwarded to ``model.generate``.
	        min_length: Forwarded to ``model.generate``.
	        top_p: Forwarded to ``model.generate``.
	        repetition_penalty: Forwarded to ``model.generate``.
	        length_penalty: Forwarded to ``model.generate``.
	        temperature: Forwarded to ``model.generate``.

	    Returns:
	        The first decoded sequence, with special tokens skipped and
	        surrounding whitespace stripped.

	    Raises:
	        gr.Error: If the secret token does not match, or no image was given.
	    """
	    if secret_token != SECRET_TOKEN:
	        raise gr.Error(
	            'Invalid secret token. Please fork the original space if you want to use it for yourself.')

	    # Without this guard a missing image fails below with an opaque
	    # AttributeError on `image.size`.
	    if image is None:
	        raise gr.Error("Please provide an image.")

	    # PIL reports size as (width, height). Unpacking it in the opposite order
	    # swaps the sides passed to `resize` and distorts non-square images.
	    width, height = image.size
	    scale = MAX_IMAGE_SIZE / max(width, height)
	    if scale < 1:
	        # Clamp to 1 px so extremely elongated images never round down to a
	        # zero-length side, which `resize` rejects.
	        target_size = (max(1, int(width * scale)), max(1, int(height * scale)))
	        image = image.resize(target_size, resample=PIL.Image.Resampling.LANCZOS)

	    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)
	    generated_ids = model.generate(
	        **inputs,
	        do_sample=text_decoding_method == "Nucleus sampling",
	        num_beams=num_beams,
	        max_length=max_length,
	        min_length=min_length,
	        top_p=top_p,
	        repetition_penalty=repetition_penalty,
	        length_penalty=length_penalty,
	        temperature=temperature,
	    )
	    # Only one image/prompt pair is submitted, so the first decoded entry is
	    # the result.
	    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
	    return generated_caption


	# Page layout: a heading, then one row holding an input column (with a
	# collapsible group of generation settings) and an output column. Both the
	# prompt's submit event and the button click invoke `run`.
	with gr.Blocks(css="style.css") as demo:
	    gr.Markdown(DESCRIPTION)

	    with gr.Row():
	        # First column: token, image, prompt, trigger button and settings.
	        with gr.Column():
	            secret_token = gr.Textbox(label="Secret token")
	            input_image = gr.Image(type="pil")
	            prompt = gr.Textbox(label="Prompt")
	            run_button = gr.Button()
	            with gr.Accordion(label="Advanced options", open=False):
	                text_decoding_method = gr.Radio(
	                    choices=["Beam search", "Nucleus sampling"],
	                    value="Nucleus sampling",
	                    label="Text Decoding Method",
	                )
	                num_beams = gr.Slider(
	                    minimum=1, maximum=10, step=1, value=5,
	                    label="Number of Beams",
	                )
	                max_length = gr.Slider(
	                    minimum=1, maximum=512, step=1, value=256,
	                    label="Max Length",
	                )
	                min_length = gr.Slider(
	                    minimum=1, maximum=64, step=1, value=1,
	                    label="Minimum Length",
	                )
	                top_p = gr.Slider(
	                    minimum=0.1, maximum=1.0, step=0.1, value=0.9,
	                    label="Top P",
	                )
	                repetition_penalty = gr.Slider(
	                    minimum=1.0, maximum=5.0, step=0.5, value=1.5,
	                    label="Repetition Penalty",
	                    info="Larger value prevents repetition.",
	                )
	                length_penalty = gr.Slider(
	                    minimum=-1.0, maximum=2.0, step=0.2, value=1.0,
	                    label="Length Penalty",
	                    info="Set to larger for longer sequence, used with beam search.",
	                )
	                temperature = gr.Slider(
	                    minimum=0.5, maximum=1.0, step=0.1, value=1.0,
	                    label="Temperature",
	                    info="Used with nucleus sampling.",
	                )

	        # Second column: the generated text.
	        with gr.Column():
	            output = gr.Textbox(label="Result")

	    # The input list follows the positional parameter order of `run`.
	    gr.on(
	        triggers=[prompt.submit, run_button.click],
	        fn=run,
	        inputs=[
	            secret_token,
	            input_image,
	            prompt,
	            text_decoding_method,
	            num_beams,
	            max_length,
	            min_length,
	            top_p,
	            repetition_penalty,
	            length_penalty,
	            temperature,
	        ],
	        outputs=output,
	        api_name="run",
	    )

	if __name__ == "__main__":
	    # Enable the request queue (max_size=20) and then start the app.
	    queued_demo = demo.queue(max_size=20)
	    queued_demo.launch()