Spaces:

gokaygokay
/

Florence-2-SD3-Captioner

Running

App Files Files Community

Florence-2-SD3-Captioner / app.py

gokaygokay

Update app.py

79ce0f7 verified 4 months ago

raw

history blame contribute delete

3.63 kB

	import gradio as gr
	from transformers import AutoProcessor, AutoModelForCausalLM
	import spaces
	import re
	from PIL import Image

	import os
	import subprocess

	# Install flash-attn when the app starts. FLASH_ATTENTION_SKIP_CUDA_BUILD is
	# added on top of a copy of the current environment: passing a bare dict as
	# `env` would replace the child's whole environment (PATH, HOME, ...) rather
	# than add one variable to it.
	# The return code is not checked, so a failed install does not stop startup.
	subprocess.run(
	    'pip install flash-attn --no-build-isolation',
	    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
	    shell=True,
	)

	# Hub repository used for both the model weights and the processor.
	_CHECKPOINT = 'gokaygokay/Florence-2-SD3-Captioner'

	# trust_remote_code=True is passed to both loaders below.
	model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT, trust_remote_code=True)
	model.eval()  # loaded once at import time and put in evaluation mode

	processor = AutoProcessor.from_pretrained(_CHECKPOINT, trust_remote_code=True)


	# Markdown heading shown at the top of the page; links to the model repository.
	TITLE = (
	    "# [Florence-2 SD3 Long Captioner]"
	    "(https://huggingface.co/gokaygokay/Florence-2-SD3-Captioner/)"
	)
	# Markdown blurb shown right after the heading.
	DESCRIPTION = (
	    "[Florence-2 Base](https://huggingface.co/microsoft/Florence-2-base-ft) "
	    "fine-tuned on Long SD3 Prompt and Image pairs. "
	    "Check above link for datasets that are used for fine-tuning."
	)

	def modify_caption(caption: str) -> str:
	    """
	    Strip a known lead-in phrase from a caption.

	    The first occurrence of any phrase listed in ``prefix_substrings`` is
	    matched case-insensitively and replaced with its paired replacement
	    (currently the empty string). The match is not anchored to the start of
	    the caption, so the phrase is removed wherever it first appears.

	    Args:
	        caption (str): A string containing a caption.
	    Returns:
	        str: The caption with the first matching phrase replaced, or the
	        original caption when none of the phrases occur.
	    """
	    # (phrase to look for, text to put in its place)
	    prefix_substrings = [
	        ('captured from ', ''),
	        ('captured at ', ''),
	    ]

	    # Join the escaped phrases with a bare '|' so the regex treats them as
	    # alternatives; an escaped '\|' would only ever match a literal pipe.
	    pattern = '|'.join(re.escape(opening) for opening, _ in prefix_substrings)
	    replacers = {opening.lower(): replacer for opening, replacer in prefix_substrings}

	    def replace_fn(match):
	        # Lower-case the hit so case-insensitive matches still find their key.
	        return replacers[match.group(0).lower()]

	    # count=1: only the first occurrence is rewritten.
	    return re.sub(pattern, replace_fn, caption, count=1, flags=re.IGNORECASE)

	@spaces.GPU
	def run_example(image):
	    """
	    Generate a detailed caption for an image.

	    Args:
	        image: Pixel data accepted by ``PIL.Image.fromarray`` (presumably the
	            array Gradio's image input passes in; confirm), or None when no
	            image was provided.
	    Returns:
	        str: The generated description after ``modify_caption`` clean-up.
	    Raises:
	        gr.Error: If ``image`` is None, so the UI shows a readable message
	            instead of an opaque failure inside ``Image.fromarray``.
	    """
	    if image is None:
	        raise gr.Error("Please upload an image first.")

	    task_prompt = "<DESCRIPTION>"
	    prompt = task_prompt + "Describe this image in great detail."

	    image = Image.fromarray(image)
	    # Ensure the image is in RGB mode
	    if image.mode != "RGB":
	        image = image.convert("RGB")

	    inputs = processor(text=prompt, images=image, return_tensors="pt")
	    generated_ids = model.generate(
	        input_ids=inputs["input_ids"],
	        pixel_values=inputs["pixel_values"],
	        max_new_tokens=1024,
	        num_beams=3,
	    )
	    # Decode the first (only) sequence with special tokens kept, then hand it
	    # to the processor's own post-processing to extract the answer for the task.
	    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
	    parsed_answer = processor.post_process_generation(
	        generated_text,
	        task=task_prompt,
	        image_size=(image.width, image.height),
	    )
	    # The result is keyed by the task token used in the prompt.
	    return modify_caption(parsed_answer[task_prompt])


	# Custom CSS handed to gr.Blocks below: gives the element with id "output" a
	# fixed 500px height, scrollable overflow and a 1px light-grey border.
	# NOTE(review): no component visible in this file sets elem_id="output";
	# confirm whether this rule is actually applied to anything.
	css = """
	#output {
	height: 500px;
	overflow: auto;
	border: 1px solid #ccc;
	}
	"""

	# Build the UI: heading and blurb, then a single tab with the input image and
	# submit button in one column and the caption textbox in another.
	# NOTE(review): the source had lost its indentation; the nesting below
	# (examples and click wiring inside the tab) is reconstructed; confirm.
	with gr.Blocks(css=css) as demo:
	    gr.Markdown(TITLE)
	    gr.Markdown(DESCRIPTION)
	    with gr.Tab(label="Florence-2 SD3 Prompts"):
	        with gr.Row():
	            with gr.Column():
	                input_img = gr.Image(label="Input Picture")
	                submit_btn = gr.Button(value="Submit")
	            with gr.Column():
	                output_text = gr.Textbox(label="Output Text")

	        # One example row per bundled image file.
	        gr.Examples(
	            examples=[
	                [image_file]
	                for image_file in (
	                    "image1.jpg",
	                    "image2.jpg",
	                    "image3.png",
	                    "image4.jpg",
	                    "image5.jpg",
	                    "image6.PNG",
	                )
	            ],
	            inputs=[input_img],
	            outputs=[output_text],
	            fn=run_example,
	            label='Try captioning on below examples',
	        )

	        # Clicking Submit captions the current image and fills the textbox.
	        submit_btn.click(fn=run_example, inputs=[input_img], outputs=[output_text])

	demo.launch(debug=True)