Florence-2-SD3-Captioner

Running on Zero

File size: 3,106 Bytes

4e70ef0
 
 
 
 
704a53e
4e70ef0
704a53e
48fc5f0
 
f64ce70
f263b93
4e70ef0
f263b93
4e70ef0
 
1c8fb6e
 
4e70ef0
 
 
b5c347a
4e70ef0
 
 
b5c347a
4e70ef0
 
 
 
 
 
 
 
 
cb6599c
4e70ef0
 
 
cb6599c
4e70ef0
 
b5c347a
 
 
 
4e70ef0
 
 
 
c569a26
 
4e70ef0
 
 
 
 
79ce0f7
65853cc
 
 
 
4e70ef0
 
 
 
 
 
65853cc
 
 
 
4e70ef0
 
 
 
 
7946cb5
4e70ef0
ed4007b
 
d1e3414
ed4007b
902ec6b

import re
import subprocess
import sys

import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces
from PIL import Image
import torch
# The flash-attn install below is disabled; the line is kept for reference only.
#subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# Install einops at startup (presumably required by the checkpoint's remote
# code -- nothing in this file imports it directly; TODO confirm).
# pip is invoked through the running interpreter with an argument list, so the
# package lands in this interpreter's environment and no shell is involved.
# The result is deliberately not checked: the install stays best-effort.
subprocess.run([sys.executable, '-m', 'pip', 'install', 'einops'])

# Hugging Face Hub repository that supplies both the model and its processor.
_CHECKPOINT = 'gokaygokay/Florence-2-Flux-Large'

# Load the captioning model together with the code shipped in its repository,
# place it on the first CUDA device (there is no CPU fallback) and switch it
# to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT, trust_remote_code=True)
model = model.to("cuda:0")
model = model.eval()

# Processor from the same repository; builds model inputs and decodes outputs.
processor = AutoProcessor.from_pretrained(_CHECKPOINT, trust_remote_code=True)


# Markdown heading shown at the top of the page.
TITLE = "# Florence-2-SD3-Captioner"
# Empty placeholder; not referenced in the visible part of this file.
DESCRIPTION = ""

def modify_caption(caption: str) -> str:
    """
    Strips the first occurrence of a known filler phrase from a caption.
    The phrases are matched case-insensitively anywhere in the text (not only
    at the start), and at most one occurrence is replaced per call.
    Args:
        caption (str): A string containing a caption.
    Returns:
        str: The caption with the first matching phrase replaced, or the
            unchanged caption when no phrase matches.
    """
    # Lower-cased phrase -> text that takes its place
    substitutions = {
        'captured from ': '',
        'captured at ': '',
    }

    # One alternation covering every phrase, escaped so each is matched literally
    alternation = '|'.join(map(re.escape, substitutions))
    matcher = re.compile(alternation, re.IGNORECASE)

    def _swap(found):
        # Normalise the matched text so the lookup ignores the caption's casing
        return substitutions[found.group(0).lower()]

    # Only the first hit is rewritten; a caption without a hit comes back as-is
    return matcher.sub(_swap, caption, count=1)
    
@spaces.GPU
def run_example(image):
    """
    Generates a detailed caption for an image with the loaded Florence-2 model.
    Args:
        image: Pixel array of the picture to caption (anything
            ``Image.fromarray`` accepts), or None when no picture was supplied.
    Returns:
        str: The generated description after clean-up by ``modify_caption``.
    Raises:
        gr.Error: If ``image`` is None.
    """
    # Gradio hands over None when Submit is clicked with an empty image input;
    # report that clearly instead of failing inside Image.fromarray.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    image = Image.fromarray(image)
    task_prompt = "<DESCRIPTION>"
    prompt = task_prompt + "Describe this image in great detail."

    # Ensure the image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Move inputs to the same device the model was placed on
    inputs = {key: value.to("cuda:0") for key, value in inputs.items()}

    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )

    # Move generated_ids to CPU for decoding
    generated_ids = generated_ids.to("cpu")

    # Special tokens are kept in the decoded text that is handed to
    # post-processing together with the task tag and the image size.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    # The parsed result is keyed by the task tag
    return modify_caption(parsed_answer[task_prompt])


# Page layout: heading, submit button, caption output box and image input,
# created in this order inside the Blocks context.
with gr.Blocks() as demo:
    gr.Markdown(value=TITLE)
    submit_btn = gr.Button("Submit")
    output_text = gr.Textbox(label="Output Text")
    input_img = gr.Image(label="Input Picture")
    # Clicking the button passes the picture to run_example and shows the
    # caption it returns in the output box.
    submit_btn.click(fn=run_example, inputs=[input_img], outputs=[output_text])

# Start the app with debug mode enabled.
demo.launch(debug=True)