File size: 3,665 Bytes
4e70ef0
 
 
 
30085b1
4e70ef0
7a8d779
 
 
4e70ef0
30085b1
4e70ef0
 
 
 
71141aa
4e70ef0
 
 
b5c347a
4e70ef0
 
 
b5c347a
4e70ef0
 
 
 
 
 
 
 
 
cb6599c
4e70ef0
 
 
cb6599c
4e70ef0
 
b5c347a
 
 
 
4e70ef0
 
 
 
c569a26
 
4e70ef0
 
 
 
 
30085b1
4e70ef0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71141aa
4e70ef0
 
 
 
 
 
 
 
 
 
 
25d724e
4e70ef0
ff558e1
4e70ef0
 
 
 
30085b1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM
import spaces
import re
from PIL import Image

# Install flash-attn and einops at startup (best effort: a failed install is
# not treated as fatal, matching the original behaviour of ignoring the result).
import os
import subprocess
import sys

# ``env=`` replaces the child's entire environment rather than extending it,
# so the flag is merged into a copy of the current environment; otherwise
# PATH/HOME/virtualenv variables would be missing for pip.
# ``sys.executable -m pip`` ensures the packages land in the interpreter that
# is running this app, and the argument list avoids going through a shell.
subprocess.run(
    [
        sys.executable, '-m', 'pip', 'install',
        'flash-attn', 'einops', '--no-build-isolation',
    ],
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    check=False,
)

# Hub repository id shared by the model and its processor.
_CAPTIONER_REPO = 'gokaygokay/Florence-2-SD3-Captioner'

# Both objects are loaded with trust_remote_code=True; the model is switched
# to evaluation mode immediately after loading.
model = AutoModelForCausalLM.from_pretrained(
    _CAPTIONER_REPO,
    trust_remote_code=True,
).eval()

processor = AutoProcessor.from_pretrained(
    _CAPTIONER_REPO,
    trust_remote_code=True,
)

# Markdown snippets rendered at the top of the page.
TITLE = (
    "# [Florence-2 SD3 Long Captioner]"
    "(https://huggingface.co/gokaygokay/Florence-2-SD3-Captioner/)"
)
DESCRIPTION = (
    "[Florence-2 Base](https://huggingface.co/microsoft/Florence-2-base-ft) "
    "fine-tuned on Long SD3 Prompt and Image pairs. "
    "Check above link for datasets that are used for fine-tuning."
)

def modify_caption(caption: str) -> str:
    """
    Strip the first "captured from " / "captured at " phrase from a caption.

    The search is case-insensitive and only the first match is removed; a
    caption without either phrase is returned unchanged.

    NOTE(review): the pattern is not anchored to the start of the string, so
    the phrase is removed wherever it first occurs, not only when it is a
    literal prefix. Confirm whether that is intended.

    Args:
        caption (str): A string containing a caption.
    Returns:
        str: The caption with the first matching phrase removed, or the
        original caption when nothing matched.
    """
    # Phrase -> replacement text (keys are lowercase so matches can be looked
    # up after lowercasing).
    removals = {
        'captured from ': '',
        'captured at ': '',
    }

    # One alternation covering every phrase, escaped so they match literally.
    matcher = re.compile(
        '|'.join(re.escape(phrase) for phrase in removals),
        flags=re.IGNORECASE,
    )

    # count=1: only the first occurrence is replaced.
    return matcher.sub(
        lambda hit: removals[hit.group(0).lower()],
        caption,
        count=1,
    )
    
@spaces.GPU
def run_example(image):
    """
    Generate a detailed caption for one image.

    Args:
        image: Pixel data for the picture; it is handed to ``Image.fromarray``,
            so an array-like value is expected. ``None`` means no image was
            supplied.
    Returns:
        str: The generated description after ``modify_caption`` has been
        applied to it.
    Raises:
        gr.Error: If ``image`` is ``None``, so the UI shows a readable message
            instead of an opaque failure inside ``Image.fromarray``.
    """
    # An empty image input reaches this function as None; fail early with a
    # message the user can act on.
    if image is None:
        raise gr.Error("Please provide an image before submitting.")

    image = Image.fromarray(image)
    task_prompt = "<DESCRIPTION>"
    prompt = task_prompt + "Describe this image in great detail."

    # Ensure the image is in RGB mode
    if image.mode != "RGB":
        image = image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    # Only the first (and only) sequence of the batch is decoded and parsed.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height),
    )
    # The parsed result is keyed by the task prompt; reuse the variable rather
    # than repeating the literal so the two cannot drift apart.
    return modify_caption(parsed_answer[task_prompt])

# Stylesheet passed to gr.Blocks: fixed-height, scrollable, bordered box for
# the element with id "output".
# NOTE(review): no component created below sets elem_id="output", so this rule
# appears to be unused — confirm.
css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

# Example inputs, one image path per row.
example_images = [
    ["image1.jpg"],
    ["image2.jpg"],
    ["image3.png"],
    ["image4.jpg"],
    ["image5.jpg"],
    ["image6.PNG"],
]

with gr.Blocks(css=css) as demo:
    # Page header.
    gr.Markdown(TITLE)
    gr.Markdown(DESCRIPTION)

    with gr.Tab(label="Florence-2 SD3 Prompts"):
        # Left column: image input and submit button; right column: caption.
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        # Example images wired to the same captioning function.
        gr.Examples(
            examples=example_images,
            fn=run_example,
            inputs=[input_img],
            outputs=[output_text],
            label='Try captioning on below examples',
        )

        # Clicking the button captions the current image into the textbox.
        submit_btn.click(
            fn=run_example,
            inputs=[input_img],
            outputs=[output_text],
        )

demo.launch(debug=True)