from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
import torch
from torch import nn
import numpy as np
from palette import ade_palette
from PIL import Image
import gradio as gr

# Load the feature extractor and model once at startup instead of on every
# request. Note: "nvidia/mit-b5" is only the ImageNet-pretrained encoder, so
# its segmentation head would be randomly initialized; to get meaningful
# ADE20K predictions (matching ade_palette), use a checkpoint fine-tuned on
# ADE20K, such as the one below.
feature_extractor = SegformerFeatureExtractor.from_pretrained(
    "nvidia/segformer-b5-finetuned-ade-640-640"
)
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/segformer-b5-finetuned-ade-640-640"
)
model.eval()


def seg(image):
    image = image.convert("RGB")  # ensure 3 channels for the blend below
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # The model outputs logits of shape (batch_size, num_labels, height/4, width/4).
    # First, rescale the logits to the original image size with bilinear
    # interpolation; PIL's image.size is (width, height), so reverse it.
    logits = nn.functional.interpolate(
        outputs.logits.detach().cpu(),
        size=image.size[::-1],  # (height, width)
        mode="bilinear",
        align_corners=False,
    )

    # Second, take the argmax over the class dimension to get a label map.
    seg_map = logits.argmax(dim=1)[0].numpy()

    # Build a color map from the ADE20K palette and draw it over the image.
    color_seg = np.zeros((seg_map.shape[0], seg_map.shape[1], 3), dtype=np.uint8)  # (height, width, 3)
    palette = np.array(ade_palette())
    for label, color in enumerate(palette):
        color_seg[seg_map == label, :] = color

    # Convert to BGR
    color_seg = color_seg[..., ::-1]

    # Blend the color map with the original image at 50% opacity.
    img = np.array(image) * 0.5 + color_seg * 0.5
    img = img.astype(np.uint8)
    return Image.fromarray(img)


# gr.inputs / gr.outputs were removed in Gradio 4; use gr.Image directly.
iface = gr.Interface(
    fn=seg,
    inputs=gr.Image(type="pil"),
    outputs=gr.Image(type="pil"),
)
iface.launch()
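# Optional: a minimal sketch of calling seg() directly, without the Gradio UI.
# The file names here are hypothetical; any RGB image works. Kept as comments
# since iface.launch() above blocks until the server is stopped.
#
#   overlay = seg(Image.open("example.jpg"))
#   overlay.save("example_segmented.png")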