from PIL import Image
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, BlipForConditionalGeneration
import torch
from torch.autograd import Variable as V
from torchvision import transforms as trn
from torch.nn import functional as F
import numpy as np
import cv2
def recursion_change_bn(module):
    # restore track_running_stats on BatchNorm2d layers so a checkpoint saved
    # with an older PyTorch version loads into the current module definitions
    if isinstance(module, torch.nn.BatchNorm2d):
        module.track_running_stats = 1
    else:
        for i, (name, module1) in enumerate(module._modules.items()):
            module1 = recursion_change_bn(module1)
    return module
def load_labels():
    # prepare all the labels
    # scene category relevant
    file_name_category = 'categories_places365.txt'
    classes = list()
    with open(file_name_category) as class_file:
        for line in class_file:
            classes.append(line.strip().split(' ')[0][3:])
    classes = tuple(classes)

    # indoor and outdoor relevant
    file_name_IO = 'IO_places365.txt'
    with open(file_name_IO) as f:
        lines = f.readlines()
        labels_IO = []
        for line in lines:
            items = line.rstrip().split()
            labels_IO.append(int(items[-1]) - 1)  # 0 is indoor, 1 is outdoor
    labels_IO = np.array(labels_IO)

    # scene attribute relevant
    file_name_attribute = 'labels_sunattribute.txt'
    with open(file_name_attribute) as f:
        lines = f.readlines()
    labels_attribute = [item.rstrip() for item in lines]
    file_name_W = 'W_sceneattribute_wideresnet18.npy'
    W_attribute = np.load(file_name_W)

    return classes, labels_IO, labels_attribute, W_attribute
def hook_feature(module, input, output):
    # store the feature map captured by the forward hook (a hook's return value
    # is ignored, so the result must be appended to the global list instead)
    features_blobs.append(np.squeeze(output.data.cpu().numpy()))
def returnCAM(feature_conv, weight_softmax, class_idx):
    # generate the class activation maps, upsampled to 256x256
    size_upsample = (256, 256)
    nc, h, w = feature_conv.shape
    output_cam = []
    for idx in class_idx:
        cam = weight_softmax[idx].dot(feature_conv.reshape((nc, h * w)))
        cam = cam.reshape(h, w)
        cam = cam - np.min(cam)
        cam_img = cam / np.max(cam)
        cam_img = np.uint8(255 * cam_img)
        output_cam.append(cv2.resize(cam_img, size_upsample))
    return output_cam
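# Note: returnCAM (and the W_attribute matrix loaded by load_labels) comes from the
# Places365 unified demo; it is defined here but not used by predict() below, which
# only reports the indoor/outdoor vote and the top-5 scene categories.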
def returnTF():
    # load the image transformer
    tf = trn.Compose([
        trn.Resize((224, 224)),
        trn.ToTensor(),
        trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    return tf
def load_model():
    # this model has a last conv feature map of 14x14
    model_file = 'wideresnet18_places365.pth.tar'
    import wideresnet
    model = wideresnet.resnet18(num_classes=365)
    checkpoint = torch.load(model_file, map_location=lambda storage, loc: storage)
    state_dict = {str.replace(k, 'module.', ''): v for k, v in checkpoint['state_dict'].items()}
    model.load_state_dict(state_dict)

    # hacky way to deal with the upgraded batchnorm2D and avgpool layers...
    for i, (name, module) in enumerate(model._modules.items()):
        module = recursion_change_bn(module)
    model.avgpool = torch.nn.AvgPool2d(kernel_size=14, stride=1, padding=0)
    model.eval()

    # hook the feature extractor on the last conv layer and the avgpool layer
    features_names = ['layer4', 'avgpool']
    for name in features_names:
        model._modules.get(name).register_forward_hook(hook_feature)
    return model
# load the labels
classes, labels_IO, labels_attribute, W_attribute = load_labels()

# load the model
features_blobs = []
model = load_model()

# load the transformer
tf = returnTF()  # image transformer

# get the softmax weight
params = list(model.parameters())
weight_softmax = params[-2].data.numpy()
weight_softmax[weight_softmax < 0] = 0
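# params[-2] is the weight of the final fully connected layer; clamping negative
# weights to zero keeps only positive evidence, the usual convention when these
# weights are reused for class activation mapping.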
def predict(img):
    features_blobs.clear()  # drop feature maps captured on previous calls
    input_img = V(tf(img).unsqueeze(0))

    # forward pass
    logit = model.forward(input_img)
    h_x = F.softmax(logit, 1).data.squeeze()
    probs, idx = h_x.sort(0, True)
    probs = probs.numpy()
    idx = idx.numpy()

    # vote for indoor vs. outdoor using the top-10 categories
    io_image = np.mean(labels_IO[idx[:10]])
    env_image = []
    if io_image < 0.5:
        env_image.append('Indoor')
    else:
        env_image.append('Outdoor')

    # output the prediction of scene category
    scene_cat = []
    for i in range(0, 5):
        scene_cat.append('{:.3f} -> {}'.format(probs[i], classes[idx[i]]))
    return env_image, scene_cat
git_processor = AutoProcessor.from_pretrained("microsoft/git-large-r-textcaps")
git_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-r-textcaps")

blip_processor = AutoProcessor.from_pretrained("jaimin/Imagecap")
blip_model = BlipForConditionalGeneration.from_pretrained("jaimin/Imagecap")

device = "cuda" if torch.cuda.is_available() else "cpu"
git_model.to(device)
blip_model.to(device)
def generate_caption(processor, model, image, use_float_16=False):
    inputs = processor(images=image, return_tensors="pt").to(device)
    if use_float_16:
        inputs = inputs.to(torch.float16)
    generated_ids = model.generate(pixel_values=inputs.pixel_values, max_length=50)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_caption
def generate_captions(image):
    caption_git = generate_caption(git_processor, git_model, image)
    caption_blip = generate_caption(blip_processor, blip_model, image)
    env, scene = predict(image)
    return env, scene, caption_git, caption_blip
outputs = [gr.Textbox(label="Environment"),
           gr.Textbox(label="Scene categories"),
           gr.Textbox(label="Caption generated by GIT"),
           gr.Textbox(label="Caption generated by BLIP")]

title = "Image Cap with Scene"
description = "Image caption with scene"

interface = gr.Interface(fn=generate_captions,
                         inputs=gr.Image(type="pil"),
                         outputs=outputs,
                         title=title,
                         description=description)
interface.queue()
interface.launch(debug=True)
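# A minimal sketch of the assets this script expects alongside it (assumed from the
# code above, not an exhaustive list):
#   wideresnet.py                        -- Places365 WideResNet-18 model definition
#   wideresnet18_places365.pth.tar       -- pretrained Places365 checkpoint
#   categories_places365.txt             -- scene category names
#   IO_places365.txt                     -- indoor/outdoor label per category
#   labels_sunattribute.txt              -- SUN attribute names
#   W_sceneattribute_wideresnet18.npy    -- attribute weight matrix
# and, as a requirements.txt sketch: gradio, transformers, torch, torchvision,
# opencv-python-headless, numpy, Pillow.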