import math
import pickle
import random

import gradio as gr
import pandas as pd
import torch
import torch.nn as nn
import torchvision
from PIL import Image
from torch.utils.data import DataLoader
from torchvision import transforms
from tqdm import tqdm

device = torch.device('cpu')
max_seq_len = 67

# Vocabulary lookups saved at training time.
with open('index_to_word.pkl', 'rb') as handle:
    index_to_word = pickle.load(handle)
with open('word_to_index.pkl', 'rb') as handle:
    word_to_index = pickle.load(handle)

# Pretrained ResNet-18; its layer4 feature maps serve as the image encoder.
resnet18 = torchvision.models.resnet18(pretrained=True).to(device)
resnet18.eval()
resNet18Layer4 = resnet18._modules.get('layer4').to(device)


def create_df(img):
    return pd.DataFrame({"image": [img]})


def get_vector(t_img):
    # Capture the (1, 512, 7, 7) layer4 activation via a forward hook.
    my_embedding = torch.zeros(1, 512, 7, 7)

    def copy_data(m, i, o):
        my_embedding.copy_(o.data)

    h = resNet18Layer4.register_forward_hook(copy_data)
    with torch.no_grad():
        resnet18(t_img)
    h.remove()
    return my_embedding


class ExtractImageFeatureResNetDataSet:
    def __init__(self, data):
        self.data = data
        self.scaler = transforms.Resize([224, 224])
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                              std=[0.229, 0.224, 0.225])
        self.to_tensor = transforms.ToTensor()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_name = self.data.iloc[idx]['image']
        img = Image.open(str(image_name)).convert('RGB')  # force 3 channels
        t_img = self.normalize(self.to_tensor(self.scaler(img)))
        return image_name, t_img


def feature_extractor(df):
    extract_imgFtr_ResNet_input = {}
    input_ImageDataset_ResNet = ExtractImageFeatureResNetDataSet(df[['image']])
    input_ImageDataloader_ResNet = DataLoader(input_ImageDataset_ResNet,
                                              batch_size=1, shuffle=False)
    for image_name, t_img in tqdm(input_ImageDataloader_ResNet):
        t_img = t_img.to(device)
        extract_imgFtr_ResNet_input[image_name[0]] = get_vector(t_img)
    return extract_imgFtr_ResNet_input


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=max_seq_len):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float()
                             * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (batch, seq_len, d_model); the buffer broadcasts over the batch dim.
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)
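# Quick shape sanity check for PositionalEncoding (illustrative sketch; the
# d_model value and batch size here are arbitrary, not taken from training):
#
#   pe_layer = PositionalEncoding(d_model=512)
#   x = torch.zeros(2, max_seq_len, 512)   # (batch, seq_len, d_model)
#   assert pe_layer(x).shape == x.shape    # encodings are added element-wise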
class ImageCaptionModel(nn.Module):
    def __init__(self, n_head, n_decoder_layer, vocab_size, embedding_size):
        super(ImageCaptionModel, self).__init__()
        self.pos_encoder = PositionalEncoding(embedding_size, 0.1)
        self.TransformerDecoderLayer = nn.TransformerDecoderLayer(d_model=embedding_size,
                                                                  nhead=n_head)
        self.TransformerDecoder = nn.TransformerDecoder(decoder_layer=self.TransformerDecoderLayer,
                                                        num_layers=n_decoder_layer)
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.last_linear_layer = nn.Linear(embedding_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.last_linear_layer.bias.data.zero_()
        self.last_linear_layer.weight.data.uniform_(-initrange, initrange)

    def generate_Mask(self, size, decoder_inp):
        # Causal mask: position i may only attend to positions <= i.
        decoder_input_mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        decoder_input_mask = decoder_input_mask.float() \
            .masked_fill(decoder_input_mask == 0, float('-inf')) \
            .masked_fill(decoder_input_mask == 1, float(0.0))
        # Padding masks: token index 0 is the pad token.
        decoder_input_pad_mask = decoder_inp.float() \
            .masked_fill(decoder_inp == 0, float(0.0)) \
            .masked_fill(decoder_inp > 0, float(1.0))
        decoder_input_pad_mask_bool = decoder_inp == 0
        return decoder_input_mask, decoder_input_pad_mask, decoder_input_pad_mask_bool

    def forward(self, encoded_image, decoder_inp):
        # Memory: (batch, 49, 512) -> (49, batch, 512) for the seq-first decoder.
        encoded_image = encoded_image.permute(1, 0, 2)
        decoder_inp_embed = self.embedding(decoder_inp) * math.sqrt(self.embedding_size)
        decoder_inp_embed = self.pos_encoder(decoder_inp_embed)
        decoder_inp_embed = decoder_inp_embed.permute(1, 0, 2)
        decoder_input_mask, decoder_input_pad_mask, decoder_input_pad_mask_bool = \
            self.generate_Mask(decoder_inp.size(1), decoder_inp)
        decoder_input_mask = decoder_input_mask.to(device)
        decoder_input_pad_mask = decoder_input_pad_mask.to(device)
        decoder_input_pad_mask_bool = decoder_input_pad_mask_bool.to(device)
        decoder_output = self.TransformerDecoder(tgt=decoder_inp_embed,
                                                 memory=encoded_image,
                                                 tgt_mask=decoder_input_mask,
                                                 tgt_key_padding_mask=decoder_input_pad_mask_bool)
        final_output = self.last_linear_layer(decoder_output)
        return final_output, decoder_input_pad_mask


def generate_caption(K, img_nm, extract_imgFtr_ResNet_input):
    model.eval()
    # (1, 512, 7, 7) -> (1, 49, 512): one 512-d memory vector per spatial cell.
    img_embed = extract_imgFtr_ResNet_input[img_nm].to(device)
    img_embed = img_embed.permute(0, 2, 3, 1)
    img_embed = img_embed.view(img_embed.size(0), -1, img_embed.size(3))

    input_seq = [pad_token] * max_seq_len
    input_seq[0] = start_token
    input_seq = torch.tensor(input_seq).unsqueeze(0).to(device)
    predicted_sentence = []
    with torch.no_grad():
        # max_seq_len - 1 steps, so input_seq[:, eval_iter + 1] stays in bounds.
        for eval_iter in range(max_seq_len - 1):
            output, padding_mask = model(img_embed, input_seq)
            output = output[eval_iter, 0, :]
            # Sample the next word from the top-K candidates, weighted by their
            # softmax probabilities (random.choices needs non-negative weights).
            probs = torch.softmax(output, dim=-1)
            topk = torch.topk(probs, K)
            next_word_index = random.choices(topk.indices.tolist(),
                                             topk.values.tolist(), k=1)[0]
            next_word = index_to_word[next_word_index]
            input_seq[:, eval_iter + 1] = next_word_index
            if next_word_index == end_token:
                break
            predicted_sentence.append(next_word)
    return " ".join(predicted_sentence + ["."])


model = torch.load('./BestModel_20000_Datos', map_location=device)

# NOTE: the special-token strings below are assumptions; the original
# angle-bracketed markers appear to have been stripped from the source.
start_token = word_to_index['<start>']
end_token = word_to_index['<end>']
pad_token = word_to_index['<pad>']


def predict(inp):
    # Extract ResNet features for the uploaded image, then decode a caption.
    df = create_df(inp)
    extract_imgFtr_ResNet_input = feature_extractor(df)
    return generate_caption(1, inp, extract_imgFtr_ResNet_input)


gr.Interface(fn=predict,
             inputs=gr.Image(type="filepath"),
             outputs=gr.Text(),
             title="Clothing captioning model",
             description="A clothing image-captioning model to get descriptions of your clothes.\n"
                         "Take your phone, snap a picture of your clothes, upload it, "
                         "and you are ready to go.").launch()
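# Headless usage sketch (assumption: a local image file such as 'shirt.jpg'
# exists; the filename is purely illustrative). Call predict() directly
# instead of .launch() to test the pipeline without the Gradio UI:
#
#   caption = predict('shirt.jpg')
#   print(caption)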