# clip-spanish / test_on_image.py
# Commit 04899df (edugp): Switch to BERTIN model for training script and testing on image
# File size: 1.41 kB
import os
import jax
import torch
from torchvision.io import ImageReadMode, read_image
from transformers import AutoTokenizer
from modeling_hybrid_clip import FlaxHybridCLIP
from run_hybrid_clip import Transform
def prepare_image(image_path, model):
    """Load an image from disk and turn it into a single-item NumPy batch.

    The image is read in RGB mode, passed through a TorchScript-compiled
    ``Transform`` built for ``model.config.vision_config.image_size``, wrapped
    in a batch of one, and returned as a NumPy array with axes reordered via
    ``permute(0, 2, 3, 1)``.

    Args:
        image_path: Path of the image file to read.
        model: Object exposing ``config.vision_config.image_size``.

    Returns:
        The preprocessed image as a NumPy array with a leading batch axis
        of length 1.
    """
    raw = read_image(image_path, mode=ImageReadMode.RGB)

    image_size = model.config.vision_config.image_size
    scripted_transform = torch.jit.script(Transform(image_size))
    transformed = scripted_transform(raw)

    # Stack the single image into a batch, then move axis 1 to the end.
    # NOTE(review): presumably this converts channels-first (N, C, H, W) to
    # channels-last (N, H, W, C) for the Flax model -- confirm against
    # Transform's output layout.
    batch = torch.stack([transformed])
    return batch.permute(0, 2, 3, 1).numpy()
def prepare_text(text, tokenizer):
    """Tokenize *text*, asking the tokenizer for NumPy outputs.

    Args:
        text: The input forwarded unchanged as the tokenizer's first argument.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="np")``.

    Returns:
        Whatever the tokenizer call returns.
    """
    encoded = tokenizer(text, return_tensors="np")
    return encoded
def run_inference(image_path, text, model, tokenizer):
    """Score one image against one text with the given model.

    The image and text are prepared with ``prepare_image`` and
    ``prepare_text``, the model is called in non-training mode, and the
    sigmoid of the ``logits_per_image`` output is taken.

    Args:
        image_path: Path of the image file to score.
        text: Text to compare the image against.
        model: Callable model; also passed to ``prepare_image``.
        tokenizer: Tokenizer passed to ``prepare_text``.

    Returns:
        The ``[0][0]`` element of ``sigmoid(logits_per_image)``.
    """
    pixel_values = prepare_image(image_path, model)
    encoded = prepare_text(text, tokenizer)

    outputs = model(
        encoded["input_ids"],
        pixel_values,
        attention_mask=encoded["attention_mask"],
        train=False,
        return_dict=True,
    )

    image_logits = outputs["logits_per_image"]
    # Only one image and one text are supplied, so take the first entry.
    return jax.nn.sigmoid(image_logits)[0][0]
if __name__ == "__main__":
    # Load the model weights/config from the current working directory.
    model = FlaxHybridCLIP.from_pretrained("./")
    tokenizer = AutoTokenizer.from_pretrained(
        "bertin-project/bertin-roberta-base-spanish"
    )

    # Sample image under the current user's /home directory; this raises
    # KeyError if the USER environment variable is not set.
    image_path = f"/home/{os.environ['USER']}/data/wit_scale_converted/Santuar.jpg"
    text = "Fachada del Santuario"

    score = run_inference(image_path, text, model, tokenizer)
    print(score)