nielsr HF staff committed on
Commit
3381383
1 Parent(s): e9848d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import ViltProcessor, ViltForNaturalLanguageVisualReasoning
3
  import torch
4
 
5
  # NLVR2 example images
@@ -8,16 +8,17 @@ torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg',
8
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_1.jpg', 'image3.jpg')
9
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_6.jpg', 'image4.jpg')
10
 
11
- processor = ViltProcessor.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
12
- model = ViltForNaturalLanguageVisualReasoning.from_pretrained("nielsr/vilt-b32-finetuned-nlvr2")
13
 
14
  def predict(image1, image2, text):
15
- encoding_1 = processor(image1, text, return_tensors="pt")
16
- encoding_2 = processor(image2, text, return_tensors="pt")
 
17
 
18
  # forward pass
19
  with torch.no_grad():
20
- outputs = model(input_ids=encoding_1.input_ids, pixel_values=encoding_1.pixel_values, pixel_values_2=encoding_2.pixel_values)
21
 
22
  logits = outputs.logits
23
  probs = torch.nn.functional.softmax(logits, dim=1)
 
1
  import gradio as gr
2
+ from transformers import ViltProcessor, ViltForImagesAndTextClassification
3
  import torch
4
 
5
  # NLVR2 example images
 
8
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_1.jpg', 'image3.jpg')
9
  torch.hub.download_url_to_file('https://lil.nlp.cornell.edu/nlvr/exs/acorns_6.jpg', 'image4.jpg')
10
 
11
+ processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
12
+ model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
13
 
14
  def predict(image1, image2, text):
15
+ # prepare inputs
16
+ encoding = processor([image1, image2], text, return_tensors="pt")
17
+ pixel_values = encoding.pixel_values.unsqueeze(0)
18
 
19
  # forward pass
20
  with torch.no_grad():
21
+ outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
22
 
23
  logits = outputs.logits
24
  probs = torch.nn.functional.softmax(logits, dim=1)