geonmo.gu committed
Commit 4b145c8 • 1 Parent(s): fba8607

add description

Changed files:
- .gitignore +2 -0
- app.py +17 -5
.gitignore ADDED
@@ -0,0 +1,2 @@
+*.swp
+*.pt
app.py CHANGED
@@ -3,8 +3,6 @@ import torch
 import gradio as gr
 import time
 import clip
-#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-#from flores200_codes import flores_codes
 import requests
 import csv
 import json
@@ -22,7 +20,6 @@ os.environ['CUDA_VISIBLE_DEVICES'] = ''
 
 API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
 HF_TOKEN = os.environ["HF_TOKEN"]
-headers = {"Authorization": f"Bearer {HF_TOKEN}"}
 
 def load_openimage_classnames(csv_path):
     csv_data = open(csv_path)
@@ -261,8 +258,23 @@ if __name__ == '__main__':
 
     title = "Socratic models for image captioning with BLOOM"
 
-
-
+    description = """
+## Details
+**Without any fine-tuning**, we can do image captioning with visual-language models (e.g., CLIP, SLIP, ...) and large language models (e.g., GPT, BLOOM, ...).
+In this demo, I choose BLOOM as the language model and CLIP ViT-L/14 as the visual-language model.
+The image caption is generated as follows:
+1. Using the visual-language model, classify whether there are people in the input image, where the location is, and what objects are present.
+2. Then, build a prompt from the classification results.
+3. Query the BLOOM API with the prompt.
+
+This demo differs slightly from the original method proposed in the Socratic Models paper:
+I use not only the Tencent ML class names but also the OpenImages class names, and I adopt BLOOM as the large language model.
+
+If you want a demo that uses GPT-3 from OpenAI, check https://github.com/geonm/socratic-models-demo.
+
+The demo runs on CPU.
+"""
+
     article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
     examples = ['k21-1.jpg']
 
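The description added in this commit outlines a three-step pipeline: zero-shot classification with CLIP, prompt construction, and a call to the hosted BLOOM API. The sketch below is a minimal illustration of how those steps can be wired together with the packages app.py already imports (clip, torch, requests). The candidate phrasings, the prompt template, the generation parameters, and the helper names (`rank_texts`, `caption`, `object_classnames`, `place_names`) are assumptions made for this sketch, not the exact code in app.py.

```python
import os

import clip
import requests
import torch
from PIL import Image

API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]


def rank_texts(model, preprocess, image_path, candidates, device="cpu", topk=1):
    """Zero-shot CLIP scoring: return the top-k candidate texts for an image."""
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    tokens = clip.tokenize(candidates).to(device)
    with torch.no_grad():
        image_feat = model.encode_image(image)
        text_feat = model.encode_text(tokens)
    image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
    sims = (image_feat @ text_feat.T).squeeze(0)  # cosine similarities
    best = sims.topk(min(topk, len(candidates))).indices.tolist()
    return [candidates[i] for i in best]


def caption(image_path, object_classnames, place_names):
    device = "cpu"  # the Space runs on CPU
    model, preprocess = clip.load("ViT-L/14", device=device)

    # Step 1: classify people / place / objects with the visual-language model.
    people = rank_texts(model, preprocess, image_path,
                        ["there are people", "there are no people"], device)[0]
    place = rank_texts(model, preprocess, image_path, place_names, device)[0]
    objects = rank_texts(model, preprocess, image_path, object_classnames,
                         device, topk=3)

    # Step 2: build a prompt from the classification results
    # (hypothetical template; the wording in app.py may differ).
    prompt = (
        "I am an intelligent image captioning bot. "
        f"This photo was taken at a {place}. In this photo, {people}. "
        f"I can see {', '.join(objects)}. "
        "A creative short caption I can generate to describe this photo is:"
    )

    # Step 3: query the hosted BLOOM model through the HF Inference API.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 32, "temperature": 0.7,
                       "return_full_text": False},
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()
    # Text-generation endpoints typically return [{"generated_text": ...}].
    return response.json()[0]["generated_text"].strip()
```

In the Space itself, the candidate object names would come from the Tencent ML and OpenImages class lists (e.g., via `load_openimage_classnames`); the sketch simply takes them as arguments, and a real app would batch or cache the text features for such large class lists rather than re-encoding them per image.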