geonmo.gu committed
Commit 4b145c8 • 1 Parent(s): fba8607

add description

Changed files:
- .gitignore +2 -0
- app.py +17 -5
.gitignore ADDED
@@ -0,0 +1,2 @@
+*.swp
+*.pt
app.py CHANGED
@@ -3,8 +3,6 @@ import torch
 import gradio as gr
 import time
 import clip
-#from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-#from flores200_codes import flores_codes
 import requests
 import csv
 import json
@@ -22,7 +20,6 @@ os.environ['CUDA_VISIBLE_DEVICES'] = ''
 
 API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
 HF_TOKEN = os.environ["HF_TOKEN"]
-headers = {"Authorization": f"Bearer {HF_TOKEN}"}
 
 def load_openimage_classnames(csv_path):
     csv_data = open(csv_path)
@@ -261,8 +258,23 @@ if __name__ == '__main__':
 
     title = "Socratic models for image captioning with BLOOM"
 
-
-
+    description = """
+## Details
+**Without any fine-tuning**, we can do image captioning with visual-language models (e.g., CLIP, SLIP, ...) and large language models (e.g., GPT, BLOOM, ...).
+In this demo, I choose BLOOM as the language model and CLIP ViT-L/14 as the visual-language model.
+The image caption is generated as follows:
+1. Using the visual-language model, classify whether there are people in the input image, where the location is, and what objects are present.
+2. Then, build a prompt from the classification results.
+3. Query the BLOOM API with the prompt.
+
+This demo differs slightly from the original method proposed in the Socratic Models paper:
+I use not only the Tencent ML class names but also the OpenImages class names, and I adopt BLOOM as the large language model.
+
+If you want a demo that uses GPT-3 from OpenAI, check https://github.com/geonm/socratic-models-demo.
+
+The demo runs on CPU.
+"""
+
     article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2204.00598'>Socratic Models: Composing Zero-Shot Multimodal Reasoning with Language</a></p>"
     examples = ['k21-1.jpg']
 
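The description added in this commit outlines a three-step pipeline: zero-shot classification with CLIP, prompt construction, and a call to the hosted BLOOM API. The sketch below is a minimal illustration of how those steps can be wired together with the packages app.py already imports (clip, torch, requests). The candidate phrasings, the prompt template, the generation parameters, and the helper names (`rank_texts`, `caption`, `object_classnames`, `place_names`) are assumptions made for this sketch, not the exact code in app.py.

```python
import os

import clip
import requests
import torch
from PIL import Image

API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
HF_TOKEN = os.environ["HF_TOKEN"]


def rank_texts(model, preprocess, image_path, candidates, device="cpu", topk=1):
    """Zero-shot CLIP scoring: return the top-k candidate texts for an image."""
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    tokens = clip.tokenize(candidates).to(device)
    with torch.no_grad():
        image_feat = model.encode_image(image)
        text_feat = model.encode_text(tokens)
    image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)
    text_feat = text_feat / text_feat.norm(dim=-1, keepdim=True)
    sims = (image_feat @ text_feat.T).squeeze(0)  # cosine similarities
    best = sims.topk(min(topk, len(candidates))).indices.tolist()
    return [candidates[i] for i in best]


def caption(image_path, object_classnames, place_names):
    device = "cpu"  # the Space runs on CPU
    model, preprocess = clip.load("ViT-L/14", device=device)

    # Step 1: classify people / place / objects with the visual-language model.
    people = rank_texts(model, preprocess, image_path,
                        ["there are people", "there are no people"], device)[0]
    place = rank_texts(model, preprocess, image_path, place_names, device)[0]
    objects = rank_texts(model, preprocess, image_path, object_classnames,
                         device, topk=3)

    # Step 2: build a prompt from the classification results
    # (hypothetical template; the wording in app.py may differ).
    prompt = (
        "I am an intelligent image captioning bot. "
        f"This photo was taken at a {place}. In this photo, {people}. "
        f"I can see {', '.join(objects)}. "
        "A creative short caption I can generate to describe this photo is:"
    )

    # Step 3: query the hosted BLOOM model through the HF Inference API.
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {"max_new_tokens": 32, "temperature": 0.7,
                       "return_full_text": False},
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()
    # Text-generation endpoints typically return [{"generated_text": ...}].
    return response.json()[0]["generated_text"].strip()
```

In the Space itself, the candidate object names would come from the Tencent ML and OpenImages class lists (e.g., via `load_openimage_classnames`); the sketch simply takes them as arguments, and a real app would batch or cache the text features for such large class lists rather than re-encoding them per image.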