Upload 6 files
output/cal_f1.py
ADDED
@@ -0,0 +1,75 @@
import pandas as pd


def cal_f1(df, standard=False):
    # Ground-truth terms: ';'-separated strings, normalized to lower case.
    df['label_list'] = df['label'].apply(lambda x: [i.strip().lower() for i in x.split(';')])
    # df['pred_list_go'] = df['pred'].apply(lambda x: [i.strip() for i in x.split(';')])
    if standard:
        # Standard format: the whole cell is a literal list of (term, prob) tuples.
        df['pred_list'] = df['pred'].apply(lambda x: [i[0] for i in eval(str(x))])
    else:
        # Otherwise: ';'-separated "(term, prob)" tuples.
        df['pred_list_prob'] = df['pred'].apply(lambda x: [eval(i.strip()) for i in str(x).split(';')])
        df['pred_list'] = df['pred_list_prob'].apply(lambda x: [i[0] for i in x])

    labels = []
    pred_labels = []
    for l in df['label_list']:
        labels.extend(l)

    label_count = {}
    for x in labels:
        if x not in label_count:
            label_count[x] = 1
        else:
            label_count[x] += 1

    labels = list(set(labels))
    total = len(labels)
    # Per-term true-positive, false-positive, and false-negative counts.
    tp_dict, fp_dict, fn_dict = dict(zip(labels, [0] * len(labels))), dict(zip(labels, [0] * len(labels))), dict(
        zip(labels, [0] * len(labels)))
    for preds, label in zip(df['pred_list'], df['label_list']):
        for t in label:
            # supgo = godb.get_anchestors(t)
            # if supgo.intersection(set(preds)):
            if t in preds:
                tp_dict[t] += 1
            else:
                fn_dict[t] += 1
        for p in preds:
            # supgo = godb.get_anchestors(p)
            # if not supgo.intersection(set(label)):
            if p not in label:
                if p in fp_dict:
                    fp_dict[p] += 1
                else:
                    fp_dict[p] = 1
        pred_labels.extend(preds)
    p_total = len(set(pred_labels))
    # Macro-average recall over ground-truth terms and precision over predicted terms.
    recall, pr = 0., 0.
    for x in labels:
        recall += tp_dict[x] / (1.0 * (tp_dict[x] + fn_dict[x] + 1e-8))
        pr += tp_dict[x] / (1.0 * (tp_dict[x] + fp_dict[x] + 1e-8))
    r = recall / total
    p = pr / p_total
    f1 = 2 * p * r / (p + r + 1e-8)

    print("preds not in labels: {}".format(len(list(fp_dict.keys())) - total))
    print("recall: {}; precision: {}; f1 score: {}".format(r, p, f1))


names = ['output_test_mf_exp_493552.txt', 'output_test_mf_exp_445772_pre.txt', 'output_test_mf_exp_445772.txt', 'output_test_mf_exp_486524.txt', 'output_test_mf_493552_standard.csv', 'output_test_mf_445772_standard.csv', 'output_test_mf_exp_445772_withprompt.txt', 'output_test_mf_exp_506753.txt']
# names = ['output_test_bp_exp_451674.txt', 'output_test_bp_exp_493547_pre.txt', 'output_test_bp_exp_496359_withprompt.txt']

for name in names:
    print(name)
    df = pd.read_csv('/cluster/home/wenkai/LAVIS/output/mf_bp_cc/{}'.format(name), sep='|', header=None)
    if df.iloc[0, 0] == 'name':
        df = df[1:]
    # print(df.shape)
    df.columns = ['name', 'pred', 'label']
    if 'standard' in name:
        cal_f1(df, standard=True)
    else:
        cal_f1(df)
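For reference, a minimal sketch of the input this script expects, with `cal_f1` defined as above; the rows, protein names, and GO-style terms below are made up for illustration:

```python
import pandas as pd

# Hypothetical two-row example in the layout the script reads from its
# pipe-separated files: 'label' holds ';'-separated ground-truth terms,
# 'pred' holds ';'-separated "(term, probability)" tuples.
df = pd.DataFrame({
    'name': ['P12345', 'Q67890'],
    'pred': ["('go:0003674', 0.91); ('go:0005515', 0.40)", "('go:0016787', 0.88)"],
    'label': ['go:0003674; go:0008150', 'go:0016787'],
})
cal_f1(df)  # prints macro-averaged recall, precision, and F1 over the unique terms
```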
output/output_test_mf_445772_standard.csv
ADDED
The diff for this file is too large to render.
output/output_val_mf_445772_standard.csv
ADDED
The diff for this file is too large to render.
projects/blip2/README.md
ADDED
@@ -0,0 +1,168 @@
## BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models
This is the official implementation of the BLIP-2 [paper](https://arxiv.org/abs/2301.12597), a generic and efficient pre-training strategy that bootstraps vision-language pre-training from off-the-shelf frozen pretrained image encoders and frozen large language models (LLMs). BLIP-2 beats Flamingo on zero-shot VQAv2 (**65.0** vs **56.3**) and establishes a new state of the art on zero-shot captioning (**121.6** CIDEr on NoCaps vs the previous best of **113.2**). Equipped with powerful LLMs (e.g. OPT, FlanT5), BLIP-2 also unlocks new **zero-shot instructed vision-to-language generation** capabilities for a variety of applications!

<img src="blip2_illustration.png" width="500">

### Install:
```
pip install salesforce-lavis
```
or install from source following the LAVIS instructions.

### Demo:
Try out our [Notebook Demo](https://github.com/salesforce/LAVIS/blob/main/examples/blip2_instructed_generation.ipynb) on instructed vision-to-language generation: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/salesforce/LAVIS/blob/main/examples/blip2_instructed_generation.ipynb)


### BLIP-2 Model Zoo
```python
# ==================================================
# Architectures      Types
# ==================================================
# blip2_opt          pretrain_opt2.7b, caption_coco_opt2.7b, pretrain_opt6.7b, caption_coco_opt6.7b
# blip2_t5           pretrain_flant5xl, caption_coco_flant5xl, pretrain_flant5xxl
# blip2              pretrain, coco
```
- Use ```pretrain_{LLM}``` model types for zero-shot image-to-text generation with prompts.
- Use ```caption_coco_{LLM}``` model types to generate COCO-style captions.
- Use the ```blip2``` model architecture for image-text feature extraction and retrieval.

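To see which architectures and model types are registered in your installed LAVIS version (the table above may lag behind the library), you can print the built-in model zoo:

```python
from lavis.models import model_zoo

# Prints a table of registered architectures and their model types,
# including the blip2, blip2_opt, and blip2_t5 entries listed above.
print(model_zoo)
```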
### Image-to-text Generation Example
Let's see how to use BLIP-2 models to perform zero-shot instructed image-to-text generation. We first load a sample image from a local file.
```python
import torch
from PIL import Image
# setup device to use
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
# load sample image
raw_image = Image.open("../../docs/_static/merlion.png").convert("RGB")
display(raw_image.resize((596, 437)))
```

Then we load a pre-trained BLIP-2 model with its preprocessors (transforms).
```python
import torch
from lavis.models import load_model_and_preprocess
# loads BLIP-2 pre-trained model
model, vis_processors, _ = load_model_and_preprocess(name="blip2_t5", model_type="pretrain_flant5xxl", is_eval=True, device=device)
# prepare the image
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
```

Given the image and a text prompt, ask the model to generate a response.
```python
model.generate({"image": image, "prompt": "Question: which city is this? Answer:"})
# 'singapore'
```

Ask the model to explain its answer.
```python
model.generate({
    "image": image,
    "prompt": "Question: which city is this? Answer: singapore. Question: why?"})
# 'it has a statue of a merlion'
```

Ask a follow-up question.
```python
# prepare context prompt
context = [
    ("which city is this?", "singapore"),
    ("why?", "it has a statue of a merlion"),
]
question = "where is the name merlion coming from?"
template = "Question: {} Answer: {}."
prompt = " ".join([template.format(context[i][0], context[i][1]) for i in range(len(context))]) + " Question: " + question + " Answer:"
print(prompt)
# generate model's response
model.generate({"image": image, "prompt": prompt})
# 'merlion is a portmanteau of mermaid and lion'
```

### Feature Extraction Example
BLIP-2 supports the Unified Feature Extraction Interface of LAVIS. Check out this [notebook](https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_feature_extraction.ipynb) for an example.

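As a rough sketch of how that interface is typically used (the `blip2_feature_extractor` name and the `mode` values mirror LAVIS's feature-extraction examples and should be treated as assumptions here; `raw_image` and `device` are reused from the generation example above):

```python
from lavis.models import load_model_and_preprocess

# Assumed registry name for the BLIP-2 feature extractor; see the notebook for the exact call.
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip2_feature_extractor", model_type="pretrain", is_eval=True, device=device
)
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
text = txt_processors["eval"]("the merlion fountain in singapore")
sample = {"image": image, "text_input": [text]}

features_multimodal = model.extract_features(sample)           # joint image-text features
features_image = model.extract_features(sample, mode="image")  # image-only features
features_text = model.extract_features(sample, mode="text")    # text-only features
```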
### Image-Text Matching Example
BLIP-2 can compute an image-text matching score using the same interface as BLIP. Check out this [notebook](https://github.com/salesforce/LAVIS/blob/3446bac20c5646d35ae383ebe6d13cec4f8b00cb/examples/blip2_image_text_matching.ipynb) for an example.

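A minimal sketch of that interface (the `blip2_image_text_matching` name and the `match_head` argument follow LAVIS's BLIP matching example and are assumptions here; `raw_image` and `device` are again reused from above):

```python
import torch
from lavis.models import load_model_and_preprocess

model, vis_processors, text_processors = load_model_and_preprocess(
    name="blip2_image_text_matching", model_type="pretrain", is_eval=True, device=device
)
img = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
txt = text_processors["eval"]("the merlion fountain in singapore")

# ITM head: probability that the image and the caption match.
itm_logits = model({"image": img, "text_input": txt}, match_head="itm")
itm_score = torch.nn.functional.softmax(itm_logits, dim=1)[:, 1].item()

# ITC head: contrastive similarity score between the image and the caption.
itc_score = model({"image": img, "text_input": txt}, match_head="itc")
print("ITM match probability: {:.3f}; ITC score: {:.3f}".format(itm_score, float(itc_score)))
```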
### Benchmark Evaluation
Follow [Dataset Download](https://opensource.salesforce.com/LAVIS//latest/getting_started.html#auto-downloading-and-loading-datasets) to prepare common vision-language datasets.

Run [these scripts](https://github.com/salesforce/LAVIS/tree/main/run_scripts/blip2/eval) for evaluating pretrained and finetuned models.

### Training
Stage-1 Pre-training (from scratch):
```bash run_scripts/blip2/train/pretrain_stage1.sh```

Stage-2 Pre-training:
```bash run_scripts/blip2/train/pretrain_stage2.sh```

Finetune for image captioning:
```bash run_scripts/blip2/train/train_caption_coco.sh```

The [config files](https://github.com/salesforce/LAVIS/tree/main/lavis/projects/blip2/train) can be modified for customized training.

### Citing BLIP-2
<pre>
@inproceedings{li2023blip2,
      title={{BLIP-2:} Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
      author={Junnan Li and Dongxu Li and Silvio Savarese and Steven Hoi},
      year={2023},
      booktitle={ICML},
}</pre>

### 🤗 Hugging Face integration

BLIP-2 is integrated into the Hugging Face 🤗 [Transformers](https://github.com/huggingface/transformers) library, which lets you leverage int8 quantization thanks to [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). This roughly halves the amount of memory required to load the model, without performance degradation.

Documentation can be found [here](https://huggingface.co/docs/transformers/main/model_doc/blip-2).

Usage in half precision (float16) is as follows:

```python
from PIL import Image
import requests
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
)
model.to(device)
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)

generated_ids = model.generate(**inputs)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)
```

To leverage int8 quantization, you can run the model as follows:

```python
import torch
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map="auto")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

question = "how many dogs are in the picture?"
inputs = processor(raw_image, question, return_tensors="pt").to("cuda", torch.float16)

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
```

All models can be found on the [hub](https://huggingface.co/models?other=blip-2).
projects/blip2/blip2_illustration.png
ADDED
projects/blip2/model_card.pdf
ADDED
Binary file (125 kB).