Sachinthaka Abeywardana
committed on
Commit
•
8f21e3d
1
Parent(s):
903ad7c
updated with metadata
Browse files
README.md
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
# Vit2-DistilGPT2
|
2 |
-
This model takes in an image and outputs a caption. It was trained on the COCO dataset, and the full training script can be found in [this Kaggle kernel](https://www.kaggle.com/sachin/visionencoderdecoder-model-training).
|
3 |
-
|
4 |
-
## Usage
|
5 |
-
```python
|
6 |
-
from PIL import Image
|
7 |
-
from transformers import AutoModel, GPT2Tokenizer, ViTFeatureExtractor
|
8 |
-
|
9 |
-
model = AutoModel.from_pretrained("sachin/vit2distilgpt2")
|
10 |
-
vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
|
11 |
-
|
12 |
-
# make sure GPT2 appends EOS in begin and end
|
13 |
-
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
14 |
-
outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
15 |
-
return outputs
|
16 |
-
|
17 |
-
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
|
18 |
-
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
|
19 |
-
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
|
20 |
-
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token
|
21 |
-
|
22 |
-
image = vit_feature_extractor(Image.open(image_path).convert("RGB"), return_tensors="pt").pixel_values
|
23 |
-
encoder_outputs = model.generate(image.unsqueeze(0))
|
24 |
-
generated_sentences = gpt2_tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
|
25 |
-
```
|
26 |
-
Note that the output sentence may contain repetitions, so a post-processing step may be required.
|
27 |
-
|
28 |
-
## Bias Warning
|
29 |
-
This model may be biased due to the dataset, the limited amount of training, and the model itself. The following is an example of gender bias.
|
30 |
-
![](https://i.imgur.com/9zVN022.png)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Readme.md
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
# Vit2-DistilGPT2
|
2 |
-
This model takes in an image and outputs a caption. It was trained on the COCO dataset, and the full training script can be found in [this Kaggle kernel](https://www.kaggle.com/sachin/visionencoderdecoder-model-training).
|
3 |
-
|
4 |
-
## Usage
|
5 |
-
```python
|
6 |
-
from PIL import Image
|
7 |
-
from transformers import AutoModel, GPT2Tokenizer, ViTFeatureExtractor
|
8 |
-
|
9 |
-
model = AutoModel.from_pretrained("sachin/vit2distilgpt2")
|
10 |
-
vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
|
11 |
-
|
12 |
-
# make sure GPT2 appends EOS in begin and end
|
13 |
-
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
14 |
-
outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
|
15 |
-
return outputs
|
16 |
-
|
17 |
-
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
|
18 |
-
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
|
19 |
-
# set pad_token_id to unk_token_id -> be careful here as unk_token_id == eos_token_id == bos_token_id
|
20 |
-
gpt2_tokenizer.pad_token = gpt2_tokenizer.unk_token
|
21 |
-
|
22 |
-
image = vit_feature_extractor(Image.open(image_path).convert("RGB"), return_tensors="pt").pixel_values
|
23 |
-
encoder_outputs = model.generate(image.unsqueeze(0))
|
24 |
-
generated_sentences = gpt2_tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
|
25 |
-
```
|
26 |
-
Note that the output sentence may contain repetitions, so a post-processing step may be required.
|
27 |
-
|
28 |
-
## Bias Warning
|
29 |
-
This model may be biased due to the dataset, the limited amount of training, and the model itself. The following is an example of gender bias.
|
30 |
-
![](https://i.imgur.com/9zVN022.png)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|