ydshieh committed
Commit 830aeea
1 Parent(s): 9b4bdf2

remove unused files

app.py DELETED
@@ -1,46 +0,0 @@
- import streamlit as st
- from PIL import Image
- import numpy as np
-
-
- # Designing the interface
- st.title("WIT: Image -> Caption App")
- # For newline
- st.write('\n')
-
- image = Image.open('images/image.png')
- show = st.image(image, use_column_width=True)
-
- from model import *
-
- st.sidebar.title("Upload Image")
-
- # Disabling warning
- st.set_option('deprecation.showfileUploaderEncoding', False)
- # Choose your own image
- uploaded_file = st.sidebar.file_uploader(" ", type=['png', 'jpg', 'jpeg'])
-
- if uploaded_file is not None:
-
-     image = Image.open(uploaded_file)
-     show.image(image, 'Uploaded Image', use_column_width=True)
-
-
- # For newline
- st.sidebar.write('\n')
-
- if st.sidebar.button("Click here to get image caption"):
-
-     if uploaded_file is None:
-
-         st.sidebar.write("Please upload an Image to Classify")
-
-     else:
-
-         with st.spinner('Generating image caption ...'):
-
-             caption = 'dummy caption'
-             st.success(f'caption: {caption}')
-
-         st.sidebar.header("ViT-GPT2 predicts:")
-         st.sidebar.write(f"caption: {caption}", '\n')
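Note: the deleted app.py never invoked the model; the displayed caption was the hard-coded 'dummy caption'. A minimal sketch of how the stub could call predict() from the model.py removed further below (hypothetical wiring, not part of this commit):

    from PIL import Image
    import streamlit as st

    from model import predict  # from the deleted model.py; returns (caption, token_ids)

    uploaded_file = st.sidebar.file_uploader(" ", type=['png', 'jpg', 'jpeg'])
    if uploaded_file is not None and st.sidebar.button("Click here to get image caption"):
        with st.spinner('Generating image caption ...'):
            caption, _ = predict(Image.open(uploaded_file))  # real caption instead of the stub
        st.success(f'caption: {caption}')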
checkpoints/ckpt_2/config.json DELETED
@@ -1,163 +0,0 @@
- {
-   "architectures": [
-     "ViTGPT2LMForConditionalGeneration"
-   ],
-   "bos_token_id": 0,
-   "decoder_start_token_id": 0,
-   "eos_token_id": 2,
-   "gpt2_config": {
-     "_name_or_path": "",
-     "activation_function": "gelu_new",
-     "add_cross_attention": true,
-     "architectures": null,
-     "attn_pdrop": 0.1,
-     "bad_words_ids": null,
-     "bos_token_id": 0,
-     "chunk_size_feed_forward": 0,
-     "decoder_start_token_id": null,
-     "diversity_penalty": 0.0,
-     "do_sample": false,
-     "early_stopping": false,
-     "embd_pdrop": 0.1,
-     "encoder_no_repeat_ngram_size": 0,
-     "eos_token_id": 2,
-     "finetuning_task": null,
-     "forced_bos_token_id": null,
-     "forced_eos_token_id": null,
-     "gradient_checkpointing": false,
-     "id2label": {
-       "0": "LABEL_0",
-       "1": "LABEL_1"
-     },
-     "initializer_range": 0.02,
-     "is_decoder": false,
-     "is_encoder_decoder": false,
-     "label2id": {
-       "LABEL_0": 0,
-       "LABEL_1": 1
-     },
-     "layer_norm_epsilon": 1e-05,
-     "length_penalty": 1.0,
-     "max_length": 20,
-     "min_length": 0,
-     "model_type": "gpt2",
-     "n_ctx": 1024,
-     "n_embd": 768,
-     "n_head": 12,
-     "n_inner": null,
-     "n_layer": 12,
-     "n_positions": 1024,
-     "no_repeat_ngram_size": 0,
-     "num_beam_groups": 1,
-     "num_beams": 1,
-     "num_return_sequences": 1,
-     "output_attentions": false,
-     "output_hidden_states": false,
-     "output_scores": false,
-     "pad_token_id": 1,
-     "prefix": null,
-     "problem_type": null,
-     "pruned_heads": {},
-     "remove_invalid_values": false,
-     "repetition_penalty": 1.0,
-     "resid_pdrop": 0.1,
-     "return_dict": true,
-     "return_dict_in_generate": false,
-     "scale_attn_weights": true,
-     "sep_token_id": null,
-     "summary_activation": null,
-     "summary_first_dropout": 0.1,
-     "summary_proj_to_labels": true,
-     "summary_type": "cls_index",
-     "summary_use_proj": true,
-     "task_specific_params": null,
-     "temperature": 1.0,
-     "tie_encoder_decoder": false,
-     "tie_word_embeddings": true,
-     "tokenizer_class": null,
-     "top_k": 50,
-     "top_p": 1.0,
-     "torch_dtype": null,
-     "torchscript": false,
-     "transformers_version": "4.9.0.dev0",
-     "use_bfloat16": false,
-     "use_cache": true,
-     "vocab_size": 50000
-   },
-   "is_encoder_decoder": true,
-   "model_type": "vit-gpt2",
-   "pad_token_id": 1,
-   "transformers_version": null,
-   "vit_config": {
-     "_name_or_path": "",
-     "add_cross_attention": false,
-     "architectures": [
-       "ViTModel"
-     ],
-     "attention_probs_dropout_prob": 0.0,
-     "bad_words_ids": null,
-     "bos_token_id": null,
-     "chunk_size_feed_forward": 0,
-     "decoder_start_token_id": null,
-     "diversity_penalty": 0.0,
-     "do_sample": false,
-     "early_stopping": false,
-     "encoder_no_repeat_ngram_size": 0,
-     "eos_token_id": null,
-     "finetuning_task": null,
-     "forced_bos_token_id": null,
-     "forced_eos_token_id": null,
-     "hidden_act": "gelu",
-     "hidden_dropout_prob": 0.0,
-     "hidden_size": 768,
-     "id2label": {
-       "0": "LABEL_0",
-       "1": "LABEL_1"
-     },
-     "image_size": 224,
-     "initializer_range": 0.02,
-     "intermediate_size": 3072,
-     "is_decoder": false,
-     "is_encoder_decoder": false,
-     "label2id": {
-       "LABEL_0": 0,
-       "LABEL_1": 1
-     },
-     "layer_norm_eps": 1e-12,
-     "length_penalty": 1.0,
-     "max_length": 20,
-     "min_length": 0,
-     "model_type": "vit",
-     "no_repeat_ngram_size": 0,
-     "num_attention_heads": 12,
-     "num_beam_groups": 1,
-     "num_beams": 1,
-     "num_channels": 3,
-     "num_hidden_layers": 12,
-     "num_return_sequences": 1,
-     "output_attentions": false,
-     "output_hidden_states": false,
-     "output_scores": false,
-     "pad_token_id": null,
-     "patch_size": 16,
-     "prefix": null,
-     "problem_type": null,
-     "pruned_heads": {},
-     "remove_invalid_values": false,
-     "repetition_penalty": 1.0,
-     "return_dict": true,
-     "return_dict_in_generate": false,
-     "sep_token_id": null,
-     "task_specific_params": null,
-     "temperature": 1.0,
-     "tie_encoder_decoder": false,
-     "tie_word_embeddings": true,
-     "tokenizer_class": null,
-     "top_k": 50,
-     "top_p": 1.0,
-     "torch_dtype": null,
-     "torchscript": false,
-     "transformers_version": "4.9.0.dev0",
-     "use_bfloat16": false
-   }
- }
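This config nests a complete ViT encoder config ("vit_config") and a GPT-2 decoder config ("gpt2_config") in a single file. A quick way to inspect those nested sections with nothing but the standard library (a sketch; the path is the one this commit deletes, so it only works on a revision that still contains the file):

    import json

    # Load the composite vit-gpt2 config and print the key encoder/decoder settings.
    with open("checkpoints/ckpt_2/config.json") as f:
        cfg = json.load(f)

    print(cfg["model_type"])                 # vit-gpt2
    print(cfg["vit_config"]["image_size"])   # 224 (patch_size 16 -> 14x14 = 196 patches)
    print(cfg["gpt2_config"]["vocab_size"])  # 50000 (the French GPT-2 tokenizer used in model.py)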
checkpoints/ckpt_2/flax_model.msgpack DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f91dda0691002393e4712170cb03a3e609b8d51b45de841db2f560cfb9549f05
- size 1012706583
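This file is a Git LFS pointer rather than the weights themselves: oid is the SHA-256 of the actual ~1 GB msgpack blob. A sketch for verifying a downloaded copy against the pointer (the local file name is an assumption):

    import hashlib

    # Compare a local flax_model.msgpack against the sha256 recorded in the LFS pointer.
    expected = "f91dda0691002393e4712170cb03a3e609b8d51b45de841db2f560cfb9549f05"

    h = hashlib.sha256()
    with open("flax_model.msgpack", "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
            h.update(chunk)

    print("OK" if h.hexdigest() == expected else "checksum mismatch")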
checkpoints/ckpt_3/config.json DELETED
@@ -1,163 +0,0 @@
- {
-   "architectures": [
-     "ViTGPT2LMForConditionalGeneration"
-   ],
-   "bos_token_id": 0,
-   "decoder_start_token_id": 0,
-   "eos_token_id": 2,
-   "gpt2_config": {
-     "_name_or_path": "",
-     "activation_function": "gelu_new",
-     "add_cross_attention": true,
-     "architectures": null,
-     "attn_pdrop": 0.1,
-     "bad_words_ids": null,
-     "bos_token_id": 0,
-     "chunk_size_feed_forward": 0,
-     "decoder_start_token_id": null,
-     "diversity_penalty": 0.0,
-     "do_sample": false,
-     "early_stopping": false,
-     "embd_pdrop": 0.1,
-     "encoder_no_repeat_ngram_size": 0,
-     "eos_token_id": 2,
-     "finetuning_task": null,
-     "forced_bos_token_id": null,
-     "forced_eos_token_id": null,
-     "gradient_checkpointing": false,
-     "id2label": {
-       "0": "LABEL_0",
-       "1": "LABEL_1"
-     },
-     "initializer_range": 0.02,
-     "is_decoder": false,
-     "is_encoder_decoder": false,
-     "label2id": {
-       "LABEL_0": 0,
-       "LABEL_1": 1
-     },
-     "layer_norm_epsilon": 1e-05,
-     "length_penalty": 1.0,
-     "max_length": 20,
-     "min_length": 0,
-     "model_type": "gpt2",
-     "n_ctx": 1024,
-     "n_embd": 768,
-     "n_head": 12,
-     "n_inner": null,
-     "n_layer": 12,
-     "n_positions": 1024,
-     "no_repeat_ngram_size": 0,
-     "num_beam_groups": 1,
-     "num_beams": 1,
-     "num_return_sequences": 1,
-     "output_attentions": false,
-     "output_hidden_states": false,
-     "output_scores": false,
-     "pad_token_id": 1,
-     "prefix": null,
-     "problem_type": null,
-     "pruned_heads": {},
-     "remove_invalid_values": false,
-     "repetition_penalty": 1.0,
-     "resid_pdrop": 0.1,
-     "return_dict": true,
-     "return_dict_in_generate": false,
-     "scale_attn_weights": true,
-     "sep_token_id": null,
-     "summary_activation": null,
-     "summary_first_dropout": 0.1,
-     "summary_proj_to_labels": true,
-     "summary_type": "cls_index",
-     "summary_use_proj": true,
-     "task_specific_params": null,
-     "temperature": 1.0,
-     "tie_encoder_decoder": false,
-     "tie_word_embeddings": true,
-     "tokenizer_class": null,
-     "top_k": 50,
-     "top_p": 1.0,
-     "torch_dtype": null,
-     "torchscript": false,
-     "transformers_version": "4.9.0.dev0",
-     "use_bfloat16": false,
-     "use_cache": true,
-     "vocab_size": 50000
-   },
-   "is_encoder_decoder": true,
-   "model_type": "vit-gpt2",
-   "pad_token_id": 1,
-   "transformers_version": null,
-   "vit_config": {
-     "_name_or_path": "",
-     "add_cross_attention": false,
-     "architectures": [
-       "ViTModel"
-     ],
-     "attention_probs_dropout_prob": 0.0,
-     "bad_words_ids": null,
-     "bos_token_id": null,
-     "chunk_size_feed_forward": 0,
-     "decoder_start_token_id": null,
-     "diversity_penalty": 0.0,
-     "do_sample": false,
-     "early_stopping": false,
-     "encoder_no_repeat_ngram_size": 0,
-     "eos_token_id": null,
-     "finetuning_task": null,
-     "forced_bos_token_id": null,
-     "forced_eos_token_id": null,
-     "hidden_act": "gelu",
-     "hidden_dropout_prob": 0.0,
-     "hidden_size": 768,
-     "id2label": {
-       "0": "LABEL_0",
-       "1": "LABEL_1"
-     },
-     "image_size": 224,
-     "initializer_range": 0.02,
-     "intermediate_size": 3072,
-     "is_decoder": false,
-     "is_encoder_decoder": false,
-     "label2id": {
-       "LABEL_0": 0,
-       "LABEL_1": 1
-     },
-     "layer_norm_eps": 1e-12,
-     "length_penalty": 1.0,
-     "max_length": 20,
-     "min_length": 0,
-     "model_type": "vit",
-     "no_repeat_ngram_size": 0,
-     "num_attention_heads": 12,
-     "num_beam_groups": 1,
-     "num_beams": 1,
-     "num_channels": 3,
-     "num_hidden_layers": 12,
-     "num_return_sequences": 1,
-     "output_attentions": false,
-     "output_hidden_states": false,
-     "output_scores": false,
-     "pad_token_id": null,
-     "patch_size": 16,
-     "prefix": null,
-     "problem_type": null,
-     "pruned_heads": {},
-     "remove_invalid_values": false,
-     "repetition_penalty": 1.0,
-     "return_dict": true,
-     "return_dict_in_generate": false,
-     "sep_token_id": null,
-     "task_specific_params": null,
-     "temperature": 1.0,
-     "tie_encoder_decoder": false,
-     "tie_word_embeddings": true,
-     "tokenizer_class": null,
-     "top_k": 50,
-     "top_p": 1.0,
-     "torch_dtype": null,
-     "torchscript": false,
-     "transformers_version": "4.9.0.dev0",
-     "use_bfloat16": false
-   }
- }
checkpoints/ckpt_3/flax_model.msgpack DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1a5152a207ba30a963b32775047d0276b9cec79d9c7343eb01db4e8dab14bac2
- size 1012706583
model.py DELETED
@@ -1,89 +0,0 @@
- import sys, os
-
- current_path = os.path.dirname(os.path.abspath(__file__))
- sys.path.append(current_path)
-
- # jax
- import jax
-
- # Main model - ViTGPT2LM
- from vit_gpt2.modeling_flax_vit_gpt2_lm import FlaxViTGPT2LMForConditionalGeneration
-
- # ViT - as encoder
- from transformers import ViTFeatureExtractor
- from PIL import Image
- import requests
- import numpy as np
-
- # GPT2 / GPT2LM - as decoder
- from transformers import ViTFeatureExtractor, GPT2Tokenizer
-
- model_name_or_path = './outputs/ckpt_2/'
- flax_vit_gpt2_lm = FlaxViTGPT2LMForConditionalGeneration.from_pretrained(model_name_or_path)
-
- vit_model_name = 'google/vit-base-patch16-224-in21k'
- feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_name)
-
- gpt2_model_name = 'asi/gpt-fr-cased-small'
- tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)
-
- max_length = 64
- num_beams = 16
- gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-
-
- @jax.jit
- def predict_fn(pixel_values):
-
-     return flax_vit_gpt2_lm.generate(pixel_values, **gen_kwargs)
-
- def predict(image, pxs=None):
-
-     # batch dim is added automatically
-     encoder_inputs = feature_extractor(images=image, return_tensors="jax")
-     pixel_values = encoder_inputs.pixel_values
-
-     if pxs is not None:
-         pixel_values = pxs
-
-     # generation
-     generation = predict_fn(pixel_values)
-
-     token_ids = np.array(generation.sequences)[0]
-     caption = tokenizer.decode(token_ids)
-
-     return caption, token_ids
-
-
- if __name__ == '__main__':
-
-     from datetime import datetime
-
-     idx = 11
-     url = f'./wit_data_dir/train/images/{idx}.jpg'
-     image = Image.open(url)
-
-     encoder_inputs = feature_extractor(images=image, return_tensors="np")
-     pv1 = encoder_inputs.pixel_values
-     pv2 = np.load(f'./wit_data_dir/train/numpy/{idx}.npy')
-     print(np.sum(np.abs(pv1 - pv2)))
-
-     s = datetime.now()
-     caption, token_ids = predict(image, pxs=pv2)
-     e = datetime.now()
-     e = (e - s).total_seconds()
-     print(e)
-
-     print(f'token_ids: {token_ids}')
-     print(f'caption: {caption}')
-
-     for _ in range(1):
-         s = datetime.now()
-         caption, token_ids = predict(image, pxs=None)
-         e = datetime.now()
-         e = (e - s).total_seconds()
-         print(e)
-         print('-' * 20)
-
-     print(f'token_ids: {token_ids}')
-     print(f'caption: {caption}')
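The timing code in __main__ above exists because the first call into a @jax.jit-compiled function pays a one-time tracing and compilation cost; later calls with the same input shapes reuse the cached executable. A self-contained illustration of that effect (generic example, independent of the deleted model):

    import time

    import jax
    import jax.numpy as jnp

    @jax.jit
    def f(x):
        return (x * x).sum()

    x = jnp.ones((1000, 1000))
    for i in range(3):
        s = time.perf_counter()
        f(x).block_until_ready()  # block explicitly: JAX dispatches asynchronously
        print(f"call {i}: {time.perf_counter() - s:.4f}s")  # call 0 includes compilation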