ydshieh committed
Commit 6f0178d
Parent: 5dfe197

update UI and samples

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app.py +31 -26
  2. model.py +14 -4
  3. samples/COCO_val2014_000000581632.jpg +0 -0
  4. samples/COCO_val2014_000000581683.jpg +0 -0
  5. samples/COCO_val2014_000000581702.jpg +0 -0
  6. samples/COCO_val2014_000000581717.jpg +0 -0
  7. samples/COCO_val2014_000000581726.jpg +0 -0
  8. samples/COCO_val2014_000000581736.jpg +0 -0
  9. samples/COCO_val2014_000000581781.jpg +0 -0
  10. samples/COCO_val2014_000000581827.jpg +0 -0
  11. samples/COCO_val2014_000000581829.jpg +0 -0
  12. samples/COCO_val2014_000000581863.jpg +0 -0
  13. samples/COCO_val2014_000000581899.jpg +0 -0
  14. samples/COCO_val2017_000000006771.jpg +0 -0
  15. samples/COCO_val2017_000000021903.jpg +0 -0
  16. samples/COCO_val2017_000000030213.jpg +0 -0
  17. samples/COCO_val2017_000000039956.jpg +0 -0
  18. samples/COCO_val2017_000000045472.jpg +0 -0
  19. samples/COCO_val2017_000000053505.jpg +0 -0
  20. samples/COCO_val2017_000000057597.jpg +0 -0
  21. samples/COCO_val2017_000000059386.jpg +0 -0
  22. samples/COCO_val2017_000000067406.jpg +0 -0
  23. samples/COCO_val2017_000000069795.jpg +0 -0
  24. samples/COCO_val2017_000000084431.jpg +0 -0
  25. samples/COCO_val2017_000000088432.jpg +0 -0
  26. samples/COCO_val2017_000000100238.jpg +0 -0
  27. samples/COCO_val2017_000000104619.jpg +0 -0
  28. samples/COCO_val2017_000000104803.jpg +0 -0
  29. samples/COCO_val2017_000000124442.jpg +0 -0
  30. samples/COCO_val2017_000000125936.jpg +0 -0
  31. samples/COCO_val2017_000000132703.jpg +0 -0
  32. samples/COCO_val2017_000000146155.jpg +0 -0
  33. samples/COCO_val2017_000000149770.jpg +0 -0
  34. samples/COCO_val2017_000000152120.jpg +0 -0
  35. samples/COCO_val2017_000000154431.jpg +0 -0
  36. samples/COCO_val2017_000000161609.jpg +0 -0
  37. samples/COCO_val2017_000000163258.jpg +0 -0
  38. samples/COCO_val2017_000000168593.jpg +0 -0
  39. samples/COCO_val2017_000000170116.jpg +0 -0
  40. samples/COCO_val2017_000000172330.jpg +0 -0
  41. samples/COCO_val2017_000000173371.jpg +0 -0
  42. samples/COCO_val2017_000000175535.jpg +0 -0
  43. samples/COCO_val2017_000000178469.jpg +0 -0
  44. samples/COCO_val2017_000000180188.jpg +0 -0
  45. samples/COCO_val2017_000000180296.jpg +0 -0
  46. samples/COCO_val2017_000000181969.jpg +0 -0
  47. samples/COCO_val2017_000000190676.jpg +0 -0
  48. samples/COCO_val2017_000000199055.jpg +0 -0
  49. samples/COCO_val2017_000000204186.jpg +0 -0
  50. samples/COCO_val2017_000000213547.jpg +0 -0
app.py CHANGED
@@ -1,4 +1,5 @@
 import streamlit as st
+import requests
 
 
 # Designing the interface
@@ -7,56 +8,60 @@ st.write("[Yih-Dar SHIEH](https://huggingface.co/ydshieh)")
 
 st.sidebar.markdown(
     """
-    An image captioning model [ViT-GPT2](https://huggingface.co/flax-community/vit-gpt2) by combining the ViT model with the GPT2 model.
-    [Part of the [Huggingface JAX/Flax event](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/).]\n
-    The encoder (ViT) and decoder (GPT2) are combined using Hugging Face transformers' `FlaxVisionEncoderDecoderModel`.
+    An image captioning model by combining ViT model with GPT2 model.
+    The encoder (ViT) and decoder (GPT2) are combined using Hugging Face transformers' [Vision-To-Text Encoder-Decoder
+    framework](https://huggingface.co/transformers/master/model_doc/visionencoderdecoder.html).
     The pretrained weights of both models are loaded, with a set of randomly initialized cross-attention weights.
     The model is trained on the COCO 2017 dataset for about 6900 steps (batch_size=256).
+    [Follow-up work of [Huggingface JAX/Flax event](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/).]\n
    """
 )
 
-#image = Image.open('samples/val_000000039769.jpg')
-#show = st.image(image, use_column_width=True)
-#show.image(image, 'Preloaded Image', use_column_width=True)
-
-
 with st.spinner('Loading and compiling ViT-GPT2 model ...'):
-
     from model import *
-    # st.sidebar.write(f'Vit-GPT2 model loaded :)')
 
-st.sidebar.title("Select a sample image")
 
-sample_name = st.sidebar.selectbox(
-    "Please choose an image",
-    sample_fns
+st.sidebar.title("Select a sample image")
+image_id = st.sidebar.selectbox(
+    "Please choose a sample image",
+    sample_image_ids
 )
 
-sample_name = f"COCO_val2014_{sample_name.replace('.jpg', '').zfill(12)}.jpg"
+random_image_id = None
+if st.sidebar.button("Random COCO 2017 (val) images"):
+    random_image_id = get_random_image_id()
+
+if random_image_id is not None:
+    image_id = random_image_id
+
+st.write(image_id)
+
+sample_name = f"COCO_val2017_{str(image_id).zfill(12)}.jpg"
 sample_path = os.path.join(sample_dir, sample_name)
 
-image = Image.open(sample_path)
-show = st.image(image, width=480)
-show.image(image, '\n\nSelected Image', width=480)
+if os.path.isfile(sample_path):
+    image = Image.open(sample_path)
+else:
+    url = f"http://images.cocodataset.org/val2017/{str(image_id).zfill(12)}.jpg"
+    image = Image.open(requests.get(url, stream=True).raw)
+
+resized = image.resize(size=(384, 384))
+show = st.image(resized, width=384)
+show.image(resized, '\n\nSelected Image', width=384)
+resized.close()
 
 # For newline
 st.sidebar.write('\n')
 
-
 with st.spinner('Generating image caption ...'):
 
     caption = predict(image)
 
     caption_en = caption
-    st.header(f'**Prediction (in English)**: {caption_en}')
-
-    # caption_en = translator.translate(caption, src='fr', dest='en').text
-    # st.header(f'**Prediction (in French) **{caption}')
-    # st.header(f'**English Translation**: {caption_en}')
-
+    st.header(f'Predicted caption:\n\n')
+    st.subheader(caption_en)
 
 st.sidebar.header("ViT-GPT2 predicts:")
 st.sidebar.write(f"**English**: {caption}")
 
-
 image.close()
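The new sidebar text above describes how the model is put together: a ViT encoder and a GPT2 decoder joined through transformers' vision encoder-decoder framework, with only the cross-attention weights starting from random initialization. A minimal sketch of that assembly step, assuming the stock `google/vit-base-patch16-224-in21k` and `gpt2` checkpoints (the deployed space instead downloads its own fine-tuned weights via `hf_hub_download` in model.py):

```python
# Minimal sketch: pairing a ViT encoder with a GPT2 decoder, as the sidebar
# text describes. The checkpoint names are assumptions for illustration;
# the deployed space loads its own fine-tuned weights instead.
from transformers import FlaxVisionEncoderDecoderModel

model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k",  # assumed encoder checkpoint
    "gpt2",                               # assumed decoder checkpoint
)
# Only the encoder and decoder weights are pretrained; the cross-attention
# layers connecting them start randomly initialized and must be trained.
```

Fine-tuning such a combined model on image-caption pairs is the training setup the sidebar summarizes (COCO 2017, roughly 6900 steps at batch size 256).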
model.py CHANGED
@@ -1,12 +1,13 @@
+import json
 import os, shutil
+import random
+
+
 from PIL import Image
 import jax
 from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
 from huggingface_hub import hf_hub_download
 
-from googletrans import Translator
-translator = Translator()
-
 
 # create target model directory
 model_dir = './models/'
@@ -65,4 +66,13 @@ _compile()
 
 
 sample_dir = './samples/'
-sample_fns = tuple([f"{int(f.replace('COCO_val2014_', '').replace('.jpg', ''))}.jpg" for f in os.listdir(sample_dir) if f.startswith('COCO_val2014_')])
+sample_image_ids = tuple([int(f.replace('COCO_val2017_', '').replace('.jpg', '')) for f in os.listdir(sample_dir) if f.startswith('COCO_val2017_')])
+
+with open(os.path.join(sample_dir, "coco-val2017-img-ids.json"), "r", encoding="UTF-8") as fp:
+    coco_2017_val_image_ids = json.load(fp)
+
+
+def get_random_image_id():
+
+    image_id = random.sample(coco_2017_val_image_ids, k=1)[0]
+    return image_id
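The `predict` function that app.py calls lives in an unchanged part of model.py, so it does not appear in this diff. A plausible minimal sketch, assuming the standard feature-extractor / `generate` / decode pipeline from the transformers docs; the checkpoint names and generation parameters below are guesses, not the space's actual values:

```python
# Hypothetical sketch of model.py's predict() helper; the real implementation
# is in the unchanged portion of the file. Checkpoints and generation
# parameters below are assumptions.
from PIL import Image
from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

model = FlaxVisionEncoderDecoderModel.from_pretrained("flax-community/vit-gpt2")  # assumed
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")  # assumed
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumed

def predict(image: Image.Image) -> str:
    # ViT expects 3-channel RGB pixel values; some COCO images are grayscale.
    pixel_values = feature_extractor(images=image.convert("RGB"), return_tensors="np").pixel_values
    # Flax generate() returns an output object whose `.sequences` holds token ids.
    output_ids = model.generate(pixel_values, max_length=16, num_beams=4).sequences
    return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
```

Note that app.py passes the original PIL image to `predict`; the 384×384 copy created in the new code is only for display, and the feature extractor performs its own model-side resizing.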
samples/COCO_val2014_000000581632.jpg DELETED
Binary file (212 kB)
 
samples/COCO_val2014_000000581683.jpg DELETED
Binary file (231 kB)
 
samples/COCO_val2014_000000581702.jpg DELETED
Binary file (214 kB)
 
samples/COCO_val2014_000000581717.jpg DELETED
Binary file (155 kB)
 
samples/COCO_val2014_000000581726.jpg DELETED
Binary file (238 kB)
 
samples/COCO_val2014_000000581736.jpg DELETED
Binary file (180 kB)
 
samples/COCO_val2014_000000581781.jpg DELETED
Binary file (246 kB)
 
samples/COCO_val2014_000000581827.jpg DELETED
Binary file (297 kB)
 
samples/COCO_val2014_000000581829.jpg DELETED
Binary file (226 kB)
 
samples/COCO_val2014_000000581863.jpg DELETED
Binary file (196 kB)
 
samples/COCO_val2014_000000581899.jpg DELETED
Binary file (218 kB)
 
samples/COCO_val2017_000000006771.jpg ADDED
samples/COCO_val2017_000000021903.jpg ADDED
samples/COCO_val2017_000000030213.jpg ADDED
samples/COCO_val2017_000000039956.jpg ADDED
samples/COCO_val2017_000000045472.jpg ADDED
samples/COCO_val2017_000000053505.jpg ADDED
samples/COCO_val2017_000000057597.jpg ADDED
samples/COCO_val2017_000000059386.jpg ADDED
samples/COCO_val2017_000000067406.jpg ADDED
samples/COCO_val2017_000000069795.jpg ADDED
samples/COCO_val2017_000000084431.jpg ADDED
samples/COCO_val2017_000000088432.jpg ADDED
samples/COCO_val2017_000000100238.jpg ADDED
samples/COCO_val2017_000000104619.jpg ADDED
samples/COCO_val2017_000000104803.jpg ADDED
samples/COCO_val2017_000000124442.jpg ADDED
samples/COCO_val2017_000000125936.jpg ADDED
samples/COCO_val2017_000000132703.jpg ADDED
samples/COCO_val2017_000000146155.jpg ADDED
samples/COCO_val2017_000000149770.jpg ADDED
samples/COCO_val2017_000000152120.jpg ADDED
samples/COCO_val2017_000000154431.jpg ADDED
samples/COCO_val2017_000000161609.jpg ADDED
samples/COCO_val2017_000000163258.jpg ADDED
samples/COCO_val2017_000000168593.jpg ADDED
samples/COCO_val2017_000000170116.jpg ADDED
samples/COCO_val2017_000000172330.jpg ADDED
samples/COCO_val2017_000000173371.jpg ADDED
samples/COCO_val2017_000000175535.jpg ADDED
samples/COCO_val2017_000000178469.jpg ADDED
samples/COCO_val2017_000000180188.jpg ADDED
samples/COCO_val2017_000000180296.jpg ADDED
samples/COCO_val2017_000000181969.jpg ADDED
samples/COCO_val2017_000000190676.jpg ADDED
samples/COCO_val2017_000000199055.jpg ADDED
samples/COCO_val2017_000000204186.jpg ADDED
samples/COCO_val2017_000000213547.jpg ADDED