diff --git a/.gitattributes b/.gitattributes index b07961f7ff7e7298b2d45abc3b32b8372552a9bc..9c6bef50dee034cdfec1565fece4393ac070d06b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -36,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text pages/4M-21/video_1.mp4 filter=lfs diff=lfs merge=lfs -text pages/Depth[[:space:]]Anything/video_1.mp4 filter=lfs diff=lfs merge=lfs -text pages/RT-DETR/video_1.mp4 filter=lfs diff=lfs merge=lfs -text +pages/KOSMOS-2/video_1.mp4 filter=lfs diff=lfs merge=lfs -text +pages/Aria/image_0.png filter=lfs diff=lfs merge=lfs -text +pages/Aria/image_2.png filter=lfs diff=lfs merge=lfs -text diff --git a/Home.py b/Home.py index 84631a7d0a5c143739f7bb6d19abaeeadd0db006..b11b23e17ed9ef3bbfcd1141cddee0274fb16204 100644 --- a/Home.py +++ b/Home.py @@ -2,17 +2,65 @@ import streamlit as st st.set_page_config(page_title="Home",page_icon="🏠") -# st.image("image_of_a_Turkish_lofi_girl_sitting_at_a_desk_writing_summaries_of_scientific_publications_ghibli_anime_like_hd.jpeg", use_column_width=True) - -st.write("# Vision Papers 📚") - - -st.markdown( +translations = { +'en': { + 'title': 'Vision Papers 📚', + 'introduction': """ - This app contains all of my paper posts on X for your convenience! - - Start browsing papers on the left tab. 🔖 - + This app contains all of my paper posts on [X](https://x.com/mervenoyann) for your convenience! + Start browsing papers on the left tab 🔖 This app is made by an amazing human being called [Loïck Bourdois](https://x.com/BdsLoick) so please show this some love and like the Space if you think it's useful 💖 + """, + 'extra_content': + """ + Beyond this pack of summaries of papers, if you'd like to dig deeper into the subject of vision language models, you can check out some of the other resources I've been working on 👩‍🔬: + * This [collection](https://hf.co/collections/merve/vision-language-models-papers-66264531f7152ac0ec80ceca) of papers (listing models which are not summarized in this Space but which may be of interest) 📄 + * Tasks that can be handled by these models, such as [Document Question Answering](https://huggingface.co/tasks/document-question-answering), [Image-Text-to-Text](https://huggingface.co/tasks/image-text-to-text) or [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) + * Blog posts on [ConvNets](https://merveenoyan.medium.com/complete-guide-on-deep-learning-architectures-chapter-1-on-convnets-1d3e8086978d), [Autoencoders](https://merveenoyan.medium.com/complete-guide-on-deep-learning-architectures-part-2-autoencoders-293351bbe027), [explaining vision language models](https://huggingface.co/blog/vlms), [finetuning it with TRL](https://huggingface.co/blog/dpo_vlm) and the announcement of certain models such as [PaliGemma](https://huggingface.co/blog/paligemma) ✍️ + * A GitHub repository containing various notebooks taking full advantage of these models (optimizations, quantization, distillation, finetuning, etc.): [smol-vision](https://github.com/merveenoyan/smol-vision) ⭐ + * A 12-minute summary YouTube video 🎥 + """ +}, +'fr': { + 'title': 'Papiers de vision 📚', + 'introduction': + """ + Cette appli contient tous les résumés de papiers que j'ai publiés sur [X](https://x.com/mervenoyann) afin de vous faciliter la tâche ! 
+ Vous avez juste à parcourir l'onglet de gauche 🔖 + Cette application a été créée par un être humain extraordinaire, [Loïck Bourdois](https://x.com/BdsLoick), alors s'il vous plaît montrez-lui un peu d'amour et aimez le Space si vous le pensez utile 💖 + """, + 'extra_content': + """ + Au delà de ce pack de résumés de papiers, si vous souhaitez creuser le sujet des modèles de langage/vision, vous pouvez consulter d'autres ressources sur lesquelles j'ai travaillées 👩‍🔬: + * Cette [collection](https://hf.co/collections/merve/vision-language-models-papers-66264531f7152ac0ec80ceca) de papiers sur le sujet (listant des modèles non résumés dans ce Space qui pourraient tout de même vous intéresser) 📄 + * Les tâches pouvant être traitées par ces modèles comme le [Document Question Answering](https://huggingface.co/tasks/document-question-answering), l'[Image-Text-to-Text](https://huggingface.co/tasks/image-text-to-text) ou encore le [Visual Question Answering](https://huggingface.co/tasks/visual-question-answering) + * Des articles de blog portant sur [les ConvNets](https://merveenoyan.medium.com/complete-guide-on-deep-learning-architectures-chapter-1-on-convnets-1d3e8086978d), [les auto-encodeurs](https://merveenoyan.medium.com/complete-guide-on-deep-learning-architectures-part-2-autoencoders-293351bbe027), [l'explication des modèles de langage/vision](https://huggingface.co/blog/vlms), leur [finetuning avec TRL](https://huggingface.co/blog/dpo_vlm) ou encore l'annonce de modèles comme [PaliGemma](https://huggingface.co/blog/paligemma) ✍️ + * Un répertoire GitHub contenant divers notebooks pour tirer le meilleur parti de ces modèles (optimisations, quantization, distillation, finetuning, etc.) : [smol-vision](https://github.com/merveenoyan/smol-vision) ⭐ + * Une vidéo YouTube de synthèse en 12 minutes 🎥 """ -) \ No newline at end of file + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]['title']) + +# Main app content +# st.image("Turkish_girl_from_back_sitting_at_a_desk_writing_view_on_an_old_castle_in_a_window_wehre_a_cat_lying_ghibli_anime_like_hd.jpg", use_column_width=True) +st.markdown(""" """) +st.write(translations[lang]['introduction']) +st.markdown(""" """) +st.write(translations[lang]['extra_content']) +st.video("https://www.youtube.com/watch?v=IoGaGfU1CIg", format="video/mp4") \ No newline at end of file diff --git a/README.md b/README.md index 1e63885b44afe7a61d0f3a09ca90a4af2c7e9066..2abecb1bec03bcabbe23e147a33dc00c034c8b13 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ colorTo: blue sdk: streamlit sdk_version: 1.37.0 app_file: Home.py -pinned: false +pinned: true --- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/Turkish_girl_from_back_sitting_at_a_desk_writing_view_on_an_old_castle_in_a_window_wehre_a_cat_lying_ghibli_anime_like_hd.jpg b/Turkish_girl_from_back_sitting_at_a_desk_writing_view_on_an_old_castle_in_a_window_wehre_a_cat_lying_ghibli_anime_like_hd.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fdae1985b01122b4c136af69817ef63846bfd6de Binary files /dev/null and 
b/Turkish_girl_from_back_sitting_at_a_desk_writing_view_on_an_old_castle_in_a_window_wehre_a_cat_lying_ghibli_anime_like_hd.jpg differ diff --git a/pages/0_KOSMOS-2.py b/pages/0_KOSMOS-2.py new file mode 100644 index 0000000000000000000000000000000000000000..09a7a66041817ec7e67820033376e643ee99e7df --- /dev/null +++ b/pages/0_KOSMOS-2.py @@ -0,0 +1,214 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'KOSMOS-2', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1720126908384366649) (November 2, 2023) + """, + 'tweet_1': + """ + New 🤗 Transformers release includes a very powerful Multimodel Large Language Model (MLLM) by @Microsoft called KOSMOS-2! 🤩 + The highlight of KOSMOS-2 is grounding, the model is *incredibly* accurate! 🌎 + Play with the demo [here](https://huggingface.co/spaces/ydshieh/Kosmos-2) by [@ydshieh](https://x.com/ydshieh). + But how does this model work? Let's take a look! 👀🧶 + """, + 'tweet_2': + """ + Grounding helps machine learning models relate to real-world examples. Including grounding makes models more performant by means of accuracy and robustness during inference. It also helps reduce the so-called "hallucinations" in language models. + """, + 'tweet_3': + """ + In KOSMOS-2, model is grounded to perform following tasks and is evaluated on 👇 + - multimodal grounding & phrase grounding, e.g. localizing the object through natural language query + - multimodal referring, e.g. describing object characteristics & location + - perception-language tasks + - language understanding and generation + """, + 'tweet_4': + """ + The dataset used for grounding, called GRiT is also available on [Hugging Face Hub](https://huggingface.co/datasets/zzliang/GRIT). + Thanks to 🤗 Transformers integration, you can use KOSMOS-2 with few lines of code 🤩 + See below! 👇 + """, + 'ressources': + """ + Ressources: + [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) + by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei (2023) + [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/kosmos-2) + """ + }, +'fr': { + 'title': 'KOSMOS-2', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1720126908384366649) (en anglais) (2 novembre 2023) + """, + 'tweet_1': + """ + La nouvelle version de 🤗 Transformers inclut un très puissant Multimodel Large Language Model (MLLM) de @Microsoft appelé KOSMOS-2 ! 🤩 + Le point fort de KOSMOS-2 est l'ancrage, le modèle est *incroyablement* précis ! 🌎 + Jouez avec la démo [ici](https://huggingface.co/spaces/ydshieh/Kosmos-2) de [@ydshieh](https://x.com/ydshieh). + Mais comment fonctionne t'il ? Jetons un coup d'œil ! 👀🧶 + """, + 'tweet_2': + """ + L'ancrage permet aux modèles d'apprentissage automatique d'être liés à des exemples du monde réel. L'inclusion de l'ancrage rend les modèles plus performants en termes de précision et de robustesse lors de l'inférence. Cela permet également de réduire les « hallucinations » dans les modèles de langage. 
""", + 'tweet_3': + """ + Dans KOSMOS-2, le modèle est ancré pour effectuer les tâches suivantes et est évalué sur 👇 + - l'ancrage multimodal et l'ancrage de phrases, par exemple la localisation de l'objet par le biais d'une requête en langage naturel + - la référence multimodale, par exemple la description des caractéristiques et de l'emplacement de l'objet + - tâches de perception-langage + - compréhension et génération du langage + """, + 'tweet_4': + """ + Le jeu de données utilisé pour l'ancrage, appelé GRiT, est également disponible sur le [Hub d'Hugging Face](https://huggingface.co/datasets/zzliang/GRIT). + Grâce à l'intégration dans 🤗 Transformers, vous pouvez utiliser KOSMOS-2 avec quelques lignes de code 🤩. + Voir ci-dessous ! 👇 + """, + 'ressources': + """ + Ressources : + [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) + de Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei (2023) + [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/kosmos-2) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/KOSMOS-2/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/KOSMOS-2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + if lang == "en": + st.code(""" + from transformers import AutoProcessor, AutoModelForVision2Seq + + model = AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to("cuda") + processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") + + image_input = Image.open(user_image_path) + # prepend different preprompts optionally to describe images + brief_preprompt = "An image of" + detailed_preprompt = "Describe this image in detail:" + + + inputs = processor(text=text_input, images=image_input, return_tensors="pt").to("cuda") + + generated_ids = model.generate( + pixel_values=inputs["pixel_values"], + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image_embeds=None, + image_embeds_position_mask=inputs["image_embeds_position_mask"], + use_cache=True, + max_new_tokens=128, + ) + + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + + processed_text, entities = processor.post_process_generation(generated_text) + + # check out the Space for inference with bbox drawing + """) + else: + st.code(""" + from transformers import AutoProcessor, AutoModelForVision2Seq + + model = 
AutoModelForVision2Seq.from_pretrained("microsoft/kosmos-2-patch14-224").to("cuda") + processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") + + image_input = Image.open(user_image_path) + # ajouter différents préprompts facultatifs pour décrire les images + brief_preprompt = "An image of" + detailed_preprompt = "Describe this image in detail:" + + + inputs = processor(text=text_input, images=image_input, return_tensors="pt").to("cuda") + + generated_ids = model.generate( + pixel_values=inputs["pixel_values"], + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image_embeds=None, + image_embeds_position_mask=inputs["image_embeds_position_mask"], + use_cache=True, + max_new_tokens=128, + ) + + generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + + processed_text, entities = processor.post_process_generation(generated_text) + + # consultez le Space pour l'inférence avec le tracé des bbox + """) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Home") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Home") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("MobileSAM") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("MobileSAM") diff --git a/pages/10_LLaVA-NeXT.py b/pages/10_LLaVA-NeXT.py new file mode 100644 index 0000000000000000000000000000000000000000..329d4d5a85ff9697b4fdcc5a24b3014e28dfeeed --- /dev/null +++ b/pages/10_LLaVA-NeXT.py @@ -0,0 +1,196 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'LLaVA-NeXT', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1770832875551682563) (March 21, 2024) + """, + 'tweet_1': + """ + LLaVA-NeXT is recently merged to 🤗 Transformers and it outperforms many of the proprietary models like Gemini on various benchmarks!🤩 + For those who don't know LLaVA, it's a language model that can take image 💬 + Let's take a look, demo and more in this. + """, + 'tweet_2': + """ + LLaVA is essentially a vision-language model that consists of ViT-based CLIP encoder, a MLP projection and Vicuna as decoder ✨ + LLaVA 1.5 was released with Vicuna, but LLaVA NeXT (1.6) is released with four different LLMs: + - Nous-Hermes-Yi-34B + - Mistral-7B + - Vicuna 7B & 13B + """, + 'tweet_3': + """ + Thanks to 🤗 Transformers integration, it is very easy to use LLaVA NeXT, not only standalone but also with 4-bit loading and Flash Attention 2 💜 + See below on standalone usage 👇 + """, + 'tweet_4': + """ + To fit large models and make it even faster and memory efficient, you can enable Flash Attention 2 and load model into 4-bit using bitsandbytes ⚡️ transformers makes it very easy to do this! See below 👇 + """, + 'tweet_5': + """ + If you want to try the code right away, here's the [notebook](https://t.co/NvoxvY9z1u). 
+ Lastly, you can directly play with the LLaVA-NeXT based on Mistral-7B through the demo [here](https://t.co/JTDlqMUwEh) 🤗 + """, + 'ressources': + """ + Ressources: + [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) + by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee (2024) + [GitHub](https://github.com/haotian-liu/LLaVA/tree/main) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/llava_next) + """ + }, +'fr': { + 'title': 'LLaVA-NeXT', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1770832875551682563) (en anglais) (21 mars 2024) + """, + 'tweet_1': + """ + LLaVA-NeXT a récemment été intégré à 🤗 Transformers et surpasse de nombreux modèles propriétaires comme Gemini sur différents benchmarks !🤩 + Pour ceux qui ne connaissent pas LLaVA, il s'agit d'un modèle de langage qui peut prendre des images 💬. + """, + 'tweet_2': + """ + LLaVA est essentiellement un modèle langage/vision qui se compose d'un encodeur CLIP basé sur ViT, d'une projection MLP et de Vicuna en tant que décodeur ✨. + LLaVA 1.5 a été publié avec Vicuna, mais LLaVA NeXT (1.6) est publié avec quatre LLM différents : + - Nous-Hermes-Yi-34B + - Mistral-7B + - Vicuna 7B & 13B + """, + 'tweet_3': + """ + Grâce à l'intégration dans 🤗 Transformers, il est très facile d'utiliser LLaVA NeXT, non seulement en mode autonome mais aussi avec un chargement 4 bits et Flash Attention 2 💜. + Voir ci-dessous pour l'utilisation autonome 👇 + """, + 'tweet_4': + """ + Pour entraîner des grands modèles et les rendre encore plus rapides et efficaces en termes de mémoire, vous pouvez activer Flash Attention 2 et charger le modèle en 4 bits à l'aide de bitsandbytes ⚡️ ! Voir ci-dessous 👇 """, + 'tweet_5': + """ + Si vous voulez essayer le code tout de suite, voici le [notebook](https://t.co/NvoxvY9z1u). 
+ Enfin, vous pouvez directement jouer avec le LLaVA-NeXT reposant sur Mistral-7B grâce à cette [démo](https://t.co/JTDlqMUwEh) 🤗 + """, + 'ressources': + """ + Ressources : + [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) + de Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee (2024) + [GitHub](https://github.com/haotian-liu/LLaVA/tree/main) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/llava_next) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + st.code(""" +from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration +import torch + +processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") + +model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True) +model.to("cuda:0") + +inputs = processor(prompt, image, return_tensors="pt").to("cuda:0") + +output = model.generate(**inputs, max_new_tokens=100) +print(processor.decode(output[0], skip_special_tokens=True)) + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/LLaVA-NeXT/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + st.code(""" +import torch +from transformers import LlavaNextForConditionalGeneration, BitsAndBytesConfig + +# 4bit +quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16) +model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto") + +# Flash Attention 2 +model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True, use_flash_attention_2=True).to(0) + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/LLaVA-NeXT/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("UDOP")
+ else: + if st.button('Papier précédent', use_container_width=True): + switch_page("UDOP") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Painter") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Painter") \ No newline at end of file diff --git a/pages/11_Painter.py b/pages/11_Painter.py new file mode 100644 index 0000000000000000000000000000000000000000..356ec8594ade73b78032492b922f2fe51ce87a17 --- /dev/null +++ b/pages/11_Painter.py @@ -0,0 +1,129 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Painter', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1771542172946354643) (March 23, 2024) + """, + 'tweet_1': + """ + I read the Painter [paper](https://t.co/r3aHp29mjf) by [BAAIBeijing](https://x.com/BAAIBeijing) to convert the weights to 🤗 Transformers, and I absolutely loved the approach they took so I wanted to take time to unfold it here! + """, + 'tweet_2': + """ + So essentially this model takes inspiration from in-context learning, as in, in LLMs you give an example input output and give the actual input that you want model to complete (one-shot learning) they adapted this to images, thus the name "images speak in images". +
+ This model doesn't have any multimodal parts; it just has an image encoder and a decoder head (a linear layer, a conv layer, and another linear layer), so it's a single-modality model. +

+ The magic sauce is the data: they input the task in the form of image and associated transformation and another image they want the transformation to take place and take smooth L2 loss over the predictions and ground truth this is like T5 of image models 😀 + """, + 'tweet_3': + """ + What is so cool about it is that it can actually adapt to out of domain tasks, meaning, in below chart, it was trained on the tasks above the dashed line, and the authors found out it generalized to the tasks below the line, image tasks are well generalized 🤯 + """, + 'ressources': + """ + Ressources: + [Images Speak in Images: A Generalist Painter for In-Context Visual Learning](https://arxiv.org/abs/2212.02499) + by Xinlong Wang, Wen Wang, Yue Cao, Chunhua Shen, Tiejun Huang (2022) + [GitHub](https://github.com/baaivision/Painter) + """ + }, +'fr': { + 'title': 'Painter', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1771542172946354643) (en anglais) (23 mars 2024) + """, + 'tweet_1': + """ + Pour pouvoir convertir les poids du Painter de [BAAIBeijing](https://x.com/BAAIBeijing) dans 🤗 Transformers, j'ai lu le [papier](https://t.co/r3aHp29mjf) et ai absolument adoré l'approche qu'ils ont adoptée. Donc j'ai voulu prendre le temps de l'exposer ici ! + """, + 'tweet_2': + """ + Ce modèle s'inspire donc essentiellement de l'apprentissage en contexte, c'est-à-dire que dans les LLM, vous donnez un exemple d'entrée et de sortie et vous donnez l'entrée réelle que vous voulez que le modèle complète (apprentissage 1-shot). Ils ont adapté cette méthode aux images, d'où le nom "images speak in images" (les images parlent en images). +
+ Ce modèle ne comporte aucune partie multimodale, mais seulement un encodeur d'images et une tête de décodage (couche linéaire, couche de convolution et autre couche linéaire), de sorte qu'il s'agit d'une modalité unique. +
+ La sauce magique, ce sont les données : ils introduisent la tâche sous la forme d'une image et d'une transformation associée, ainsi qu'une autre image qu'ils veulent transformer, et prennent une perte L2 lisse sur les prédictions et la vérité de terrain. C'est le T5 des modèles d'image 😀. + """, + 'tweet_3': + """ + Ce qui est particulièrement intéressant, c'est qu'il peut s'adapter à des tâches hors domaine, c'est-à-dire que dans le graphique ci-dessous, il a été entraîné sur les tâches situées au-dessus de la ligne pointillée, et les auteurs ont découvert qu'il s'adaptait aux tâches situées en dessous de la ligne. Les tâches liées à l'image sont bien généralisées 🤯 """, + 'ressources': + """ + Ressources : + [Images Speak in Images: A Generalist Painter for In-Context Visual Learning](https://arxiv.org/abs/2212.02499) + de Xinlong Wang, Wen Wang, Yue Cao, Chunhua Shen, Tiejun Huang (2022) + [GitHub](https://github.com/baaivision/Painter) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Painter/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Painter/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Painter/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("LLaVA-NeXT") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("LLaVA-NeXT") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("SegGPT") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("SegGPT") \ No newline at end of file diff --git a/pages/12_SegGPT.py b/pages/12_SegGPT.py new file mode 100644 index 0000000000000000000000000000000000000000..9daac1e4234be17516b8acfcc85442607e8094b6 --- /dev/null +++ b/pages/12_SegGPT.py @@ -0,0 +1,184 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'SegGPT', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1773056450790666568) (March 27, 2024) + """, + 'tweet_1': + """ + SegGPT is a vision generalist on image segmentation, quite like GPT for computer vision ✨ + It comes with the last release of 🤗 Transformers 🎁 + Technical details, demo and how-to's under this! 
+ """, + 'tweet_2': + """ + SegGPT is an extension of Painter where you speak to images with images: the model takes in an image prompt, a transformed version of the image prompt, and the actual image you want the same transform applied to, and it is expected to output the transformed image. +

+ SegGPT consists of a vanilla ViT with a decoder on top (linear, conv, linear). The model is trained on diverse segmentation examples, where they provide example image-mask pairs, the actual input to be segmented, and the decoder head learns to reconstruct the mask output. 👇🏻 + """, + 'tweet_3': + """ + This generalizes pretty well! + The authors do not claim state-of-the-art results as the model is mainly used zero-shot and few-shot inference. They also do prompt tuning, where they freeze the parameters of the model and only optimize the image tensor (the input context). + """, + 'tweet_4': + """ + Thanks to 🤗 Transformers you can use this model easily! See [here](https://t.co/U5pVpBhkfK). + """, + 'tweet_5': + """ + I have built an app for you to try it out. I combined SegGPT with Depth Anything Model, so you don't have to upload image mask prompts in your prompt pair 🤗 + Try it [here](https://t.co/uJIwqJeYUy). Also check out the [collection](https://t.co/HvfjWkAEzP). + """, + 'ressources': + """ + Ressources: + [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) + by Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang (2023) + [GitHub](https://github.com/baaivision/Painter) + """ + }, +'fr': { + 'title': 'SegGPT', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1773056450790666568) (en anglais) (27 mars 2024) + """, + 'tweet_1': + """ + SegGPT est un modèle généraliste de vision pour la segmentation d'images; c'est un peu comme le GPT pour la vision par ordinateur ✨. + Il est intégré à la dernière version de 🤗 Transformers 🎁 + Détails techniques, démonstrations et manières de l'utiliser ci-dessous ! + """, + 'tweet_2': + """ + SegGPT est une extension de Painter où vous parlez aux images avec des images : le modèle reçoit une image, une version transformée de l'image, l'image réelle que vous voulez voir avec la même transformation, et est censé produire l'image transformée. +
+ SegGPT consiste en un ViT standard surmonté d'un décodeur (couche linéaire, convolution, couche linéaire). Le modèle est entraîné sur divers exemples de segmentation, où les auteurs fournissent des paires image-masque, l'entrée réelle à segmenter, et la tête du décodeur apprend à reconstruire la sortie du masque. 👇🏻 """, + 'tweet_3': + """ + Cela se généralise assez bien ! + Les auteurs ne prétendent pas obtenir des résultats de pointe, car le modèle est principalement utilisé pour l'inférence zéro-shot et few-shot. Ils effectuent également un prompt tuning, où ils gèlent les paramètres du modèle et optimisent uniquement le tenseur d'image (le contexte d'entrée). + """, + 'tweet_4': + """ + Grâce à 🤗 Transformers, vous pouvez utiliser ce modèle facilement ! Voir [ici] (https://t.co/U5pVpBhkfK). + """, + 'tweet_5': + """ + J'ai créé une application pour que vous puissiez l'essayer. J'ai combiné SegGPT avec Depth Anything Model, de sorte que vous n'avez pas besoin de télécharger des masques d'images dans votre paire de prompt 🤗. + Essayez-le [ici](https://t.co/uJIwqJeYUy). Consultez également la [collection](https://t.co/HvfjWkAEzP). + """, + 'ressources': + """ + Ressources : + [SegGPT: Segmenting Everything In Context](https://arxiv.org/abs/2304.03284) + de Xinlong Wang, Xiaosong Zhang, Yue Cao, Wen Wang, Chunhua Shen, Tiejun Huang (2023) + [GitHub](https://github.com/baaivision/Painter) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SegGPT/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SegGPT/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SegGPT/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SegGPT/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + st.code(""" + import torch + from transformers import SegGptImageProcessor, SegGptForImageSegmentation + + image_processor = SegGptImageProcessor.from_pretrained("BAAI/seggpt-vit-large") + model = SegGptForImageSegmentation.from_pretrained("BAAI/seggpt-vit-large") + + inputs = image_processor( + images=image_input, + prompt_images=image_prompt, + prompt_masks=mask_prompt, + num_labels=10, + return_tensors="pt") + + with torch.no_grad(): + outputs = model(**inputs) + + target_sizes = [image_input.size[::-1]] + mask = image_processor.post_process_semantic_segmentation(outputs, target_sizes, num_labels=10)[0] + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SegGPT/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + 
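+# A minimal, illustrative sketch (not from the SegGPT repo): the "prompt tuning" idea described above —
+# freeze the model's weights and optimize only the input context tensor. `model`, `compute_mask_loss`,
+# `batch` and the tensor shape below are hypothetical placeholders.
+with st.expander ("Code (prompt-tuning sketch)"):
+    st.code("""
+    import torch
+
+    for p in model.parameters():
+        p.requires_grad_(False)  # keep the pretrained weights frozen
+
+    # learnable image-prompt (input context) tensor
+    context = torch.randn(1, 3, 448, 448, requires_grad=True)
+    optimizer = torch.optim.AdamW([context], lr=1e-2)
+
+    for step in range(100):
+        loss = compute_mask_loss(model, context, batch)  # hypothetical loss helper
+        optimizer.zero_grad()
+        loss.backward()  # gradients flow only into `context`
+        optimizer.step()
+    """)
+st.markdown(""" """)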
+st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Painter") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Painter") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Grounding DINO") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Grounding DINO") diff --git a/pages/13_Grounding_DINO.py b/pages/13_Grounding_DINO.py new file mode 100644 index 0000000000000000000000000000000000000000..100e13ab95d840881b3c469d6eab6a5b81a9f61b --- /dev/null +++ b/pages/13_Grounding_DINO.py @@ -0,0 +1,229 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Grounding DINO', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1780558859221733563) (April 17, 2024) + """, + 'tweet_1': + """ + We have merged Grounding DINO in 🤗 Transformers 🦖 + It's an amazing zero-shot object detection model, here's why 🧶 + """, + 'tweet_2': + """ + There are two zero-shot object detection models as of now, one is OWL series by Google Brain and the other one is Grounding DINO 🦕 + Grounding DINO pays immense attention to detail ⬇️ + Also [try yourself](https://t.co/UI0CMxphE7). + """, + 'tweet_3': + """ + I have also built another [application](https://t.co/4EHpOwEpm0) for GroundingSAM, combining GroundingDINO and Segment Anything by Meta for cutting edge zero-shot image segmentation. + """, + 'tweet_4': + """ + Grounding DINO is essentially a model with connected image encoder (Swin transformer), text encoder (BERT) and on top of both, a decoder that outputs bounding boxes 🦖 + This is quite similar to OWL series, which uses a ViT-based detector on CLIP. + """, + 'tweet_5': + """ + The authors train Swin-L/T with BERT contrastively (not like CLIP where they match the images to texts by means of similarity) where they try to approximate the region outputs to language phrases at the head outputs 🤩 + """, + 'tweet_6': + """ + The authors also form the text features on the sub-sentence level. + This means it extracts certain noun phrases from training data to remove the influence between words while removing fine-grained information. + """, + 'tweet_7': + """ + Thanks to all of this, Grounding DINO has great performance on various REC/object detection benchmarks 🏆📈 + """, + 'tweet_8': + """ + Thanks to 🤗 Transformers, you can use Grounding DINO very easily! + You can also check out [NielsRogge](https://twitter.com/NielsRogge)'s [notebook here](https://t.co/8ADGFdVkta). 
+ """, + 'ressources': + """ + Ressources: + [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) + by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang (2023) + [GitHub](https://github.com/IDEA-Research/GroundingDINO) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/grounding-dino) + """ + }, +'fr': { + 'title': 'Grounding DINO', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1780558859221733563) (en anglais) (17 avril 2024) + """, + 'tweet_1': + """ + Nous avons ajouté Grounding DINO à 🤗 Transformers 🦖 + C'est un modèle incroyable de détection d'objets en zéro-shot, voici pourquoi 🧶 + """, + 'tweet_2': + """ + Il existe actuellement deux modèles de détection d'objets en zero-shot, l'un est la série OWL de Google Brain et l'autre est Grounding DINO 🦕. + Grounding DINO accorde une grande attention aux détails ⬇️ + [Essayez le vous-même](https://t.co/UI0CMxphE7). + """, + 'tweet_3': + """ + J'ai également créé une autre [application](https://t.co/4EHpOwEpm0) pour GroundingSAM, combinant GroundingDINO et Segment Anything de Meta pour une segmentation d'image en zéro-shot. + """, + 'tweet_4': + """ + Grounding DINO est essentiellement un modèle avec un encodeur d'image (Swin transformer), un encodeur de texte (BERT) et, au-dessus des deux, un décodeur qui produit des boîtes de délimitation 🦖. + Cela ressemble beaucoup à OWL, qui utilise un détecteur ViT basé sur CLIP. + """, + 'tweet_5': + """ + Les auteurs entraînent Swin-L/T avec BERT de manière contrastive (pas comme CLIP où ils font correspondre les images aux textes au moyen de la similarité) où ils essaient de faire une approximation entre la région sortie et la phrases sortie 🤩 + """, + 'tweet_6': + """ + Les auteurs forment les caractéristiques textuelles au niveau de la sous-phrase. + Cela signifie qu'ils extraient certaines phrases des données d'apprentissage afin de supprimer l'influence entre les mots tout en supprimant les informations plus fines. """, + 'tweet_7': + """ + Grâce à tout cela, Grounding DINO a d'excellentes performances sur divers benchmarks de détection de REC/objets 🏆📈. + """, + 'tweet_8': + """ + Grâce à 🤗 Transformers, vous pouvez utiliser Grounding DINO très facilement ! + Vous pouvez également consulter le [ notebook](https://t.co/8ADGFdVkta) de [NielsRogge](https://twitter.com/NielsRogge). 
+ """, + 'ressources': + """ + Ressources : + [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) + de Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Qing Jiang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang (2023) + [GitHub](https://github.com/IDEA-Research/GroundingDINO) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/grounding-dino) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_1.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_2.jpeg", use_column_width=True) +st.image("pages/Grounding_DINO/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_6.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_7.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_7"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_8.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_8"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Grounding_DINO/image_9.jpeg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + st.code(""" + import torch + from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + + model_id = "IDEA-Research/grounding-dino-tiny" + + processor = AutoProcessor.from_pretrained(model_id) + model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) + + inputs = processor(images=image, text=text, return_tensors="pt").to(device) + with torch.no_grad(): + outputs = model(**inputs) + + results = processor.post_process_grounded_object_detection( + outputs, + inputs.input_ids, + box_threshold=0.4, + text_threshold=0.3, + target_sizes=[image.size[::-1]]) + """) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("SegGPT") + else: + if 
st.button('Papier précédent', use_container_width=True): + switch_page("SegGPT") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("DocOwl 1.5") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("DocOwl 1.5") diff --git a/pages/14_DocOwl_1.5.py b/pages/14_DocOwl_1.5.py new file mode 100644 index 0000000000000000000000000000000000000000..559390497f41de52cb8b25ff8837e550de41f3c3 --- /dev/null +++ b/pages/14_DocOwl_1.5.py @@ -0,0 +1,217 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'DocOwl 1.5', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1782421257591357824) (April 22, 2024) + """, + 'tweet_1': + """ + DocOwl 1.5 is the state-of-the-art document understanding model by Alibaba with Apache 2.0 license 😍📝 + Time to dive in and learn more 🧶 + """, + 'tweet_2': + """ + This model consists of a ViT-based visual encoder part that takes in crops of image and the original image itself. + Then the outputs of the encoder goes through a convolution based model, after that the outputs are merged with text and then fed to LLM. + """, + 'tweet_3': + """ + Initially, the authors only train the convolution based part (called H-Reducer) and vision encoder while keeping LLM frozen. + Then for fine-tuning (on image captioning, VQA etc), they freeze vision encoder and train H-Reducer and LLM. + """, + 'tweet_4': + """ + Also they use simple linear projection on text and documents. You can see below how they model the text prompts and outputs 🤓 + """, + 'tweet_5': + """ + They train the model various downstream tasks including: + - document understanding (DUE benchmark and more) + - table parsing (TURL, PubTabNet) + - chart parsing (PlotQA and more) + - image parsing (OCR-CC) + - text localization (DocVQA and more) + """, + 'tweet_6': + """ + They contribute a new model called DocOwl 1.5-Chat by: + 1. creating a new document-chat dataset with questions from document VQA datasets + 2. feeding them to ChatGPT to get long answers + 3. fine-tune the base model with it (which IMO works very well!) + """, + 'tweet_7': + """ + Resulting generalist model and the chat model are pretty much state-of-the-art 😍 + Below you can see how it compares to fine-tuned models. + """, + 'tweet_8': + """ + All the models and the datasets (also some eval datasets on above tasks!) are in this [organization](https://t.co/sJdTw1jWTR). + The [Space](https://t.co/57E9DbNZXf). 
+ """, + 'ressources': + """ + Ressources: + [mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895) + by Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Chen Li, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou (2024) + [GitHub](https://github.com/X-PLUG/mPLUG-DocOwl) + """ + }, +'fr': { + 'title': 'DocOwl 1.5', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1782421257591357824) (en anglais) (22 avril 2024) + """, + 'tweet_1': + """ + DocOwl 1.5 est le modèle de compréhension de documents d'Alibaba sous licence Apache 2.0 😍📝 + Il est temps de découvrir ce modèle 🧶 + """, + 'tweet_2': + """ + Ce modèle se compose d'un encodeur visuel basé sur un ViT qui prend en compte les crops de l'image et l'image originale elle-même. + Les sorties de l'encodeur passent ensuite par un modèle convolutif, après quoi les sorties sont fusionnées avec le texte, puis transmises au LLM. + """, + 'tweet_3': + """ + Au départ, les auteurs n'entraînent que la partie basée sur la convolution (appelée H-Reducer) et l'encodeur de vision tout en gardant le LLM gelé. + Ensuite, pour le finetuning (légendage d'image, VQA, etc.), ils gèlent l'encodeur de vision et entraînent le H-Reducer et le LLM. + """, + 'tweet_4': + """ + Ils utilisent également une simple projection linéaire sur le texte et les documents. Vous pouvez voir ci-dessous comment ils modélisent les prompts et les sorties textuelles 🤓 + """, + 'tweet_5': + """ + Ils entraînent le modèle pour diverses tâches en aval, notamment + - la compréhension de documents (DUE benchmark et autres) + - analyse de tableaux (TURL, PubTabNet) + - analyse de graphiques (PlotQA et autres) + - analyse d'images (OCR-CC) + - localisation de textes (DocVQA et autres) + """, + 'tweet_6': + """ + Ils contribuent à un nouveau modèle appelé DocOwl 1.5-Chat en : + 1. créant un nouveau jeu de données document-chat avec des questions provenant de jeux de données VQA + 2. en les envoyant à ChatGPT pour obtenir des réponses longues + 3. en finetunant le modèle de base à l'aide de ce dernier (qui fonctionne très bien selon moi) + """, + 'tweet_7': + """ + Le modèle généraliste qui en résulte et le modèle de chat sont pratiquement à l'état de l'art 😍 + Ci-dessous, vous pouvez voir comment ils se comparent aux modèles finetunés. + """, + 'tweet_8': + """ + Tous les modèles et jeux de données (y compris certains jeux de données d'évaluation sur les tâches susmentionnées !) se trouvent dans cette [organisation](https://t.co/sJdTw1jWTR). Le [Space](https://t.co/57E9DbNZXf). 
+ """, + 'ressources': + """ + Ressources : + [mPLUG-DocOwl 1.5: Unified Structure Learning for OCR-free Document Understanding](https://arxiv.org/abs/2403.12895) + de Anwen Hu, Haiyang Xu, Jiabo Ye, Ming Yan, Liang Zhang, Bo Zhang, Chen Li, Ji Zhang, Qin Jin, Fei Huang, Jingren Zhou (2024) + [GitHub](https://github.com/X-PLUG/mPLUG-DocOwl) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_5.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_6.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_7"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_7.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_8"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DocOwl_1.5/image_8.jpeg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Grounding DINO") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Grounding DINO") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("MiniGemini") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("MiniGemini") \ No newline at end of file diff --git a/pages/15_MiniGemini.py b/pages/15_MiniGemini.py new file mode 100644 index 0000000000000000000000000000000000000000..5e9763697cd922ad5b6e6a5e610795368cc44571 --- /dev/null +++ b/pages/15_MiniGemini.py @@ -0,0 +1,165 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'MiniGemini', + 
'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1783864388249694520) (April 26, 2024) + """, + 'tweet_1': + """ + MiniGemini is the coolest VLM, let's explain 🧶 + """, + 'tweet_2': + """ + MiniGemini is a vision language model that understands both image and text and also generates text and an image that goes best with the context! 🤯 + """, + 'tweet_3': + """ + This model has two image encoders (one CNN and one ViT) in parallel to capture the details in the images. + I saw the same design in DocOwl 1.5 then it has a decoder to output text and also a prompt to be sent to SDXL for image generation (which works very well!) + """, + 'tweet_4': + """ + They adopt CLIP's ViT for low resolution visual embedding encoder and a CNN-based one for high resolution image encoding (precisely a pre-trained ConvNeXt). + """, + 'tweet_5': + """ + Thanks to the second encoder it can grasp details in images, which also comes in handy for e.g. document tasks (but see below the examples are mindblowing IMO). + """, + 'tweet_6': + """ + According to their reporting the model performs very well across many benchmarks compared to LLaVA 1.5 and Gemini Pro. + """, + 'ressources': + """ + Resources: + [Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models](https://huggingface.co/papers/2403.18814) + by Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, Jiaya Jia (2024) + [GitHub](https://github.com/dvlab-research/MGM) + [Model Repository](https://huggingface.co/YanweiLi/MGM-13B-HD) + """ + }, +'fr': { + 'title': 'MiniGemini', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1783864388249694520) (26 avril 2024) + """, + 'tweet_1': + """ + MiniGemini est le VLM le plus cool, voici pourquoi 🧶 + """, + 'tweet_2': + """ + MiniGemini est un modèle de langage/vision qui comprend à la fois l'image et le texte et qui génère également le texte et l'image qui s'accordent le mieux avec le contexte ! 🤯 """, + 'tweet_3': + """ + Ce modèle possède deux encodeurs d'images (un ConvNet et un ViT) en parallèle pour capturer les détails dans les images. + J'ai vu la même conception dans DocOwl 1.5 où il y a un décodeur pour produire du texte et aussi un prompt à envoyer au SDXL pour la génération d'images (qui fonctionne très bien !). """, + 'tweet_4': + """ + Les auteurs adoptent le ViT de CLIP pour les enchâssements visuels de basse résolution et un ConvNet pour les images en haute résolution (précisément un ConvNeXt pré-entraîné). + """, + 'tweet_5': + """ + Grâce au second encodeur, il peut saisir des détails dans les images, ce qui s'avère également utile pour les tâches documentaires (voir ci-dessous les exemples époustouflants). """, + 'tweet_6': + """ + D'après leur rapport, le modèle est très performant dans de nombreux benchmarks par rapport à LLaVA 1.5 et Gemini Pro. 
+ """, + 'ressources': + """ + Resources : + [Mini-Gemini: Mining the Potential of Multi-modality Vision Language Models](https://huggingface.co/papers/2403.18814) + de Yanwei Li, Yuechen Zhang, Chengyao Wang, Zhisheng Zhong, Yixin Chen, Ruihang Chu, Shaoteng Liu, Jiaya Jia (2024) + [GitHub](https://github.com/dvlab-research/MGM) + [Modèle](https://huggingface.co/YanweiLi/MGM-13B-HD) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/MiniGemini/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/MiniGemini/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/MiniGemini/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/MiniGemini/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/MiniGemini/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/MiniGemini/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("DocOwl 1.5") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("DocOwl 1.5") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("CuMo") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("PLLaVA") \ No newline at end of file diff --git a/pages/16_PLLaVA.py b/pages/16_PLLaVA.py new file mode 100644 index 0000000000000000000000000000000000000000..35170de74133708ee50fb7ee91ebfc129800cb83 --- /dev/null +++ b/pages/16_PLLaVA.py @@ -0,0 +1,155 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'PLLaVA', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1786336055425138939) (May 3, 2024) + """, + 'tweet_1': + """ + Parameter-free LLaVA for video captioning works like magic! 🤩 Let's take a look! 
+ """, + 'tweet_2': + """ + Most of the video captioning models work by downsampling video frames to reduce computational complexity and memory requirements without losing a lot of information in the process. + PLLaVA on the other hand, uses pooling! 🤩 +
+ How? 🧐 + It takes in video frames, passes them through the ViT and then a projection layer, and the output goes through average pooling, where the input shape is (# frames, width, height, text decoder input dim) 👇 + """, + 'tweet_3': + """ + Surprisingly, the pooling operation reduces the loss of spatial and temporal information. See below some examples of how it can capture the details 🤗 + """, + 'tweet_4': + """ + According to the authors' findings, it performs way better than many of the existing models (including proprietary VLMs) and scales very well (on the text decoder side). + """, + 'tweet_5': + """ + Model repositories 🤗 [7B](https://t.co/AeSdYsz1U7), [13B](https://t.co/GnI1niTxO7), [34B](https://t.co/HWAM0ZzvDc) + Spaces 🤗 [7B](https://t.co/Oms2OLkf7O), [13B](https://t.co/C2RNVNA4uR) + """, + 'ressources': + """ + Resources: + [PLLaVA: Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994) + by Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024) + [GitHub](https://github.com/magic-research/PLLaVA) + """ + }, +'fr': { + 'title': 'PLLaVA', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1786336055425138939) (en anglais) (3 mai 2024) + """, + 'tweet_1': + """ + Parameter-free LLaVA (PLLaVA) pour le sous-titrage vidéo fonctionne comme par magie ! 🤩 + Jetons un coup d'œil ! + """, + 'tweet_2': + """ + La plupart des modèles de sous-titrage vidéo fonctionnent par sous-échantillonnage des images vidéo afin de réduire la complexité de calcul et les besoins en mémoire sans perdre beaucoup d'informations au cours du processus. + PLLaVA, quant à lui, utilise le pooling ! 🤩 +
+ Comment ? + Il prend les images de la vidéo, les passe au ViT puis à la couche de projection, et la sortie passe par un average pooling où la forme d'entrée est (# images, largeur, hauteur, dim d'entrée du décodeur de texte) 👇 """, + 'tweet_3': + """ + L'opération de pooling réduit de manière surprenante la perte d'informations spatiales et temporelles. Voir ci-dessous quelques exemples de la façon dont elle peut capturer les détails 🤗 """, + 'tweet_4': + """ + Selon les conclusions des auteurs, il est bien plus performant que de nombreux modèles existants (y compris les VLM propriétaires) et passe à l'échelle très bien (sur le décodeur de texte). """, + 'tweet_5': + """ + Dépôts des modèles 🤗 [7 Mds](https://t.co/AeSdYsz1U7), [13 Mds](https://t.co/GnI1niTxO7), [34 Mds](https://t.co/HWAM0ZzvDc) + Spaces🤗 [7 Mds](https://t.co/Oms2OLkf7O), [13 Mds](https://t.co/C2RNVNA4uR) + """, + 'ressources': + """ + Ressources : + [PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning](https://arxiv.org/abs/2404.16994) + de Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, Jiashi Feng (2024) + [GitHub](https://github.com/magic-research/PLLaVA) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/PLLaVA/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/PLLaVA/image_2.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/PLLaVA/image_3.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/PLLaVA/image_4.jpeg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("MiniGemini") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("MiniGemini") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("CuMo") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("CuMo") \ No newline at end of file diff --git a/pages/17_CuMo.py b/pages/17_CuMo.py new file mode 100644 index 0000000000000000000000000000000000000000..f77bb2cb9f64b24b57c2a982cb8860dab29d4798 --- /dev/null +++ b/pages/17_CuMo.py @@ 
-0,0 +1,140 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'CuMo', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1790665706205307191) (May 15, 2024) + """, + 'tweet_1': + """ + It's raining vision language models ☔️ + CuMo is a new vision language model that has MoE in every step of the VLM (image encoder, MLP and text decoder) and uses Mistral-7B for the decoder part 🤓 + """, + 'tweet_2': + """ + The authors firstly did pre-training of MLP with the by freezing the image encoder and text decoder, then they warmup the whole network by unfreezing and finetuning which they state to stabilize the visual instruction tuning when bringing in the experts. + """, + 'tweet_3': + """ + The mixture of experts MLP blocks above are simply the same MLP blocks initialized from the single MLP that was trained during pre-training and fine-tuned in pre-finetuning 👇 + """, + 'tweet_4': + """ + It works very well (also tested myself) that it outperforms the previous SOTA of it's size LLaVA-NeXT! 😍 + I wonder how it would compare to IDEFICS2-8B You can try it yourself [here](https://t.co/MLIYKVh5Ee). + """, + 'ressources': + """ + Ressources: + [CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts](https://arxiv.org/abs/2405.05949) + by Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, Longyin Wen (2024) + [GitHub](https://github.com/SHI-Labs/CuMo) + """ + }, +'fr': { + 'title': 'CuMo', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1790665706205307191) (en anglais) (15 mai 2024) + """, + 'tweet_1': + """ + Il pleut des modèles de langage/vision ☔️ + CuMo est un nouveau modèle de langage/vision qui intègre le MoE à chaque étape du VLM (encodeur d'images, MLP et décodeur de texte) et utilise Mistral-7B pour la partie décodeur 🤓 + """, + 'tweet_2': + """ + Les auteurs ont tout d'abord effectué un pré-entraînement du MLP en gelant l'encodeur d'images et le décodeur de texte, puis ils ont réchauffé l'ensemble du réseau en le réglant avec précision, ce qui, selon eux, permet de stabiliser le réglage des instructions visuelles lors de l'intervention des experts. + """, + 'tweet_3': + """ + Le mélange d'experts de blocs MLP ci-dessus est simplement le même bloc MLP initialisé à partir du MLP unique qui a été entraîné pendant le pré-entraînement et finetuné dans le pré-finetuning 👇 + """, + 'tweet_4': + """ + Cela fonctionne très bien (je l'ai testé moi-même) et surpasse le précédent SOTA de taille équivalente, LLaVA-NeXT ! 😍 + Je me demande comment il se compare à IDEFICS2-8B. Vous pouvez l'essayer vous-même [ici](https://t.co/MLIYKVh5Ee). 
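+ Pour fixer les idées, le « co-upcycling » des experts décrit plus haut revient en gros à ceci (esquisse minimale, pas le code des auteurs ; dimensions, nombre d'experts et routage top-2 sont des hypothèses) :
+ ```python
+ import copy
+ import torch
+ import torch.nn as nn
+
+ # Chaque expert est une copie du MLP unique pré-entraîné ; un routeur top-k répartit ensuite les tokens
+ mlp = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024))
+ experts = nn.ModuleList(copy.deepcopy(mlp) for _ in range(4))
+ router = nn.Linear(1024, 4)
+
+ x = torch.randn(16, 1024)                        # tokens visuels projetés (dimensions hypothétiques)
+ top_w, top_i = router(x).softmax(-1).topk(2, -1) # routage top-2
+ out = torch.zeros_like(x)
+ for n in range(x.size(0)):                       # boucle naïve, uniquement pour l'illustration
+     for w_nk, i_nk in zip(top_w[n], top_i[n]):
+         out[n] += w_nk * experts[int(i_nk)](x[n])
+ ```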
+ """, + 'ressources': + """ + Ressources : + [CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts](https://arxiv.org/abs/2405.05949) + de Jiachen Li, Xinyao Wang, Sijie Zhu, Chia-Wen Kuo, Lu Xu, Fan Chen, Jitesh Jain, Humphrey Shi, Longyin Wen (2024) + [GitHub](https://github.com/SHI-Labs/CuMo) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/CuMo/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/CuMo/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/CuMo/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/CuMo/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("PLLaVA") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("PLLaVA") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("DenseConnector") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("DenseConnector") diff --git a/pages/18_DenseConnector.py b/pages/18_DenseConnector.py new file mode 100644 index 0000000000000000000000000000000000000000..d10d8c9f6ebcf4d74b6c4eafba08eff52c3ee545 --- /dev/null +++ b/pages/18_DenseConnector.py @@ -0,0 +1,156 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'DenseConnector', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1796089181988352216) (May 30, 2024) + """, + 'tweet_1': + """ + Do we fully leverage image encoders in vision language models? 👀 + A new paper built a dense connector that does it better! 
Let's dig in 🧶 + """, + 'tweet_2': + """ + VLMs consist of an image encoder block, a projection layer that projects image embeddings to text embedding space and then a text decoder sequentially connected 📖 + This [paper](https://t.co/DPQzbj0eWm) explores using intermediate states of image encoder and not a single output 🤩 + """, + 'tweet_3': + """ + The authors explore three different ways of instantiating dense connector: sparse token integration, sparse channel integration and dense channel integration (each of them just take intermediate outputs and put them together in different ways, see below). + """, + 'tweet_4': + """ + They explore all three of them integrated to LLaVA 1.5 and found out each of the new models are superior to the original LLaVA 1.5. + """, + 'tweet_5': + """ + I tried the [model](https://huggingface.co/spaces/HuanjinYao/DenseConnector-v1.5-8B) and it seems to work very well 🥹 + The authors have released various [checkpoints](https://t.co/iF8zM2qvDa) based on different decoders (Vicuna 7/13B and Llama 3-8B). + """, + 'ressources': + """ + Ressources: + [Dense Connector for MLLMs](https://arxiv.org/abs/2405.13800) + by Huanjin Yao, Wenhao Wu, Taojiannan Yang, YuXin Song, Mengxi Zhang, Haocheng Feng, Yifan Sun, Zhiheng Li, Wanli Ouyang, Jingdong Wang (2024) + [GitHub](https://github.com/HJYao00/DenseConnector) + """ + }, +'fr': { + 'title': 'DenseConnector', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1796089181988352216) (en anglais) (30 mai 2024) + """, + 'tweet_1': + """ + Exploitons-nous pleinement les encodeurs d'images dans les modèles de langage/vision ? 👀 + Un nouveau papier a construit un connecteur dense qui le fait mieux ! Creusons un peu 🧶 + """, + 'tweet_2': + """ + Les VLM se composent d'un bloc encodeur d'images, d'une couche de projection qui projette les enchâssements d'images dans l'espace d'enchâssement du texte, puis d'un décodeur de texte connecté séquentiellement 📖. + Ce [papier](https://t.co/DPQzbj0eWm) explore l'utilisation d'états intermédiaires de l'encodeur d'images et non d'une sortie unique 🤩 + """, + 'tweet_3': + """ + Les auteurs explorent trois manières différentes d'instancier un connecteur dense : l'intégration de tokens épars, l'intégration de canaux épars et l'intégration de canaux denses (chacune d'entre elles prend simplement des sorties intermédiaires et les rassemble de différentes manières, voir ci-dessous). + """, + 'tweet_4': + """ + Ils ont exploré les trois modèles intégrés à LLaVA 1.5 et ont constaté que chacun des nouveaux modèles est supérieur au LLaVA 1.5 original. + """, + 'tweet_5': + """ + J'ai essayé le [modèle](https://huggingface.co/spaces/HuanjinYao/DenseConnector-v1.5-8B) et il semble fonctionner très bien 🥹 + Les auteurs ont publié plusieurs [checkpoints](https://t.co/iF8zM2qvDa) basés sur différents décodeurs (Vicuna 7/13B et Llama 3-8B). 
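+ Pour illustrer l'idée du connecteur dense décrite plus haut (ici une variante proche de l'intégration de canaux épars), voici une esquisse minimale (pas le code des auteurs ; dimensions et couches choisies hypothétiques) :
+ ```python
+ import torch
+
+ # Au lieu de la seule dernière couche, on combine des états intermédiaires de l'encodeur d'images
+ hidden_states = [torch.randn(1, 576, 1024) for _ in range(24)]  # 24 couches d'un ViT (hypothèse)
+ selected = [hidden_states[i] for i in (7, 15, 23)]              # couches retenues (hypothèse)
+ dense_feats = torch.cat(selected, dim=-1)                       # (1, 576, 3072), envoyé au MLP de projection
+ ```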
+ """, + 'ressources': + """ + Ressources : + [Dense Connector for MLLMs](https://arxiv.org/abs/2405.13800) + de Huanjin Yao, Wenhao Wu, Taojiannan Yang, YuXin Song, Mengxi Zhang, Haocheng Feng, Yifan Sun, Zhiheng Li, Wanli Ouyang, Jingdong Wang (2024) + [GitHub](https://github.com/HJYao00/DenseConnector) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DenseConnector/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DenseConnector/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DenseConnector/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DenseConnector/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/DenseConnector/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("CuMo") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("CuMo") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Depth Anything v2") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Depth Anything v2") diff --git a/pages/19_Depth_Anything_V2.py b/pages/19_Depth_Anything_V2.py new file mode 100644 index 0000000000000000000000000000000000000000..a3e2eafa5b317bbbcb4a973255c9ca2f0f1448ef --- /dev/null +++ b/pages/19_Depth_Anything_V2.py @@ -0,0 +1,167 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Depth Anything V2', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1803063120354492658) (June 18, 2024) + """, + 'tweet_1': + """ + I love Depth Anything V2 😍 + It’s Depth Anything, but scaled with both larger teacher model and a gigantic dataset! Let’s unpack 🤓🧶! 
+ """, + 'tweet_2': + """ + The authors have analyzed Marigold, a diffusion based model against Depth Anything and found out what’s up with using synthetic images vs real images for MDE: + 🔖 Real data has a lot of label noise, inaccurate depth maps (caused by depth sensors missing transparent objects etc) + 🔖 Synthetic data have more precise and detailed depth labels and they are truly ground-truth, but there’s a distribution shift between real and synthetic images, and they have restricted scene coverage + """, + 'tweet_3': + """ + The authors train different image encoders only on synthetic images and find out unless the encoder is very large the model can’t generalize well (but large models generalize inherently anyway) 🧐 + But they still fail encountering real images that have wide distribution in labels 🥲 + """, + 'tweet_4': + """ + Depth Anything v2 framework is to... + 🦖 Train a teacher model based on DINOv2-G based on 595K synthetic images + 🏷️ Label 62M real images using teacher model + 🦕 Train a student model using the real images labelled by teacher + Result: 10x faster and more accurate than Marigold! + """, + 'tweet_5': + """ + The authors also construct a new benchmark called DA-2K that is less noisy, highly detailed and more diverse! + I have created a [collection](https://t.co/3fAB9b2sxi) that has the models, the dataset, the demo and CoreML converted model 😚 + """, + 'ressources': + """ + Ressources: + [Depth Anything V2](https://arxiv.org/abs/2406.09414) by Lihe Yang, Bingyi Kang, Zilong Huang, Zhen Zhao, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) + [GitHub](https://github.com/DepthAnything/Depth-Anything-V2) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything_v2) + """ + }, +'fr': { + 'title': 'Depth Anything V2', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1803063120354492658) (en anglais) (18 juin 2024) + """, + 'tweet_1': + """ + J'adore Depth Anything V2 😍 + C'est un Depth Anything, mais passé à l'échelle avec à la fois un modèle enseignant plus grand et un jeu de données gigantesque ! + Décortiquons tout ça 🤓🧶 ! + """, + 'tweet_2': + """ + Les auteurs ont analysé Marigold, un modèle de diffusion vs Depth Anything et ont découvert ce qui se passe avec l'utilisation d'images synthétiques par rapport à des images réelles : + 🔖 Les données réelles peuvent être mal étiquettées, les cartes de profondeur peuvent être imprécises (du fait des capteurs de profondeur manquant des objets transparents, etc.) + 🔖 Les données synthétiques ont des étiquettes plus précises ainsi que des cartes de profondeur plus détaillées/véridiques, mais il y a un décalage de distribution entre les images réelles et synthétiques. Les scènes couvertes sont également plus restreintes. + """, + 'tweet_3': + """ + Les auteurs entraînent différents encodeurs d'images uniquement sur des images synthétiques et découvrent qu'à moins que l'encodeur ne soit très grand, + le modèle ne peut pas bien généraliser (mais les grands modèles généralisent de toute façon de manière inhérente) 🧐 + Mais ils ne parviennent toujours pas à trouver des images réelles ayant une grande diffusion d'étiquettes 🥲 + """, + 'tweet_4': + """ + Le framework Depth Anything v2 a pour but... 
+ 🦖 d'entraîner un modèle enseignant basé sur DINOv2-G à partir de 595K images synthétiques + 🏷️ d'étiqueter 62M d'images réelles à l'aide du modèle enseignant + 🦕 d'entraîner un modèle étudiant en utilisant les images réelles étiquetées par l'enseignant + Résultat : 10x plus rapide et plus précis que Marigold ! + """, + 'tweet_5': + """ + Les auteurs porposent également un nouveau benchmark appelé DA-2K qui est moins bruité, très détaillé et plus diversifié ! + J'ai créé une [collection](https://t.co/3fAB9b2sxi) contenant le modèles le jeu de données, la démo et la convertion en CoreML 😚 + """, + 'ressources': + """ + Ressources : + [Depth Anything V2](https://arxiv.org/abs/2406.09414) de Lihe Yang, Bingyi Kang, Zilong Huang, Zhen Zhao, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) + [GitHub](https://github.com/DepthAnything/Depth-Anything-V2) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/depth_anything_v2) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Depth_Anything_v2/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("DenseConnector") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("DenseConnector") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Florence-2") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Florence-2") diff --git a/pages/1_MobileSAM.py b/pages/1_MobileSAM.py index 80e21e5e4cc13211869262ef9775baf217b32187..6e163fac05a00eb97fe0257e3fb5cc40520a0bb2 100644 --- a/pages/1_MobileSAM.py +++ b/pages/1_MobileSAM.py @@ -1,79 +1,172 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("MobileSAM") - -st.success("""[Original 
tweet](https://twitter.com/mervenoyann/status/1738959605542076863) (December 24, 2023) + """, + 'tweet_1': + """Read the MobileSAM paper this weekend 📖 Sharing some insights! + The idea 💡: the SAM model consists of three parts: a heavy image encoder, a prompt encoder (the prompt can be text, a bounding box, a mask or a point) and a mask decoder. +
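+ The heavy image encoder is the part MobileSAM goes after; the decoupled recipe discussed below boils down to making a tiny encoder mimic the big one, roughly like this (a self-contained sketch with placeholder modules, not the authors' code):
+ ```python
+ import torch
+ import torch.nn as nn
+
+ # Placeholder encoders standing in for SAM's ViT-H (teacher) and a TinyViT (student)
+ teacher = nn.Sequential(nn.Conv2d(3, 256, kernel_size=16, stride=16), nn.Flatten(2))
+ student = nn.Sequential(nn.Conv2d(3, 256, kernel_size=16, stride=16), nn.Flatten(2))
+ teacher.requires_grad_(False)                    # frozen teacher, mask decoder kept out of the loop
+
+ optimizer = torch.optim.AdamW(student.parameters(), lr=1e-4)
+ images = torch.randn(2, 3, 1024, 1024)           # dummy batch
+
+ loss = nn.functional.mse_loss(student(images), teacher(images))
+ loss.backward()
+ optimizer.step()
+ ```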
+ To make the SAM model smaller without compromising from the performance, the authors looked into three types of distillation. + First one is distilling the decoder outputs directly (a more naive approach) with a completely randomly initialized small ViT and randomly initialized mask decoder. + However, when the ViT and the decoder are both in a bad state, this doesn't work well. + """, + 'tweet_2': + """ + The second type of distillation is called semi-coupled, where the authors only randomly initialized the ViT image encoder and kept the mask decoder. + This is called semi-coupled because the image encoder distillation still depends on the mask decoder (see below 👇) + """, + 'tweet_3': + """ + The last type of distillation, [decoupled distillation](https://openaccess.thecvf.com/content/CVPR2022/papers/Zhao_Decoupled_Knowledge_Distillation_CVPR_2022_paper.pdf), is the most intuitive IMO. + The authors have "decoupled" image encoder altogether and have frozen the mask decoder and didn't really distill based on generated masks. + This makes sense as the bottleneck here is the encoder itself and most of the time, distillation works well with encoding. + """, + 'tweet_4': + """ + Finally, they found out that decoupled distillation performs better than coupled distillation by means of mean IoU and requires much less compute! ♥️ + """, + 'tweet_5': + """ + Wanted to leave some links here if you'd like to try yourself 👇 + - MobileSAM [demo](https://huggingface.co/spaces/dhkim2810/MobileSAMMobileSAM) + - Model [repository](https://huggingface.co/dhkim2810/MobileSAM) + + If you'd like to experiment around TinyViT, [timm library](https://huggingface.co/docs/timm/index) ([Ross Wightman](https://x.com/wightmanr)) has a bunch of [checkpoints available](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit). + """, + 'ressources': + """ + Ressources: + [Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289) + by Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023) + [GitHub](https://github.com/ChaoningZhang/MobileSAM)""" + }, +'fr': { + 'title': 'MobileSAM', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1738959605542076863) (en anglais) (24 décembre 2023) + """, + 'tweet_1': + """J'ai lu le papier MobileSAM ce week-end 📖 Je vous partage quelques enseignements ! + L'idée 💡 : SAM se compose de trois parties : un gros encodeur pour les images, un encodeur pour le prompt + (pouvant être un texte, une bounding box, un masque ou un point) et un décodeur pour le masque. +
+ Pour réduire la taille du modèle SAM sans compromettre ses performances, les auteurs ont envisagé trois types de distillation. + Le premier (naïf) consiste à distiller directement les sorties du décodeur dans un petit ViT et dans un décodeur de masque, tous deux initialisés aléatoirement. + Cependant, lorsque le ViT et le décodeur sont tous deux dans un mauvais état, cela ne fonctionne pas bien. + """, + 'tweet_2': + """ + Le deuxième type de distillation est appelé semi-couplé, où les auteurs initialisent aléatoirement le ViT de l'encodeur d'image et conservent le décodeur de masque. + Ce type de distillation est appelé semi-couplé parce que la distillation de l'encodeur d'image dépend toujours du décodeur de masquage (voir ci-dessous 👇). + """, + 'tweet_3': + """ + Le dernier type de distillation, la [distillation découplée](https://openaccess.thecvf.com/content/CVPR2022/papers/Zhao_Decoupled_Knowledge_Distillation_CVPR_2022_paper.pdf), est le plus intuitif selon moi. + Les auteurs découplent complètement l'encodeur d'image et gèlent le décodeur de masque. Ils ne distillent pas sur les masques générés. + C'est logique car le goulot d'étranglement ici est l'encodeur lui-même et la plupart du temps la distillation fonctionne bien avec l'encodage. + """, + 'tweet_4': + """ + Finalement, ils observent que la distillation découplée est plus performante que la distillation couplée en termes d'IoU moyen et qu'elle nécessite beaucoup moins de calculs ! ♥️ + """, + 'tweet_5': + """ + Quelques liens si vous voulez essayer vous-même 👇 + - La [démo](https://huggingface.co/spaces/dhkim2810/MobileSAMMobileSAM) + - Le [dépôt Hugging Face du modèle](https://huggingface.co/dhkim2810/MobileSAM) + + Si vous souhaitez expérimenter avec un TinyViT, la bibliotèque [timm](https://huggingface.co/docs/timm/index) ([Ross Wightman](https://x.com/wightmanr)) + dispose d'un certain nombre de [checkpoints](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit). + """, + 'ressources': + """ + Ressources : + [Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289) + de Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023) + [GitHub](https://github.com/ChaoningZhang/MobileSAM) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown("""Read the MobileSAM paper this weekend 📖 Sharing some insights! -The idea 💡: SAM model consist of three parts, a heavy image encoder, a prompt encoder (prompt can be text, bounding box, mask or point) and a mask decoder. - -To make the SAM model smaller without compromising from the performance, the authors looked into three types of distillation. -First one is distilling the decoder outputs directly (a more naive approach) with a completely randomly initialized small ViT and randomly initialized mask decoder. -However, when the ViT and the decoder are both in a bad state, this doesn't work well. 
-""") +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/MobileSAM/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -The second type of distillation is called semi-coupled, where the authors only randomly initialized the ViT image encoder and kept the mask decoder. -This is called semi-coupled because the image encoder distillation still depends on the mask decoder (see below 👇) -""") +st.markdown(translations[lang]["tweet_2"]) st.markdown(""" """) st.image("pages/MobileSAM/image_2.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -The last type of distillation, [decoupled distillation](https://openaccess.thecvf.com/content/CVPR2022/papers/Zhao_Decoupled_Knowledge_Distillation_CVPR_2022_paper.pdf), is the most intuitive IMO. -The authors have "decoupled" image encoder altogether and have frozen the mask decoder and didn't really distill based on generated masks. -This makes sense as the bottleneck here is the encoder itself and most of the time, distillation works well with encoding. -""") +st.markdown(translations[lang]["tweet_3"]) st.markdown(""" """) st.image("pages/MobileSAM/image_3.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Finally, they found out that decoupled distillation performs better than coupled distillation by means of mean IoU and requires much less compute! ♥️ -""") +st.markdown(translations[lang]["tweet_4"]) st.markdown(""" """) st.image("pages/MobileSAM/image_4.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Wanted to leave some links here if you'd like to try yourself 👇 -- MobileSAM [demo](https://huggingface.co/spaces/dhkim2810/MobileSAMMobileSAM) -- Model [repository](https://huggingface.co/dhkim2810/MobileSAM) - -If you'd like to experiment around TinyViT, [timm library](https://huggingface.co/docs/timm/index) ([Ross Wightman](https://x.com/wightmanr)) has a bunch of [checkpoints available](https://huggingface.co/models?sort=trending&search=timm%2Ftinyvit). 
-""") +st.markdown(translations[lang]["tweet_5"]) st.markdown(""" """) st.image("pages/MobileSAM/image_5.jpeg", use_column_width=True) st.markdown(""" """) - -st.info(""" -Ressources: -[Faster Segment Anything: Towards Lightweight SAM for Mobile Applications](https://arxiv.org/abs/2306.14289) -by Chaoning Zhang, Dongshen Han, Yu Qiao, Jung Uk Kim, Sung-Ho Bae, Seungkyu Lee, Choong Seon Hong (2023) -[GitHub](https://github.com/ChaoningZhang/MobileSAM)""", icon="📚") +st.info(translations[lang]["ressources"], icon="📚") st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("KOSMOS-2") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("KOSMOS-2") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("OneFormer") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("OneFormer") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("OneFormer") \ No newline at end of file diff --git a/pages/20_Florence-2.py b/pages/20_Florence-2.py new file mode 100644 index 0000000000000000000000000000000000000000..ddb761acd03cb414f6f6db26363f1569aa412a28 --- /dev/null +++ b/pages/20_Florence-2.py @@ -0,0 +1,176 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Florence-2', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1803769866878623819) (June 20, 2024) + """, + 'tweet_1': + """ + Florence-2 is a new vision foundation model by Microsoft capable of a wide variety of tasks 🤯 + Let's unpack! 🧶 + """, + 'tweet_2': + """ + This model is can handle tasks that vary from document understanding to semantic segmentation 🤩 + [Demo](https://t.co/7YJZvjhw84) | [Collection](https://t.co/Ub7FGazDz1) + """, + 'tweet_3': + """ + The difference from previous models is that the authors have compiled a dataset that consists of 126M images with 5.4B annotations labelled with their own data engine ↓↓ + """, + 'tweet_4': + """ + The dataset also offers more variety in annotations compared to other datasets, it has region level and image level annotations with more variety in semantic granularity as well! + """, + 'tweet_5': + """ + The model is a similar architecture to previous models, an image encoder, a multimodality encoder with text decoder. 
+ The authors have compiled the multitask dataset with prompts for each task which makes the model trainable on multiple tasks 🤗 + """, + 'tweet_6': + """ + You also fine-tune this model on any task of choice, the authors also released different results on downstream tasks and report their results when un/freezing vision encoder 🤓📉 + They have released fine-tuned models too, you can find them in the collection above 🤗 + """, + 'ressources': + """ + Ressources: + [Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242) + by Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023) + [Hugging Face blog post](https://huggingface.co/blog/finetune-florence2) + [Fine-tuning notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_Florence_2.ipynb) + [Florence-2 fine-tuned](https://huggingface.co/models?search=florence-2) + """ + }, +'fr': { + 'title': 'Florence-2', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1803769866878623819) (en anglais) (20 juin 2024) + """, + 'tweet_1': + """ + Florence-2 est un nouveau modèle de fondation de vision de Microsoft capable d'une grande variété de tâches 🤯. + Déballons tout ça ! 🧶 + """, + 'tweet_2': + """ + Ce modèle peut traiter des tâches allant de la compréhension de documents à la segmentation sémantique 🤩 + [Demo](https://t.co/7YJZvjhw84) | [Collection](https://t.co/Ub7FGazDz1) + """, + 'tweet_3': + """ + La différence avec les modèles précédents est que les auteurs ont constitué un jeu de données de 126 millions d'images étiquetées avec 5,4 milliards d'annotations via leur propre moteur de données ↓↓ + """, + 'tweet_4': + """ + Ce jeu de données offre aussi une plus grande variété d'annotations par rapport aux autres jeux de données, avec des annotations au niveau de la région et de l'image, ainsi qu'une plus grande variété dans la granularité sémantique ! + """, + 'tweet_5': + """ + Le modèle a une architecture similaire aux modèles précédents, un encodeur d'image, un encodeur de multimodalité avec un décodeur de texte. + Les auteurs ont compilé le jeu de données multitâches avec des prompts chaque tâche, ce qui rend le modèle entraînable sur de multiples tâches 🤗 + """, + 'tweet_6': + """ + Vous pouvez finetuner ce modèle sur n'importe quelle tâche de votre choix. Les auteurs ont publié différents résultats sur des tâches en aval et rapportent leurs résultats lors qu'ils gèlent/dégelent l'encodeur de vision 🤓📉. 
+ Ils ont aussi publié des modèles finetunés que vous pouvez trouver dans la collection ci-dessus 🤗 + """, + 'ressources': + """ + Ressources : + [Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks](https://arxiv.org/abs/2311.06242) + de Bin Xiao, Haiping Wu, Weijian Xu, Xiyang Dai, Houdong Hu, Yumao Lu, Michael Zeng, Ce Liu, Lu Yuan (2023) + [Article de blog sur Hugging Face](https://huggingface.co/blog/finetune-florence2) + [Notebook de finetuning](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_Florence_2.ipynb) + [Les Florence-2 finetunés](https://huggingface.co/models?search=florence-2) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Florence-2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Florence-2/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Florence-2/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Florence-2/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Florence-2/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Florence-2/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Depth Anything V2") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Depth Anything V2") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("4M-21") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("4M-21") \ No newline at end of file diff --git a/pages/21_4M-21.py b/pages/21_4M-21.py new file mode 100644 index 0000000000000000000000000000000000000000..f22a27995b9e0d92fb094e48ec2c4b935984f3b0 --- /dev/null +++ b/pages/21_4M-21.py @@ -0,0 +1,156 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': '4M-21', + 'original_tweet': + """ + [Original 
tweet](https://twitter.com/mervenoyann/status/1804138208814309626) (June 21, 2024) + """, + 'tweet_1': + """ + EPFL and Apple just released 4M-21: single any-to-any model that can do anything from text-to-image generation to generating depth masks! 🙀 + Let's unpack 🧶 + """, + 'tweet_2': + """ + 4M is a multimodal training [framework](https://t.co/jztLublfSF) introduced by Apple and EPFL. + Resulting model takes image and text and output image and text 🤩 + [Models](https://t.co/1LC0rAohEl) | [Demo](https://t.co/Ra9qbKcWeY) + """, + 'tweet_3': + """ + This model consists of transformer encoder and decoder, where the key to multimodality lies in input and output data: + input and output tokens are decoded to generate bounding boxes, generated image's pixels, captions and more! + """, + 'tweet_4': + """ + This model also learnt to generate canny maps, SAM edges and other things for steerable text-to-image generation 🖼️ + The authors only added image-to-all capabilities for the demo, but you can try to use this model for text-to-image generation as well ☺️ + """, + 'tweet_5': + """ + In the project page you can also see the model's text-to-image and steered generation capabilities with model's own outputs as control masks! + """, + 'ressources': + """ + Ressources + [4M-21: An Any-to-Any Vision Model for Tens of Tasks and Modalities](https://arxiv.org/abs/2406.09406) by Roman Bachmann, Oğuzhan Fatih Kar, David Mizrahi, Ali Garjani, Mingfei Gao, David Griffiths, Jiaming Hu, Afshin Dehghan, Amir Zamir (2024) + [GitHub](https://github.com/apple/ml-4m/) + """ + }, +'fr': { + 'title': '4M-21', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1804138208814309626) (en anglais) (21 juin 2024) + """, + 'tweet_1': + """ + L'EPFL et Apple viennent de publier 4M-21 : un modèle unique qui peut tout faire, de la génération texte-à-image à la génération de masques de profondeur ! 🙀 + Détaillons tout ça 🧶 + """, + 'tweet_2': + """ + 4M est un [framework](https://t.co/jztLublfSF) d'entraînement multimodal introduit par Apple et l'EPFL. + Le modèle résultant prend une image et un texte et produit une image et un texte 🤩 + [Modèles](https://t.co/1LC0rAohEl) | [Demo](https://t.co/Ra9qbKcWeY) + """, + 'tweet_3': + """ + Ce modèle se compose d'un transformer encodeur-décodeur, où la clé de la multimodalité réside dans les données d'entrée et de sortie : + les tokens d'entrée et de sortie sont décodés pour générer des boîtes de délimitation, les pixels de l'image, les légendes, etc. ! + """, + 'tweet_4': + """ + Ce modèle a aussi appris à générer des filtres de Canny, des bordures SAM et pleins d'autres choses pour tout ce qui est pilotage de la génération d'images à partir de textes 🖼️ + Les auteurs n'ont ajouté que des capacités image-vers-tout pour la démo, mais vous pouvez essayer d'utiliser ce modèle pour la génération texte-image également ☺️ """, + 'tweet_5': + """ + Dans la page du projet, vous pouvez également voir les capacités du modèle en matière de texte vers image et de génération dirigée avec les propres sorties du modèle en tant que masques de contrôle ! 
""", + 'ressources': + """ + Ressources : + [4M-21: An Any-to-Any Vision Model for Tens of Tasks and Modalities](https://arxiv.org/abs/2406.09406) de Roman Bachmann, Oğuzhan Fatih Kar, David Mizrahi, Ali Garjani, Mingfei Gao, David Griffiths, Jiaming Hu, Afshin Dehghan, Amir Zamir (2024) + [GitHub](https://github.com/apple/ml-4m/) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/4M-21/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/4M-21/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/4M-21/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/4M-21/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/4M-21/video_2.mp4", format="video/mp4") +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Florence-2") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Florence-2") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("RT-DETR") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("RT-DETR") \ No newline at end of file diff --git a/pages/22_RT-DETR.py b/pages/22_RT-DETR.py new file mode 100644 index 0000000000000000000000000000000000000000..29f4c3aab2f734faba511fd8f80dde87091120ed --- /dev/null +++ b/pages/22_RT-DETR.py @@ -0,0 +1,156 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'RT-DETR', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1807790959884665029) (July 1, 2024) + """, + 'tweet_1': + """ + Real-time DEtection Transformer (RT-DETR) landed in 🤗 Transformers with Apache 2.0 license 😍 + Do DETRs Beat YOLOs on Real-time Object Detection? Keep reading 👀 + """, + 'tweet_2': + """ + Short answer, it does! 
📖 + [notebook](https://t.co/NNRpG9cAEa), 🔖 [models](https://t.co/ctwWQqNcEt), 🔖 [demo](https://t.co/VrmDDDjoNw) + + YOLO models are known to be super fast for real-time computer vision, but they have a downside with being volatile to NMS 🥲 + Transformer-based models on the other hand are computationally not as efficient 🥲 + Isn't there something in between? Enter RT-DETR! + + The authors combined CNN backbone, multi-stage hybrid decoder (combining convs and attn) with a transformer decoder ⇓ + """, + 'tweet_3': + """ + In the paper, authors also claim one can adjust speed by changing decoder layers without retraining altogether. + They also conduct many ablation studies and try different decoders. + """, + 'tweet_4': + """ + The authors find out that the model performs better in terms of speed and accuracy compared to the previous state-of-the-art 🤩 + """, + 'ressources': + """ + Ressources: + [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) + by Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen (2023) + [GitHub](https://github.com/lyuwenyu/RT-DETR/) + [Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/rt_detr) + """ + }, +'fr': { + 'title': 'RT-DETR', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1807790959884665029) (en anglais) (1er juillet 2024) + """, + 'tweet_1': + """ + Real-time DEtection Transformer (RT-DETR) débarque dans 🤗 Transformers avec une licence Apache 2.0 😍 + Les DETR battent-ils les YOLO en matière de détection d'objets en temps réel ? Continuez à lire 👀 + """, + 'tweet_2': + """ + Réponse courte, c'est le cas ! 📖 + [notebook](https://t.co/NNRpG9cAEa), 🔖 [models](https://t.co/ctwWQqNcEt), 🔖 [demo](https://t.co/VrmDDDjoNw) + + Les YOLO sont connus pour être super rapides pour de la vision par ordinateur en temps réel, mais ils ont l'inconvénient d'être volatils pour la suppression non maximale 🥲. + Les modèles basés sur les transformers quant à eux ne sont pas aussi efficaces sur le plan du calcul 🥲 + N'y a-t-il pas une solution intermédiaire ? C'est là que rentre en jeu RT-DETR ! + + Les auteurs ont combiné un ConvNet, un décodeur hybride à plusieurs étapes (combinant convolution et attention) avec un transformer-décodeur ⇓ + """, + 'tweet_3': + """ + Dans le papier, les auteurs affirment qu'il est possible d'ajuster la vitesse en changeant les couches du décodeur sans procéder à un nouvel entraînement. + Ils mènent également de nombreuses études d'ablation et essaient différents décodeurs. 
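+ Pour l'essayer avec 🤗 Transformers, voici une esquisse minimale (le nom du checkpoint est une hypothèse, voir les modèles liés plus haut) :
+ ```python
+ import requests
+ import torch
+ from PIL import Image
+ from transformers import RTDetrForObjectDetection, RTDetrImageProcessor
+
+ ckpt = "PekingU/rtdetr_r50vd"                    # hypothèse de checkpoint
+ processor = RTDetrImageProcessor.from_pretrained(ckpt)
+ model = RTDetrForObjectDetection.from_pretrained(ckpt)
+
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ inputs = processor(images=image, return_tensors="pt")
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ # Post-traitement : boîtes, scores et classes au-dessus d'un seuil de confiance
+ results = processor.post_process_object_detection(
+     outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.5
+ )[0]
+ print(results["scores"], results["labels"], results["boxes"])
+ ```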
+ """, + 'tweet_4': + """ + Les auteurs constatent que le modèle est plus performant en termes de rapidité et de précision que les modèles précédents 🤩 + """, + 'ressources': + """ + Ressources : + [DETRs Beat YOLOs on Real-time Object Detection](https://arxiv.org/abs/2304.08069) + de Yian Zhao, Wenyu Lv, Shangliang Xu, Jinman Wei, Guanzhong Wang, Qingqing Dang, Yi Liu, Jie Chen (2023) + [GitHub](https://github.com/lyuwenyu/RT-DETR/) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/main/en/model_doc/rt_detr) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/RT-DETR/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/RT-DETR/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/RT-DETR/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/RT-DETR/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("4M-21") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("4M-21") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("ColPali") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("ColPali") diff --git a/pages/23_ColPali.py b/pages/23_ColPali.py new file mode 100644 index 0000000000000000000000000000000000000000..416e1079f48fb4f4ffd74ad5298bbb0df81876d0 --- /dev/null +++ b/pages/23_ColPali.py @@ -0,0 +1,186 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'ColPali', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1811003265858912670) (Jul 10, 2024) + """, + 'tweet_1': + """ + Forget any document retrievers, use ColPali 💥💥 +
+ Document retrieval is usually done with OCR + layout detection, but that pipeline is overkill and doesn't work well! 🤓 +
+ ColPali uses a vision language model instead, which is better at document understanding 📑 + """, + 'tweet_2': + """ + Check out the [ColPali model](https://huggingface.co/vidore/colpali) (MIT license!) + Check out the [blog post](https://huggingface.co/blog/manu/colpali) +
+ The authors also released a new benchmark for document retrieval, the [ViDoRe Leaderboard](https://huggingface.co/spaces/vidore/vidore-leaderboard), so submit your model! + """, + 'tweet_3': + """ + Regular document retrieval systems use OCR + layout detection + another model to retrieve information from documents, and then use the resulting representations in applications like RAG 🥲 +
+ Meanwhile, modern image encoders demonstrate out-of-the-box document understanding capabilities! +
+ ColPali marries the idea of modern vision language models with retrieval 🤝 +
+ The authors apply contrastive fine-tuning to SigLIP on documents and pool the outputs (they call it BiSigLIP). Then they feed the patch embedding outputs to PaliGemma and create BiPali 🖇️ + """, + 'tweet_4': + """ + BiPali natively feeds image patch embeddings to an LLM, which enables ColBERT-like late-interaction computations between text tokens and image patches (hence the name ColPali!) 🤩 +
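+ For intuition, that late-interaction score is just a MaxSim over the two sets of embeddings; a minimal sketch (shapes are illustrative, this is not the ColPali API):
+ ```python
+ import torch
+
+ # Illustrative shapes: 20 query-token embeddings vs. 1030 image-patch embeddings, dim 128
+ query_emb = torch.randn(20, 128)
+ page_emb = torch.randn(1030, 128)
+
+ sim = query_emb @ page_emb.T                 # (20, 1030) token-to-patch similarities
+ score = sim.max(dim=1).values.sum()          # best patch per query token, summed (MaxSim)
+ print(float(score))
+ ```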
+ The authors created the ViDoRe benchmark by collecting PDF documents and generating queries with Claude-3 Sonnet. +
+ See below how every model and ColPali performs on ViDoRe 👇🏻 + """, + 'tweet_5': + """ + Aside from performance improvements, ColPali is very fast for offline indexing as well! + """, + 'ressources': + """ + Resources: + [ColPali: Efficient Document Retrieval with Vision Language Models](https://huggingface.co/papers/2407.01449) + by Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (2024) + [GitHub](https://github.com/illuin-tech/colpali) + [Models](https://huggingface.co/models?search=vidore) + [Leaderboard](https://huggingface.co/spaces/vidore/vidore-leaderboard) + """ + }, +'fr': { + 'title': 'ColPali', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1811003265858912670) (en anglais) (10 juillet 2024) + """, + 'tweet_1': + """ + Oubliez la recherche de documents, utilisez ColPali 💥💥💥 + + La recherche de documents se fait par OCR + détection de la mise en page, mais c'est exagéré et ça ne fonctionne pas bien ! 🤓 + + ColPali utilise un modèle de langage/vision, qui est meilleur pour la compréhension des documents 📑 + """, + 'tweet_2': + """ + Consultez [le modèle](https://huggingface.co/vidore/colpali) (licence mit !) et l'article de [blog](https://huggingface.co/blog/manu/colpali). + + Les auteurs ont également publié un nouveau benchmark pour la recherche de documents, [ViDoRe Leaderboard](https://huggingface.co/spaces/vidore/vidore-leaderboard), soumettez votre modèle ! + """, + 'tweet_3': + """ + Les systèmes de recherche documentaire classiques utilisent l'OCR + la détection de la mise en page + un modèle pour extraire les informations des documents, avant de finalement de fournir les représentations à un système de RAG 🥲. + + Alors que les encodeurs d'images modernes démontrent des capacités de compréhension de documents prêtes à l'emploi ! + + ColPali combine l'idée de modèles de langage-vision modernes avec la recherche documentaire 🤝 + + Les auteurs procèdent à un finetuningf de SigLIP sur des documents et font un pooling des sorties (qu'ils appellent BiSigLip). Ils transmettent ensuite des patchs d'enchassements à PaliGemma et créent BiPali 🖇️. """, + 'tweet_4': + """ + BiPali supporte nativement les patchs d'enchâssements d'images dans un LLM ce qui permet d'exploiter des interactions de calculs (semblables à ColBERT) entre les tokens de texte et les patchs d'images. D'où le nom de ColPali ! 🤩 + + Les auteurs ont créé le benchmark ViDoRe en collectant des documents PDF et en générant des requêtes à partir de Claude-3 Sonnet. + + Découvrez ci-dessous les performances de chaque modèle et de ColPali sur ViDoRe 👇🏻 + """, + 'tweet_5': + """ + Au-delà des améliorations de performance, ColPali est également très rapide pour l'indexation hors ligne ! 
+ """, + 'ressources': + """ + Ressources : + [ColPali: Efficient Document Retrieval with Vision Language Models](https://huggingface.co/papers/2407.01449) + de Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (2024) + [GitHub](https://github.com/illuin-tech/colpali) + [Modèles](https://huggingface.co/models?search=vidore) + [Leaderboard](https://huggingface.co/spaces/vidore/vidore-leaderboard) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/ColPali/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/ColPali/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/ColPali/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/ColPali/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/ColPali/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("RT-DETR") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("RT-DETR") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Llava-NeXT-Interleave") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Llava-NeXT-Interleave") diff --git a/pages/24_Llava-NeXT-Interleave.py b/pages/24_Llava-NeXT-Interleave.py new file mode 100644 index 0000000000000000000000000000000000000000..6e48c3ccff000e45e45ef0017be88515df1ada1c --- /dev/null +++ b/pages/24_Llava-NeXT-Interleave.py @@ -0,0 +1,208 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Llava-NeXT-Interleave', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1813560292397203630) (July 17, 2024) + """, + 'tweet_1': + """ + The vision language model in this video is 0.5B and can take in image, video and 3D! 🤯 + Llava-NeXT-Interleave is a new vision language model trained on interleaved image, video and 3D data. 
+ Keep reading ⥥⥥ + """, + 'tweet_2': + """ + This model comes with 0.5B, 7B and 7B-DPO variants, all can be used with 🤗 Transformers 😍 + [Collection of models](https://t.co/sZsaglSXa3) | [Demo](https://t.co/FbpaMWJY8k) + See how to use below 👇🏻 + """, + 'tweet_3': + """ + Authors of this paper have explored training LLaVA-NeXT on interleaved data where the data consists of multiple modalities, including image(s), video, 3D 📚 + They have discovered that interleaved data increases results across all benchmarks! + """, + 'tweet_4': + """ + The model can do task transfer from single image tasks to multiple images 🤯 + The authors have trained the model on single images and code yet the model can solve coding with multiple images. + """, + 'tweet_5': + """ + Same applies to other modalities, see below for video: + """, + 'tweet_6': + """ + The model also has document understanding capabilities and many real-world application areas. + """, + 'tweet_7': + """ + This release also comes with the dataset this model was fine-tuned on 📖 [M4-Instruct-Data](https://t.co/rutXMtNC0I) + """, + 'ressources': + """ + Ressources: + [LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/) + by Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024) + [GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md) + [Hugging Face documentation](https://huggingface.co/docs/transformers/en/model_doc/llava_next) + """ + }, +'fr': { + 'title': 'Llava-NeXT-Interleave', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1813560292397203630) (en anglais) (17 juillet 2024) + """, + 'tweet_1': + """ + Le modèle de langage/vision dans cette vidéo est de 500M de parmaètres et peut prendre en charge image, vidéo et 3D ! 🤯 + Llava-NeXT-Interleave est un nouveau modèle entraîné sur des images, des vidéos et des données 3D entrelacées. + Continuez à lire ⥥⥥⥥ + """, + 'tweet_2': + """ + Ce modèle est disponible en versions 0.5B, 7B et 7B-DPO, toutes utilisables avec 🤗 Transformers 😍 + [Les modèles](https://t.co/sZsaglSXa3) | [Demo](https://t.co/FbpaMWJY8k) + Voir comment les utiliser ci-dessous👇🏻 + """, + 'tweet_3': + """ + Les auteurs ont explorer d'entraîner LLaVA-NeXT sur des données entrelacées où les données sont constituées de plusieurs modalités, y compris des images, des vidéos, de la 3D 📚. + Ils ont découvert que ces données augmentent les résultats de tous les benchmarks ! + """, + 'tweet_4': + """ + Le modèle peut transférer des tâches d'une image unique à des images multiples 🤯 + Les auteurs ont entraîné le modèle sur des images et des codes uniques, mais le modèle peut résoudre le codage avec des images multiples. + """, + 'tweet_5': + """ + La même chose s'applique à d'autres modalités, voir ci-dessous pour la vidéo : + """, + 'tweet_6': + """ + Le modèle possède également des capacités de compréhension des documents et a donc de nombreux domaines d'application dans le monde réel. 
+ """, + 'tweet_7': + """ + Les auteurs mettent également en ligne le jeu de données utilisé pour le finetuning : 📖 [M4-Instruct-Data](https://t.co/rutXMtNC0I) + """, + 'ressources': + """ + Ressources : + [LLaVA-NeXT: Tackling Multi-image, Video, and 3D in Large Multimodal Models](https://llava-vl.github.io/blog/2024-06-16-llava-next-interleave/) + de Feng Li, Renrui Zhang, Hao Zhang, Yuanhan Zhang, Bo Li, Wei Li, Zejun Ma, Chunyuan Li (2024) + [GitHub](https://github.com/LLaVA-VL/LLaVA-NeXT/blob/inference/docs/LLaVA-NeXT-Interleave.md) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/en/model_doc/llava_next) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/Llava-NeXT-Interleave/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + st.code(""" +import torch +from transformers import AutoProcessor, LlavaForConditionalGeneration + +model_id = "llava-hf/llava-interleave-qwen-7b-hf" +model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True).to("cuda") + +prompt = "<|im_start|>user \nWhat are these?|im_end|><|im_start|>assistant" +inputs = processor(prompt, raw_image, return_tensors='pt').to(0, torch.float16) + +output = model.generate(**inputs, max_new_tokens=200, do_sample=False) +print(processor.decode(output[0][2:], skip_special_tokens=True)) + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_7"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Llava-NeXT-Interleave/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("ColPali") + else: + if st.button('Papier précédent', use_container_width=True): + 
switch_page("ColPali") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Chameleon") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Chameleon") diff --git a/pages/25_Chameleon.py b/pages/25_Chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..8dab49ee70c4b615dc1b2e8d822c9d6275991632 --- /dev/null +++ b/pages/25_Chameleon.py @@ -0,0 +1,192 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Chameleon', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1814278511785312320) (July 19, 2024) + """, + 'tweet_1': + """ + Chameleon 🦎 by Meta is now available in 🤗 Transformers. + A multimodal model that comes in 7B and 34B sizes 🤩 + But what makes this model so special? Keep reading ⇣ + """, + 'tweet_2': + """ + [Demo](https://t.co/GsGE17fSdI) | [Models](https://t.co/cWUiVbsRz6) + Find below the API to load this model locally use it ⬇️ + """, + 'tweet_3': + """ + Chameleon is a unique model: it attempts to scale early fusion 🤨 + But what is early fusion? + Modern vision language models use a vision encoder with a projection layer to project image embeddings so it can be promptable to text decoder. + """, + 'tweet_4': + """ + Early fusion on the other hand attempts to fuse all features together (image patches and text) by using an image tokenizer and all tokens are projected into a shared space, which enables seamless generation 😏 + """, + 'tweet_5': + """ + Authors have also introduced different architectural improvements (QK norm and revise placement of layer norms) for scalable and stable training. + This way they were able to increase the token count (5x tokens compared to Llama 3 which is a must with early-fusion IMO) . + """, + 'tweet_6': + """ + This model is an any-to-any model thanks to early fusion: it can take image and text input and output image and text, but image generation are disabled to prevent malicious use. + """, + 'tweet_7': + """ + One can also do text-only prompting, authors noted the model catches up with larger LLMs, and you can also see how it compares to VLMs with image-text prompting. + """, + 'ressources': + """ + Ressources: + [Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818) + by Chameleon Team (2024) + [GitHub](https://github.com/facebookresearch/chameleon) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon) + """ + }, +'fr': { + 'title': 'Chameleon', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1814278511785312320) (en anglais) (19 juillet 2024) + """, + 'tweet_1': + """ + Chameleon 🦎 de Meta est désormais disponible dans 🤗 Transformers. + Un modèle multimodal qui se décline en tailles 7 Mds et 34 Mds de paramètres 🤩 + Mais qu'est-ce qui rend ce modèle si particulier ? Continuez à lire ⇣ + """, + 'tweet_2': + """ + [Demo](https://t.co/GsGE17fSdI) | [Modèles](https://t.co/cWUiVbsRz6) + Vous trouverez ci-dessous l'API permettant de charger ce modèle et de l'utiliser localement ⬇️ + """, + 'tweet_3': + """ + Chameleon is a unique model: it attempts to scale early fusion 🤨 + But what is early fusion? 
+ Modern vision language models use a vision encoder with a projection layer to project image embeddings so it can be promptable to text decoder. + + Chameleon est un modèle unique : il tente de mettre à l'échelle la fusion précoce 🤨 + Mais qu'est-ce que la fusion précoce ? + Les modèles de langage/vision modernes utilisent un encodeur de vision avec une couche de projection pour projeter des enchâssements d'images de manière à ce qu'ils puissent être transmis au décodeur de texte. + """, + 'tweet_4': + """ + La fusion précoce, quant à elle, tente de fusionner toutes les caractéristiques ensemble (patchs d'image et texte) en utilisant un tokenizer d'image et tous les tokens sont projetés dans un espace partagé, ce qui permet une génération homogène 😏 """, + 'tweet_5': + """ + Les auteurs ont également introduit différentes améliorations architecturales (norme QK et modification du placement des normalisations de couches) pour un entraînement passable à l'échelle et stable. + De cette manière, ils ont pu augmenter le nombre de tokens (5x plus par rapport à Llama 3, ce qui est indispensable avec la fusion précoce selon moi). + """, + 'tweet_6': + """ + Ce modèle est un modèle pouvant tout faire grâce à la fusion précoce : il peut prendre des images et du texte en entrée et produire des images et du texte en sortie, mais la génération d'images est désactivée afin d'éviter toute utilisation malveillante. + """, + 'tweet_7': + """ + Il est également possible d'utiliser des prompts textuels, les auteurs ont noté que le modèle rejoignait les LLM plus grands, et vous pouvez également voir comment il se compare aux VLM avec des prompts image-texte. """, + 'ressources': + """ + Ressources : + [Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://arxiv.org/abs/2405.09818) + de Chameleon Team (2024) + [GitHub](https://github.com/facebookresearch/chameleon) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/chameleon) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/Chameleon/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Chameleon/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Chameleon/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Chameleon/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Chameleon/image_4.jpg", use_column_width=True) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + 
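+with st.expander ("Code"):
+    # Hedged sketch, not from the original tweet: one way Chameleon could be loaded and prompted
+    # through 🤗 Transformers (the loading API mentioned above). Checkpoint name, dtype and
+    # generation settings are assumptions, double-check the model card before relying on them.
+    st.code("""
+import torch
+from PIL import Image
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
+
+processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
+model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda")
+
+# image + text prompt, the <image> token marks where the image goes
+image = Image.open("snowman.png")
+prompt = "What do you see in this image?<image>"
+inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+
+out = model.generate(**inputs, max_new_tokens=50)
+print(processor.decode(out[0], skip_special_tokens=True))
+    """)
+st.markdown(""" """)
+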
+st.image("pages/Chameleon/image_5.jpg", use_column_width=True) + +st.markdown(translations[lang]["tweet_7"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Chameleon/image_6.jpg", use_column_width=True) +st.image("pages/Chameleon/image_7.jpg", use_column_width=True) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Llava-NeXT-Interleave") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Llava-NeXT-Interleave") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Video-LLaVA") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Video-LLaVA") diff --git a/pages/26_Video-LLaVA.py b/pages/26_Video-LLaVA.py new file mode 100644 index 0000000000000000000000000000000000000000..734fa7759a52d7153c1f3ab08aaf901d404f5d48 --- /dev/null +++ b/pages/26_Video-LLaVA.py @@ -0,0 +1,191 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Video-LLaVA', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1816427325073842539) (July 25, 2024) + """, + 'tweet_1': + """ + We have recently merged Video-LLaVA to 🤗 Transformers! 🎞️ + What makes this model different? Keep reading ⇊ + """, + 'tweet_2': + """ + [Demo](https://t.co/MVP14uEj9e) | [Model](https://t.co/oqSCMUqwJo) + See below how to initialize the model and processor and infer ⬇️ + """, + 'tweet_3': + """ + Compared to other models that take image and video input and either project them separately or downsampling video and projecting selected frames, Video-LLaVA is converting images and videos to unified representation and project them using a shared projection layer. + """, + 'tweet_4': + """ + It uses Vicuna 1.5 as the language model and LanguageBind's own encoders that's based on OpenCLIP, these encoders project the modalities to an unified representation before passing to projection layer. + """, + 'tweet_5': + """ + I feel like one of the coolest features of this model is the joint understanding which is also introduced recently with many models. + It's a relatively older model but ahead of it's time and works very well! + """, + 'ressources': + """ + Ressources: + [Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122) + by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan (2023) + [GitHub](https://github.com/PKU-YuanGroup/Video-LLaVA) + [Hugging Face documentation](https://huggingface.co/docs/transformers/main/en/model_doc/video_llava) + """ + }, +'fr': { + 'title': 'Video-LLaVA', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1816427325073842539) (en anglais) (25 juillet 2024) + """, + 'tweet_1': + """ + Nous avons récemment intégré Video-LLaVA dans 🤗 Transformers ! 🎞️ + Qu'est-ce qui rend ce modèle différent ? 
Continuez à lire ⇊ + """, + 'tweet_2': + """ + [Demo](https://t.co/MVP14uEj9e) | [Modèle](https://t.co/oqSCMUqwJo) + Voir ci-dessous comment initialiser le modèle et le processeur puis inférer ⬇️ + """, + 'tweet_3': + """ + Par rapport à d'autres modèles qui prennent des images et des vidéos en entrée et les projettent séparément ou qui réduisent l'échantillonnage vidéo et projettent des images sélectionnées, Video-LLaVA convertit les images et les vidéos en une représentation unifiée et les projette à l'aide d'une couche de projection partagée. """, + 'tweet_4': + """ + Il utilise Vicuna 1.5 comme modèle de langage et les encodeurs de LanguageBind basés sur OpenCLIP. Ces encodeurs projettent les modalités vers une représentation unifiée avant de passer à la couche de projection. """, + 'tweet_5': + """ + J'ai l'impression que l'une des caractéristiques les plus intéressantes de ce modèle est la compréhension conjointe qui a été introduite récemment dans de nombreux modèles. + Il s'agit d'un modèle relativement ancien, mais il est en avance sur son temps et fonctionne très bien ! + """, + 'ressources': + """ + Ressources : + [Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122) + de Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan (2023) + [GitHub](https://github.com/PKU-YuanGroup/Video-LLaVA) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/main/en/model_doc/video_llava) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/Video-LLaVA/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + if lang == "en": + st.code(""" + from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor + import torch + + # load the model and processor + model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", torch_dtype-torch.float16, device_map="cuda") + processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-78-hf") + + # process inputs and infer + inputs = processor(text=prompt, videos=sampled_frames, return_tensors="pt") + generate_ids = model.generate(**inputs, max_tength=80) + out = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + # this is a non-exhaustive example to show the API, see model card for full inference + """) + else: + st.code(""" + from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor + import torch + + # chargement du modèle et du processeur + model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", torch_dtype-torch.float16, device_map="cuda") + processor = 
VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-78-hf") + + # traiter les entrées et inférer + inputs = processor(text=prompt, videos=sampled_frames, return_tensors="pt") + generate_ids = model.generate(**inputs, max_tength=80) + out = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + # Il s'agit d'un exemple non exhaustif pour montrer l'API, voir la carte de modèle pour l'inférence complète + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Video-LLaVA/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Chameleon") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Chameleon") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("SAMv2") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("SAMv2") diff --git a/pages/27_SAMv2.py b/pages/27_SAMv2.py new file mode 100644 index 0000000000000000000000000000000000000000..9c90ead6a40a1a88740bf5af5ce595c18da3354f --- /dev/null +++ b/pages/27_SAMv2.py @@ -0,0 +1,187 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'SAMv2', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1818675981634109701) (July 31, 2024) + """, + 'tweet_1': + """ + SAMv2 is just mindblowingly good 😍 + Learn what makes this model so good at video segmentation, keep reading 🦆⇓ + """, + 'tweet_2': + """ + Check out the [demo](https://t.co/35ixEZgPaf) by [skalskip92](https://x.com/skalskip92) to see how to use the model locally. + Check out Meta's [demo](https://t.co/Bcbli9Cfim) where you can edit segmented instances too! +
+ Segment Anything Model by Meta was released as a universal segmentation model in which you could give a box or point prompt to segment the object of interest. + SAM consists of an image encoder to encode images and a prompt encoder to encode prompts; the outputs of these two are given to a mask decoder to generate masks. + """, + 'tweet_3': + """ + However, SAM doesn't naturally track object instances in videos: one would need to repeat the same mask or point prompt for that instance in every single frame, which is infeasible 😔 + But don't fret, that is where SAMv2 comes in with a memory module! +
+ SAMv2 defines a new task called "masklet prediction", where a masklet refers to the same mask instance throughout the frames 🎞️ + Unlike SAM, the SAM 2 decoder is not fed the image embedding directly from the image encoder, but image features that have attended to memories of prompted frames and to object pointers. + """, + 'tweet_4': + """ + 🖼️ These "memories" are essentially past predictions of the object of interest over a number of recent frames, and they come in the form of spatial feature maps encoding location information. + 👉🏻 The object pointers are high-level semantic information about the object of interest. +
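+
+ As a purely conceptual sketch (a toy module with made-up shapes, not SAM 2's actual implementation), this conditioning can be pictured as cross-attention from the current frame's features to a bank of memory features and object pointers:
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class ToyMemoryAttention(nn.Module):
+     # illustration only: current-frame features cross-attend to past-frame memories + object pointers
+     def __init__(self, dim: int = 256, heads: int = 8):
+         super().__init__()
+         self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+
+     def forward(self, frame_feats, memory_feats, object_pointers):
+         # frame_feats: (B, N_patches, dim), memory_feats: (B, N_mem, dim), object_pointers: (B, N_obj, dim)
+         memory = torch.cat([memory_feats, object_pointers], dim=1)
+         conditioned, _ = self.attn(frame_feats, memory, memory)
+         return conditioned  # the mask decoder would consume these conditioned features
+
+ B, D = 1, 256
+ out = ToyMemoryAttention(D)(torch.randn(B, 256, D), torch.randn(B, 6 * 256, D), torch.randn(B, 16, D))
+ print(out.shape)  # torch.Size([1, 256, 256])
+ ```
+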
+ Just like the SAM paper, SAMv2 depends on a data engine, and the dataset it generated comes with the release: SA-V 🤯 + This dataset is gigantic: it has 190.9K manual masklet annotations and 451.7K automatic masklets! + """, + 'tweet_5': + """ + Initially they apply SAM to each frame to help human annotators annotate videos at six FPS for high-quality data; + in the second phase they add SAM and SAM2 to generate masklets consistently across time. Finally, they use SAM2 to refine the masklets. +
+ They have evaluated this model on J&F score (Jaccard Index + F-measure for contour acc) which is used to evaluate zero-shot + video segmentation benchmarks. + SAMv2 seems to outperform two previously sota models that are built on top of SAM! 🥹 + """, + 'ressources': + """ + Ressources: + [SAM 2: Segment Anything in Images and Videos](https://arxiv.org/abs/2408.00714) by Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman Rädle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollár, Christoph Feichtenhofer (2024) + [GitHub](https://github.com/facebookresearch/segment-anything-2) + [Models and Demos Collection](https://huggingface.co/collections/merve/sam2-66ac9deac6fca3bc5482fe30) + """ + }, +'fr': { + 'title': 'SAMv2', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1818675981634109701) (en anglais) (31 juillet 2024) + """, + 'tweet_1': + """ + SAMv2 est tout simplement époustouflant 😍 + Pour savoir ce qui rend ce modèle si performant en matière de segmentation vidéo, continuez à lire 🦆⇓ + """, + 'tweet_2': + """ + Consultez la [demo](https://t.co/35ixEZgPaf) de [skalskip92](https://x.com/skalskip92) pour voir comment utiliser le modèle localement. + Consultez la [demo](https://t.co/Bcbli9Cfim) de Meta où vous pouvez éditer des instances segmentées ! + + Le modèle Segment Anything de Meta a été lancé en tant que modèle de segmentation universel dans lequel vous pouvez prompter une boîte ou à un point pour segmenter l'objet d'intérêt. + SAM se compose d'un encodeur d'images pour encoder les images, d'un encodeur de prompt pour encoder les prompts, puis les sorties de ces deux encodeurs sont données à un décodeur masqué pour générer des masques. + """, + 'tweet_3': + """ + Cependant SAM ne traque pas les instances d'objets dans les vidéos, il faut s'assurer de demander le même masque ou le même point pour cette instance dans chaque image, ce qui est infaisable 😔. + Mais ne vous inquiétez pas, c'est là que SAMv2 intervient avec un module de mémoire ! + + SAMv2 définit une nouvelle tâche appelée "prédiction de masque". Ici le masque se réfère à la même instance de masque à travers les images 🎞️ + Contrairement à SAM, le décodeur SAM 2 n'est pas nourri par l'enchâssement de l'image issu de l'encodeur d'image, mais par l'attention des mémoires des images promptées/pointeurs d'objets. + """, + 'tweet_4': + """ + 🖼️ Ces "mémoires" sont essentiellement des prédictions passées de l'objet d'intérêt jusqu'à un certain nombre d'images récentes, et se présentent sous la forme de cartes de caractéristiques d'informations de localisation (cartes de caractéristiques spatiales). + 👉🏻 Les pointeurs d'objets sont des informations sémantiques de haut niveau sur l'objet d'intérêt. + + Tout comme SAM, SAMv2 dépend d'un moteur de données, et le jeu de données utilisé est fourni : SA-V 🤯 + Il est gigantesque, contenant 190,9K masques annotés manuellement et 451,7K automatiquement ! + """, + 'tweet_5': + """ + Dans un premier temps, les auteurs appliquent SAM à chaque image pour aider les annotateurs humains à annoter une vidéo de 6 FPS afin d'obtenir des données de haute qualité. Dans un deuxième temps, ils ajoutent SAM et SAM2 pour générer des masques de manière cohérente dans le temps. Enfin, ils utilisent SAM2 pour affiner les masques. 
+ + Ils ont évalué ce modèle sur le score J&F (indice de Jaccard et F-mesure pour la précision des contours) qui est utilisé dans les benchmarks de segmentation de vidéos 0-shot. + SAMv2 semble surpasser deux modèles précédemment à l'état de l'art qui sont construits sur SAM ! 🥹 + """, + 'ressources': + """ + Ressources : + [SAM 2: Segment Anything in Images and Videos](https://arxiv.org/abs/2408.00714) de Nikhila Ravi, Valentin Gabeur, Yuan-Ting Hu, Ronghang Hu, Chaitanya Ryali, Tengyu Ma, Haitham Khedr, Roman Rädle, Chloe Rolland, Laura Gustafson, Eric Mintun, Junting Pan, Kalyan Vasudev Alwala, Nicolas Carion, Chao-Yuan Wu, Ross Girshick, Piotr Dollár, Christoph Feichtenhofer (2024) + [GitHub](https://github.com/facebookresearch/segment-anything-2) + [Collection de modèles et démonstrateurs](https://huggingface.co/collections/merve/sam2-66ac9deac6fca3bc5482fe30) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +col1, col2, col3 = st.columns(3) +with col2: + st.video("pages/SAMv2/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SAMv2/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SAMv2/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SAMv2/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/SAMv2/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Video-LLaVA") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Video-LLaVA") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("NVEagle") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("NVEagle") diff --git a/pages/28_NVEagle.py b/pages/28_NVEagle.py new file mode 100644 index 0000000000000000000000000000000000000000..345a18b0fe32827fa2133990a8b33f27d8a9dad4 --- /dev/null +++ b/pages/28_NVEagle.py @@ -0,0 +1,165 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'NVEagle', + 'original_tweet': + """ + [Original 
tweet](https://x.com/mervenoyann/status/1829144958101561681) (August 29, 2024) + """, + 'tweet_1': + """ + NVIDIA just dropped NVEagle 🦅 + Super impressive vision language model that comes in 7B, 13B and 13B fine-tuned on chat, improved visual perception with MoE vision encoders 💬 + Keep reading for details and links ⇓ + """, + 'tweet_2': + """ + [Model repositories](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) | Try it [here](https://huggingface.co/spaces/NVEagle/Eagle-X5-13B-Chat) 💬 (works very well! 🤯) + """, + 'tweet_3': + """ + This model essentially explores having different experts (MoE) and fusion strategies for image encoders. + I have been talking about how VLMs improve when using multiple encoders in parallel, so seeing this paper MoE made me happy! 🥲 + """, + 'tweet_4': + """ + How? 🧐 + The authors concatenate the vision encoder output tokens together, and they apply "pre-alignment": essentially fine-tune experts with frozen text encoder. + Rest of the architecture is quite similar to LlaVA. + """, + 'tweet_5': + """ + Then they freeze both experts and the decoder and just train the projection layer, and finally, they unfreeze everything for supervised fine-tuning ✨ +
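+
+ As a toy sketch of the concatenation-based fusion of vision experts (the encoders and every dimension below are made up, not the paper's configuration):
+ ```python
+ import torch
+
+ # three hypothetical vision experts producing embeddings for the same patch grid
+ clip_tokens = torch.randn(1, 576, 1024)
+ convnext_tokens = torch.randn(1, 576, 768)
+ sam_tokens = torch.randn(1, 576, 256)
+
+ # concatenate along the channel dimension, then project into the (assumed) LLM embedding space
+ fused = torch.cat([clip_tokens, convnext_tokens, sam_tokens], dim=-1)  # (1, 576, 2048)
+ projector = torch.nn.Linear(fused.shape[-1], 4096)
+ visual_inputs = projector(fused)
+ print(visual_inputs.shape)  # torch.Size([1, 576, 4096])
+ ```
+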
+ They explore different fusion strategies and encoders, extending basic CLIP encoder, and find out that simply concatenating visual tokens works well 🥹 + See below the performances of different experts ⇓⇓ + """, + 'ressources': + """ + Ressources: + [Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders](https://www.arxiv.org/abs/2408.15998) + by Min Shi, Fuxiao Liu, Shihao Wang, Shijia Liao, Subhashree Radhakrishnan, De-An Huang, Hongxu Yin, Karan Sapra, Yaser Yacoob, Humphrey Shi, Bryan Catanzaro, Andrew Tao, Jan Kautz, Zhiding Yu, Guilin Liu (2024) + [GitHub](https://github.com/NVlabs/Eagle) + [Models and Demos Collection](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) + """ + }, +'fr': { + 'title': 'NVEagle', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1829144958101561681) (en anglais) (29 août 2024) + """, + 'tweet_1': + """ + NVIDIA vient de sortir NVEagle 🦅 + Un modèle langage-vision très impressionnant disponible en taille 7B, 13B et 13B, finetuné sur des données de chat. + Il dispose d'une perception visuelle améliorée via un mélange d'experts (MoE) d'encodeurs de vision 💬 + Continuez à lire pour plus de détails et des liens ⇓ + """, + 'tweet_2': + """ + [Répertoire des modèles](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) | [Essayez-le ici](https://huggingface.co/spaces/NVEagle/Eagle-X5-13B-Chat) 💬 (fonctionne très bien ! 🤯) + """, + 'tweet_3': + """ + Ce modèle explore le fait d'avoir différents experts et des stratégies de fusion pour les encodeurs d'images. + J'ai parlé de la façon dont les VLM s'améliorent lors de l'utilisation de plusieurs encodeurs en parallèle. Ce papier m'a ainsi rendu heureuse ! 🥲 + """, + 'tweet_4': + """ + Comment ? 🧐 + Les auteurs concatènent les tokens de sortie de l'encodeur de vision ensemble, et ils appliquent un « pré-alignement » : ils finetunent les experts avec un encodeur de texte gelé. Le reste de l'architecture est assez similaire à LlaVA. + """, + 'tweet_5': + """ + Ensuite, ils gèlent les experts et le décodeur et entraînent simplement la couche de projection. Finalement, ils dégèlent le tout pour un finetuning supervisé ✨ +
+ Ils explorent différentes stratégies de fusion et d'encodeurs, étendant l'encodeur CLIP de base, et découvrent que la simple concaténation de tokens visuels fonctionne bien 🥹 + Voir ci-dessous les performances de différents experts ⇓⇓ + """, + 'ressources': + """ + Ressources : + [Eagle: Exploring The Design Space for Multimodal LLMs with Mixture of Encoders](https://www.arxiv.org/abs/2408.15998) + de Min Shi, Fuxiao Liu, Shihao Wang, Shijia Liao, Subhashree Radhakrishnan, De-An Huang, Hongxu Yin, Karan Sapra, Yaser Yacoob, Humphrey Shi, Bryan Catanzaro, Andrew Tao, Jan Kautz, Zhiding Yu, Guilin Liu (2024) + [GitHub](https://github.com/NVlabs/Eagle) + [Models and Demos Collection](https://huggingface.co/collections/merve/nveagle-66d0705108582d73bb235c26) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVEagle/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVEagle/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVEagle/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVEagle/image_4.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("SAMv2") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("SAMv2") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("NVLM") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("NVLM") diff --git a/pages/29_NVLM.py b/pages/29_NVLM.py new file mode 100644 index 0000000000000000000000000000000000000000..0c9e583c96c32086c57871676523d642ec33185e --- /dev/null +++ b/pages/29_NVLM.py @@ -0,0 +1,167 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'NVLM', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1841098941900767323) (October 1st, 2024) + """, + 'tweet_1': + """ + NVIDIA just dropped a gigantic multimodal model called NVLM 72B 🦖 + Explaining everything from what I got of reading the paper here 📝 + """, + 'tweet_2': + """ + The 
paper contains many ablation studies on various ways to use the LLM backbone 👇🏻 + + 🦩 Flamingo-like cross-attention (NVLM-X) + 🌋 Llava-like concatenation of image and text embeddings to a decoder-only model (NVLM-D) + ✨ a hybrid architecture (NVLM-H) + """, + 'tweet_3': + """ + Checking evaluations, NVLM-D and NVLM-H are best or second best compared to other models 👏 + + The released model is NVLM-D based on Qwen-2 Instruct, aligned with InternViT-6B using a huge mixture of different datasets + """, + 'tweet_4': + """ + You can easily use this model by loading it through 🤗 Transformers' AutoModel 😍 + """, + 'ressources': + """ + Ressources: + [NVLM: Open Frontier-Class Multimodal LLMs](https://arxiv.org/abs/2409.11402) + by Wenliang Dai, Nayeon Lee, Boxin Wang, Zhuoling Yang, Zihan Liu, Jon Barker, Tuomas Rintamaki, Mohammad Shoeybi, Bryan Catanzaro, Wei Ping (2024) + [GitHub](https://nvlm-project.github.io/) + [Model](https://huggingface.co/nvidia/NVLM-D-72B) + """ + }, +'fr': { + 'title': 'NVLM', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1841098941900767323) (en anglais) (1er ocotbre 2024) + """, + 'tweet_1': + """ + NVIDIA vient de publier un gigantesque modèle multimodal appelé NVLM 72B 🦖 + J'explique tout ce que j'ai compris suite à la lecture du papier 📝 + """, + 'tweet_2': + """ + L'article contient de nombreuses études d'ablation sur les différentes façons d'utiliser le backbone 👇🏻 + + 🦩 Attention croisée de type Flamingo (NVLM-X) + 🌋 concaténation de type Llava d'embeddings d'images et de textes à un décodeur (NVLM-D) + ✨ une architecture hybride (NVLM-H) + """, + 'tweet_3': + """ + En vérifiant les évaluations, NVLM-D et NVLM-H sont les meilleurs ou les deuxièmes par rapport aux autres modèles 👏 + + Le modèle publié est NVLM-D basé sur Qwen-2 Instruct, aligné avec InternViT-6B en utilisant un énorme mélange de différents jeux de données. 
+ """, + 'tweet_4': + """ + Vous pouvez facilement utiliser ce modèle en le chargeant via AutoModel de 🤗 Transformers 😍 + """, + 'ressources': + """ + Ressources : + [NVLM: Open Frontier-Class Multimodal LLMs](https://arxiv.org/abs/2409.11402) + de Wenliang Dai, Nayeon Lee, Boxin Wang, Zhuoling Yang, Zihan Liu, Jon Barker, Tuomas Rintamaki, Mohammad Shoeybi, Bryan Catanzaro, Wei Ping (2024) + [GitHub](https://nvlm-project.github.io/) + [Modèle](https://huggingface.co/nvidia/NVLM-D-72B) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVLM/image_1.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVLM/image_2.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVLM/image_3.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/NVLM/image_4.png", use_column_width=True) +st.markdown(""" """) + +with st.expander ("Code"): + st.code(""" + import torch + from transformers import AutoModel + + path = "nvidia/NVLM-D-72B" + + model = AutoModel.from_pretrained( + path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + use_flash_attn=False, + trust_remote_code=True).eval() + """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("NVEagle") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("NVEagle") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("GOT") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("GOT") diff --git a/pages/2_Oneformer.py b/pages/2_Oneformer.py index c6b2996b0e8cec6324ee7313ab998633e47a931f..d92297eeeee928926ce7e9ef285b0d94bf3d5814 100644 --- a/pages/2_Oneformer.py +++ b/pages/2_Oneformer.py @@ -1,62 +1,178 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("OneFormer") -st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1739707076501221608) (December 26, 2023)""", icon="ℹ️") +translations = { +'en': {'title': 'OneFormer', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1739707076501221608) (December 26, 2023) + """, + 'tweet_1': + """ + OneFormer: one model to segment them all? 
🤯 + I was looking into paperswithcode leaderboards when I came across OneFormer for the first time so it was time to dig in! + """, + 'tweet_2': + """ + OneFormer is a "truly universal" model for semantic, instance and panoptic segmentation tasks ⚔️ + What makes is truly universal is that it's a single model that is trained only once and can be used across all tasks 👇 + """, + 'tweet_3': + """ + The enabler here is the text conditioning, i.e. the model is given a text query that states task type along with the appropriate input, and using contrastive loss, the model learns the difference between different task types 👇 + + """, + 'tweet_4': + """ + Thanks to 🤗 Transformers, you can easily use the model! + I have drafted a [notebook](https://t.co/cBylk1Uv20) for you to try right away 😊 + You can also check out the [Space](https://t.co/31GxlVo1W5) without checking out the code itself. + """, + 'ressources': + """ + Ressources: + [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) + by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022) + [GitHub](https://github.com/SHI-Labs/OneFormer) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/oneformer)""" + }, +'fr': { + 'title': 'OneFormer', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1739707076501221608) (en anglais) (26 décembre 2023) + """, + 'tweet_1': + """ + OneFormer : un seul modèle pour tous les segmenter ? 🤯 + Je regardais les classements de paperswithcode quand je suis tombée sur OneFormer pour la première fois. J'ai donc creusé les choses ! + """, + 'tweet_2': + """ + OneFormer est un modèle "véritablement universel" pour les tâches de segmentation sémantique, d'instance et panoptique ⚔️ + Ce qui le rend vraiment universel, c'est qu'il s'agit d'un modèle unique qui n'est entraîné qu'une seule fois et qui peut être utilisé pour toutes les tâches 👇 + """, + 'tweet_3': + """ + Le catalyseur ici est le conditionnement du texte, c'est-à-dire que le modèle reçoit une requête textuelle indiquant le type de tâche ainsi que l'entrée appropriée, et en utilisant la perte contrastive, le modèle apprend la différence entre les différents types de tâches 👇 """, + 'tweet_4': + """ + Grâce à 🤗 Transformers, vous pouvez facilement utiliser ce modèle ! + J'ai rédigé un [notebook](https://t.co/cBylk1Uv20) que vous pouvez essayer sans attendre 😊 + Vous pouvez également consulter le [Space](https://t.co/31GxlVo1W5) sans consulter le code lui-même. 
+ """, + 'ressources': + """ + Ressources : + [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) + de Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022) + [GitHub](https://github.com/SHI-Labs/OneFormer) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/oneformer) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown(""" -OneFormer: one model to segment them all? 🤯 -I was looking into paperswithcode leaderboards when I came across OneFormer for the first time so it was time to dig in! -""") +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OneFormer/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""OneFormer is a "truly universal" model for semantic, instance and panoptic segmentation tasks ⚔️ -What makes is truly universal is that it's a single model that is trained only once and can be used across all tasks 👇 -""") +st.markdown(translations[lang]["tweet_2"]) st.markdown(""" """) st.image("pages/OneFormer/image_2.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -The enabler here is the text conditioning, i.e. the model is given a text query that states task type along with the appropriate input, and using contrastive loss, the model learns the difference between different task types 👇 -""") +st.markdown(translations[lang]["tweet_3"]) st.markdown(""" """) st.image("pages/OneFormer/image_3.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""Thanks to 🤗 Transformers, you can easily use the model! -I have drafted a [notebook](https://t.co/cBylk1Uv20) for you to try right away 😊 -You can also check out the [Space](https://t.co/31GxlVo1W5) without checking out the code itself. 
-""") +st.markdown(translations[lang]["tweet_4"]) st.markdown(""" """) st.image("pages/OneFormer/image_4.jpeg", use_column_width=True) st.markdown(""" """) -st.info(""" -Ressources: -[OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) -by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi (2022) -[GitHub](https://github.com/SHI-Labs/OneFormer) -[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/oneformer)""", icon="📚") +with st.expander ("Code"): + if lang == "en": + st.code(""" + from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation + + # Loading a single model for all three tasks + processor = OneformerProcessor.from_pretrained("shi-Labs/oneformer_cityscapes_swin_large") + model = OneFormerForUniversalSegmentation.from_pretrained("shi-labs/oneformer_cityscapes_swin_large") + + # To get panoptic and instance segmentation results, swap task_inputs with "panoptic" or "instance" and use the appropriate post processing method + semantic_inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt") + semantic_outputs = model(**semantic_inputs) + + # pass through image_processor for postprocessing + predicted_semantic_map = processor.post_process_semantic_segmentation(semantic_outputs, target_sizes=[image.size[::-1]])[0] + """) + else: + st.code(""" + from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation + + # Chargement d'un seul modèle pour les trois tâches + processor = OneformerProcessor.from_pretrained("shi-Labs/oneformer_cityscapes_swin_large") + model = OneFormerForUniversalSegmentation.from_pretrained("shi-labs/oneformer_cityscapes_swin_large") + + # Pour avoir des résultats de segmentation panoptique ou par instance, remplacez task_inputs par "panoptic" ou "instance" et utilisez la méthode de post-traitement appropriée + semantic_inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt") + semantic_outputs = model(**semantic_inputs) + + # passage par image_processor pour le post-traitement + predicted_semantic_map = processor.post_process_semantic_segmentation(semantic_outputs, target_sizes=[image.size[::-1]])[0] + """) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("MobileSAM") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("MobileSAM") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("MobileSAM") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("VITMAE") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("VITMAE") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("VITMAE") \ No newline at end of file diff --git a/pages/30_GOT.py b/pages/30_GOT.py new file mode 100644 index 0000000000000000000000000000000000000000..1b264b37a6d01dca6ec597387f16633d49ffd346 --- /dev/null +++ b/pages/30_GOT.py @@ 
-0,0 +1,195 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'GOT', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1843278355749065084) (October 7, 2024) + """, + 'tweet_1': + """ + I'm bullish on this foundation OCR model called GOT 📝 + This model can transcribe anything and it's Apache-2.0! + Keep reading to learn more 🧶 + """, + 'tweet_2': + """ + This model can take in screenshots of tables/LaTeX and output formatted text, music sheets, charts, literally anything to meaningful format! + [Try it](https://huggingface.co/spaces/stepfun-ai/GOT_official_online_demo) + """, + 'tweet_3': + """ + This model has the same architecture as other vision language models 👀 Consists of an image encoder, projector and text decoder. +
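+
+ As a rough usage sketch (the checkpoint ships its own remote code; the chat() helper and its arguments are assumptions based on the model card, so double-check there):
+ ```python
+ from transformers import AutoModel, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("stepfun-ai/GOT-OCR2_0", trust_remote_code=True)
+ model = AutoModel.from_pretrained(
+     "stepfun-ai/GOT-OCR2_0",
+     trust_remote_code=True,
+     low_cpu_mem_usage=True,
+     device_map="cuda",
+     use_safetensors=True,
+     pad_token_id=tokenizer.eos_token_id,
+ ).eval()
+
+ # plain OCR on a screenshot; the model card also documents ocr_type="format" for formatted output
+ print(model.chat(tokenizer, "table_screenshot.png", ocr_type="ocr"))
+ ```
+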
+ What makes this model special in my opinion are two things: + 1. Diverse, high-quality data mixture (hence the data engine) + 2. Alignment technique + """, + 'tweet_4': + """ + The authors followed this recipe: + 🔥 pre-trained a vision encoder using OPT-125M + ✨ kept training the same encoder, added a new linear layer and Qwen-0.5B, and trained all the components + ❄️ finally froze the encoder and fine-tuned 👇🏻 + """, + 'tweet_5': + """ + Their training data, generated with the data engine, consists of: + 📝 plain OCR data + 📑 mathpix markdown (tables, LaTeX formulas, etc.) + 📊 charts (chart to JSON output) + 📐 geometric shapes (into TikZ) + 🎼 even sheet music + """, + 'tweet_6': + """ + The authors report various metrics, and despite its small size the model seems to be state-of-the-art on many benchmarks! + """, + 'ressources': + """ + Ressources: + [General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model](https://arxiv.org/abs/2409.01704) + by Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang (2024) + [GitHub](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/) + [Model](https://huggingface.co/stepfun-ai/GOT-OCR2_0) + """ + }, +'fr': { + 'title': 'GOT', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1843278355749065084) (en anglais) (7 octobre 2024) + """, + 'tweet_1': + """ + Je suis enthousiaste à propos de ce modèle d'OCR appelé GOT 📝 + Ce modèle peut transcrire n'importe quoi et il est Apache-2.0 ! + Continuez à lire pour en savoir plus 🧶 + """, + 'tweet_2': + """ + Ce modèle peut recevoir des captures d'écran de tableaux/LaTeX et produire du texte formaté, des partitions, des graphiques, littéralement tout ce qui peut être mis en forme ! + [Essayez-le](https://huggingface.co/spaces/stepfun-ai/GOT_official_online_demo) + """, + 'tweet_3': + """ + Ce modèle a la même architecture que d'autres modèles de langage-vision 👀 + Il se compose d'un encodeur d'images, d'un projecteur et d'un décodeur de texte. +
+ Ce qui rend ce modèle spécial à mon avis, ce sont deux choses : + 1. Mélange de données diversifiées et de haute qualité (donc moteur de données). + 2. Technique d'alignement + """, + 'tweet_4': + """ + Les auteurs ont suivi la recette suivante : + 🔥 pré-entraînement d'un encodeur de vision en utilisant OPT-125M + ✨ poursuite de l'entraînement du même encodeur, ajout d'une nouvelle couche linéaire et de Qwen-0.5B et entraînement de tous les composants + ❄️ enfin, ils figent l'encodeur et procèdent à un finetuning 👇🏻 + """, + 'tweet_5': + """ + Les données d'entraînement générées par le moteur sont : + 📝 des données OCR simples + 📑 des mathpix markdown (tableaux, formules LaTeX, etc.) + 📊 des graphiques (sortie des graphiques en JSON) + 📐 des formes géométriques (dans TikZ) + 🎼 des partitions de musique + """, + 'tweet_6': + """ + Les auteurs ont rapporté différentes métriques et il semble qu'en dépit de sa petite taille, le modèle soit SOTA dans de nombreux benchmarks ! + """, + 'ressources': + """ + Ressources : + [General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model](https://arxiv.org/abs/2409.01704) + de Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang (2024) + [GitHub](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/) + [Modèle](https://huggingface.co/stepfun-ai/GOT-OCR2_0) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/GOT/image_1.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/GOT/image_2.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/GOT/image_3.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/GOT/image_4.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/GOT/image_5.png", use_column_width=True) +st.markdown(""" """) + + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("NVLM") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("NVLM") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next 
paper", use_container_width=True): + switch_page("Aria") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Aria") diff --git a/pages/31_Aria.py b/pages/31_Aria.py new file mode 100644 index 0000000000000000000000000000000000000000..da91d4caa9aedca0f0b7c836363f6e7b566be0c9 --- /dev/null +++ b/pages/31_Aria.py @@ -0,0 +1,187 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'Aria', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1844356121370427546) (October 10, 2024) + """, + 'tweet_1': + """ + This is the BEST vision language model I have ever tried! +
+ Aria is a new model by @rhymes_ai_: a 25.3B multimodal model that can take image/video inputs 🤩 +
+ They release the model with Apache-2.0 license and fine-tuning scripts as well 👏 + I tested it extensively, keep reading to learn more 🧶 + """, + 'tweet_2': + """ + The model is open-sourced [here](huggingface.co/rhymes-ai/Aria) +
+ The authors have released fine-tuning examples on RefCOCO, NextQA and NLVR and [inference examples](github.com/rhymes-ai/Aria) +
+ Try the demo [here](rhymes.ai) +
+ It's super nice that you can get started with this model using 🤗 Transformers. + """, + 'tweet_3': + """ + I saw in the paper that it can debug screenshots of code??? 🤯 + So I tried it on a piece of code that calculates KL-div and it understood it very well! + """, + 'tweet_4': + """ + The model has very impressive OCR capabilities, even with bad handwriting 📝 + """, + 'tweet_5': + """ + Real world knowledge ⇓ + """, + 'ressources': + """ + Ressources: + [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://arxiv.org/abs/2410.05993) + by Dongxu Li, Yudong Liu, Haoning Wu, Yue Wang, Zhiqi Shen, Bowen Qu, Xinyao Niu, Guoyin Wang, Bei Chen, Junnan Li (2024) + [GitHub](https://github.com/rhymes-ai/Aria) + [Model](https://huggingface.co/rhymes-ai/Aria) + """ + }, +'fr': { + 'title': 'Aria', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1844356121370427546) (en anglais) (10 octobre 2024) + """, + 'tweet_1': + """ + C'est le MEILLEUR modèle de langage-vision que j'ai jamais essayé ! +
+ Aria est un nouveau modèle de @rhymes_ai_ : un modèle multimodal de 25,3Mds de paramètres qui peut prendre des images et des vidéos en entrée 🤩 +
+ Ils publient le modèle avec une licence Apache-2.0 et des scripts fine-tuning 👏 + Je l'ai testé en profondeur, continuez à lire pour en savoir plus 🧶 + """, + 'tweet_2': + """ + Le modèle est en libre accès [ici](huggingface.co/rhymes-ai/Aria) +
+ Les auteurs ont publié des exemples de finetuning sur RefCOCO, NextQA et NLVR et des [exemples d'inférence](github.com/rhymes-ai/Aria). +
+ Essayez la démo [ici](rhymes.ai) +
+ C'est super sympa de pouvoir utiliser avec ce modèle en utilisant 🤗 Transformers + """, + 'tweet_3': + """ + J'ai vu sur le papier qu'il pouvait déboguer des captures d'écran de code ? ??? 🤯 + J'ai donc essayé sur un bout de code qui calcule la divergence de Kullback-Leibler et il a très bien compris ! + """, + 'tweet_4': + """ + Le modèle possède des capacités d'OCR très impressionnantes, même avec une mauvaise écriture. 📝 + """, + 'tweet_5': + """ + Connaissance du monde réel ⇓ + """, + 'ressources': + """ + Ressources : + [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://arxiv.org/abs/2410.05993) + de Dongxu Li, Yudong Liu, Haoning Wu, Yue Wang, Zhiqi Shen, Bowen Qu, Xinyao Niu, Guoyin Wang, Bei Chen, Junnan Li (2024) + [GitHub](https://github.com/rhymes-ai/Aria) + [Model](https://huggingface.co/rhymes-ai/Aria) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.video("pages/Aria/video_1.mp4", format="video/mp4") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Aria/image_0.png", use_column_width=True) +st.markdown(""" """) +with st.expander ("Code"): + st.code(""" + from transformers import AutoModelForCausalLM, AutoProcessor + model_id_or_path = "rhymes-ai/Aria" + + model = AutoModelForCausalLM.from_pretrained(model_id_or_path, device_map="auto", torch_dtype=torch.bfloat16, trust_remote_code=True) + + processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True) + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Aria/image_1.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Aria/image_2.png", use_column_width=True) +st.image("pages/Aria/image_3.png", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/Aria/image_4.png", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("GOT") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("GOT") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Home") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Home") diff --git a/pages/3_VITMAE.py b/pages/3_VITMAE.py index 
582c71f3a0f4f3e77f71afbd658a3a18dfea9869..546d27d4cdab6648a247d31c846190f0790c1ae8 100644 --- a/pages/3_VITMAE.py +++ b/pages/3_VITMAE.py @@ -1,63 +1,150 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("VITMAE") -st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1740688304784183664) (December 29, 2023)""", icon="ℹ️") +translations = { +'en': {'title': 'VITMAE', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1740688304784183664) (December 29, 2023) + """, + 'tweet_1': + """ + Just read ViTMAE paper, sharing some highlights 🧶 + ViTMAE is a simply yet effective self-supervised pre-training technique, where authors combined vision transformer with masked autoencoder. + The images are first masked (75 percent of the image!) and then the model tries to learn about the features through trying to reconstruct the original image! + """, + 'tweet_2': + """ + The image is not masked, but rather only the visible patches are fed to the encoder (and that is the only thing encoder sees!). + Next, a mask token is added to where the masked patches are (a bit like BERT, if you will) and the mask tokens and encoded patches are fed to decoder. + The decoder then tries to reconstruct the original image. + """, + 'tweet_3': + """ + As a result, the authors found out that high masking ratio works well in fine-tuning for downstream tasks and linear probing 🤯🤯 + """, + 'tweet_4': + """ + If you want to try the model or fine-tune, all the pre-trained VITMAE models released released by Meta are available on [Huggingface](https://t.co/didvTL9Zkm). + We've built a [demo](https://t.co/PkuACJiKrB) for you to see the intermediate outputs and reconstruction by VITMAE. +
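+ If you'd rather do it locally, here is a minimal sketch for grabbing the reconstruction outputs (the checkpoint name and image URL are just placeholders; any ViTMAE checkpoint and RGB image should work):
+ ```python
+ import torch
+ import requests
+ from PIL import Image
+ from transformers import AutoImageProcessor, ViTMAEForPreTraining
+
+ # one of the pre-trained checkpoints released by Meta
+ processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
+ model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")
+
+ # placeholder image (two cats from COCO)
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ inputs = processor(images=image, return_tensors="pt")
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ # per-patch reconstruction and the random mask (1 = masked patch, 75% of patches by default)
+ print(outputs.logits.shape)  # (batch, num_patches, pixels_per_patch)
+ print(outputs.mask.shape)    # (batch, num_patches)
+ ```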
+ Also there's a nice [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) by [@NielsRogge](https://twitter.com/NielsRogge). + """, + 'ressources': + """ + Ressources: + [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3) + by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick (2021) + [GitHub](https://github.com/facebookresearch/mae) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/vit_mae)""" + }, +'fr': { + 'title': 'VITMAE', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1740688304784183664) (en anglais) (29 décembre 2023) + """, + 'tweet_1': + """ + Je viens de lire le papier du ViTMAE, voici quelques points marquants 🧶 + ViTMAE est une technique de pré-entraînement autosupervisée simple mais efficace, où les auteurs combinent un vision transformer avec un autoencodeur masqué. + Les images sont d'abord masquées (75 % de l'image !), puis le modèle tente d'apprendre les caractéristiques en reconstruisant l'image originale ! + """, + 'tweet_2': + """ + Techniquement l'image n'est pas masquée, seules les parties visibles sont transmises à l'encodeur (et c'est la seule chose qu'il voit !). + Ensuite, un token de masque est ajouté à l'endroit où se trouvent les patchs masqués (un peu comme BERT) et l'ensemble est transmis au décodeur. + Le décodeur tente alors de reconstruire l'image originale. + """, + 'tweet_3': + """ + Les auteurs ont constaté qu'un taux de masquage élevé fonctionnait bien pour le finetuning et l'échantillonage linéaire 🤯🤯. + """, + 'tweet_4': + """ + Si vous souhaitez essayer le modèle ou le finetuner, tous les poids pré-entraînés publiés par Meta sont disponibles sur [Huggingface](https://t.co/didvTL9Zkm). + Nous avons aussi créé une [demo](https://t.co/PkuACJiKrB) pour que vous puissiez voir les sorties intermédiaires et la reconstruction par le VITMAE. +
+ Vous pouvez aussi consulter le [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) de [@NielsRogge](https://twitter.com/NielsRogge). """, + 'ressources': + """ + Ressources : + [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3) + de Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick (2021) + [GitHub](https://github.com/facebookresearch/mae) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/vit_mae) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown("""Just read VitMAE paper, sharing some highlights 🧶 -ViTMAE is a simply yet effective self-supervised pre-training technique, where authors combined vision transformer with masked autoencoder. -The images are first masked (75 percent of the image!) and then the model tries to learn about the features through trying to reconstruct the original image! -""") +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/VITMAE/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""The image is not masked, but rather only the visible patches are fed to the encoder (and that is the only thing encoder sees!). -Next, a mask token is added to where the masked patches are (a bit like BERT, if you will) and the mask tokens and encoded patches are fed to decoder. -The decoder then tries to reconstruct the original image. -""") +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/VITMAE/image_2.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""As a result, the authors found out that high masking ratio works well in fine-tuning for downstream tasks and linear probing 🤯🤯 -""") +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/VITMAE/image_3.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""If you want to try the model or fine-tune, all the pre-trained VITMAE models released released by Meta are available on [Huggingface](https://t.co/didvTL9Zkm). -We've built a [demo](https://t.co/PkuACJiKrB) for you to see the intermediate outputs and reconstruction by VITMAE. - -Also there's a nice [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTMAE/ViT_MAE_visualization_demo.ipynb) by [@NielsRogge](https://twitter.com/NielsRogge). 
-""") +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/VITMAE/image_4.jpeg", use_column_width=True) st.markdown(""" """) -st.info(""" -Ressources: -[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377v3) -by LKaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick (2021) -[GitHub](https://github.com/facebookresearch/mae) -[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/vit_mae)""", icon="📚") +st.info(translations[lang]["ressources"], icon="📚") st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("OneFormer") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("OneFormer") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("OneFormer") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("DINOV2") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("DINOV2") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("DINOV2") \ No newline at end of file diff --git a/pages/4_DINOv2.py b/pages/4_DINOv2.py index 2d365f7eac441248d7329eea2a130143aad144cd..fb18535b18f623a3a2b010e7dba1a7b910fb3bc2 100644 --- a/pages/4_DINOv2.py +++ b/pages/4_DINOv2.py @@ -1,78 +1,176 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("DINOv2") -st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1743290724672495827) (January 5, 2024)""", icon="ℹ️") +translations = { +'en': {'title': 'DINOv2', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1743290724672495827) (January 5, 2024) + """, + 'tweet_1': + """ + DINOv2 is the king for self-supervised learning in images 🦖🦕 + But how does it work? I've tried to explain how it works but let's expand on it 🧶 + """, + 'tweet_2': + """ + DINOv2 is essentially DINO on steroids, so let's talk about DINOv1 first 🦕 + It's essentially a pre-training technique to train ViTs with self-supervision, that uses an unusual way of distillation 🧟‍♂️👨🏻‍🏫. + Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly. + Then during training the student, you take both models'outputs, calculate divergence between them and then update the loss accordingly. + In this case, we have no labels! And the teacher is not pretrained!!!! 🤯 + Well, the outputs here are the distributions, and teacher is iteratively updated according to student, which is called exponential moving average. + """, + 'tweet_3': + """ + DINO doesn't use any contrastive loss or clustering but only cross entropy loss (again, what a paper) which leads the model to collapse. + This can be avoided by normalizing the teacher output multiple times, but authors center (to squish logits) and sharpen (through temperature) the teacher outputs. 
+ Finally, local and global crops are given to student and only global crops are given to teacher and this sort of pushes student to identify context from small parts of the image. + """, + 'tweet_4': + """ + How does DINOv2 improve DINO? + ⚡️ More efficient thanks to FSDP and Flash Attention + 🦖 Has a very efficient data augmentation technique that apparently scales to 100M+ images (put below) + 👨🏻‍🏫 Uses ViT-g instead of training from scratch + """, + 'tweet_5': + """ + The model is so powerful that you can use DINOv2 even with knn or linear classifiers without need to fine-tuning! + But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using Trainer 📖 + He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only 📔 + All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dinoLastly). + Lastly, special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) 🤩 + """, + 'ressources': + """ + Ressources: + [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023) + [GitHub](https://github.com/facebookresearch/dinov2) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/dinov2)""" + }, +'fr': { + 'title': 'DINOv2', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1743290724672495827) (en anglais) (5 janvier 2024) + """, + 'tweet_1': + """ + DINOv2 is the king for self-supervised learning in images 🦖🦕 + But how does it work? I've tried to explain how it works but let's expand on it 🧶 + """, + 'tweet_2': + """ + DINOv2 is essentially DINO on steroids, so let's talk about DINOv1 first 🦕 + It's essentially a pre-training technique to train ViTs with self-supervision, that uses an unusual way of distillation 🧟‍♂️👨🏻‍🏫. + Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly. + Then during training the student, you take both models'outputs, calculate divergence between them and then update the loss accordingly. + In this case, we have no labels! And the teacher is not pretrained!!!! 🤯 + Well, the outputs here are the distributions, and teacher is iteratively updated according to student, which is called exponential moving average. + """, + 'tweet_3': + """ + DINO doesn't use any contrastive loss or clustering but only cross entropy loss (again, what a paper) which leads the model to collapse. + This can be avoided by normalizing the teacher output multiple times, but authors center (to squish logits) and sharpen (through temperature) the teacher outputs. 
+ Finally, local and global crops are given to student and only global crops are given to teacher and this sort of pushes student to identify context from small parts of the image. + """, + 'tweet_4': + """ + How does DINOv2 improve DINO? + ⚡️ More efficient thanks to FSDP and Flash Attention + 🦖 Has a very efficient data augmentation technique that apparently scales to 100M+ images (put below) + 👨🏻‍🏫 Uses ViT-g instead of training from scratch + """, + 'tweet_5': + """ + The model is so powerful that you can use DINOv2 even with knn or linear classifiers without need to fine-tuning! + But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using Trainer 📖 + He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only 📔 + All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dinoLastly). + Lastly, special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) 🤩 + """, + 'ressources': + """ + Ressources : + [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) de Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023) + [GitHub](https://github.com/facebookresearch/dinov2) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/dinov2) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown("""DINOv2 is the king for self-supervised learning in images 🦖🦕 -But how does it work? I've tried to explain how it works but let's expand on it 🧶 -""") +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/DINOv2/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -DINOv2 is essentially DINO on steroids, so let's talk about DINOv1 first 🦕 -It's essentially a pre-training technique to train ViTs with self-supervision, that uses an unusual way of distillation 🧟‍♂️👨🏻‍🏫. -Distillation is a technique where there's a large pre-trained model (teacher), and you have a smaller model (student) initialized randomly. -Then during training the student, you take both models'outputs, calculate divergence between them and then update the loss accordingly. 
-In this case, we have no labels! And the teacher is not pretrained!!!! 🤯 -Well, the outputs here are the distributions, and teacher is iteratively updated according to student, which is called exponential moving average. -""") +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/DINOv2/image_2.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -DINO doesn't use any contrastive loss or clustering but only cross entropy loss (again, what a paper) which leads the model to collapse. -This can be avoided by normalizing the teacher output multiple times, but authors center (to squish logits) and sharpen (through temperature) the teacher outputs. -Finally, local and global crops are given to student and only global crops are given to teacher and this sort of pushes student to identify context from small parts of the image. -""") +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/DINOv2/image_3.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""How does DINOv2 improve DINO? -⚡️ More efficient thanks to FSDP and Flash Attention -🦖 Has a very efficient data augmentation technique that apparently scales to 100M+ images (put below) -👨🏻‍🏫 Uses ViT-g instead of training from scratch -""") +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/DINOv2/image_4.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -The model is so powerful that you can use DINOv2 even with knn or linear classifiers without need to fine-tuning! -But if you'd like DINOv2 to work even better, [NielsRogge](https://twitter.com/NielsRogge) has built a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Fine\_tune\_DINOv2\_for\_image\_classification\_%5Bminimal%5D.ipynb) to fine-tune it using Trainer 📖 -He also has a [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DINOv2/Train\_a\_linear\_classifier\_on\_top\_of\_DINOv2\_for\_semantic\_segmentation.ipynb) if you feel like training a linear classifier only 📔 -All the different DINO/v2 model checkpoints are [here](https://huggingface.co/models?search=dinoLastly). 
-Lastly, special thanks to [ykilcher](https://twitter.com/ykilcher) as I couldn't make sense of certain things in the paper and watched his awesome [tutorial](https://youtube.com/watch?v=h3ij3F) 🤩 -""") +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) st.markdown(""" """) -st.info(""" -Ressources: -[DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) -by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski (2023) -[GitHub](https://github.com/facebookresearch/dinov2) -[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/dinov2)""", icon="📚") +st.info(translations[lang]["ressources"], icon="📚") st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("VITMAE") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("VITMAE") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("VITMAE") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("SigLIP") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("SigLIP") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("SigLIP") \ No newline at end of file diff --git a/pages/5_SigLIP.py b/pages/5_SigLIP.py index 0cba9330d032284b95a3b20e3585f1cc60c8a8f0..7cbc5a57960c1e88d6072a4a8e5c35b2831d60fc 100644 --- a/pages/5_SigLIP.py +++ b/pages/5_SigLIP.py @@ -1,78 +1,192 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("SigLIP") -st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1745476609686089800) (January 11. 2024)""", icon="ℹ️") +translations = { +'en': {'title': 'SigLIP', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1745476609686089800) (January 11. 2024) + """, + 'tweet_1': + """ + SigLIP just got merged to 🤗 Transformers and it's super easy to use! + To celebrate this, I have created a repository on various SigLIP based projects! + But what is it and how does it work? + SigLIP an vision-text pre-training technique based on contrastive learning. It jointly trains an image encoder and text encoder such that the dot product of embeddings are most similar for the appropriate text-image pairs. + The image below is taken from CLIP, where this contrastive pre-training takes place with softmax, but SigLIP replaces softmax with sigmoid. 📎 + """, + 'tweet_2': + """ + Highlights✨ + 🖼️📝 Authors used medium sized B/16 ViT for image encoder and B-sized transformer for text encoder + 😍 More performant than CLIP on zero-shot + 🗣️ Authors trained a multilingual model too! 
+ ⚡️ Super efficient, sigmoid is enabling up to 1M items per batch, but the authors chose 32k (see saturation on perf below) + """, + 'tweet_3': + """ + Below you can find prior CLIP models and SigLIP across different image encoder sizes and their performance on different datasets 👇🏻 + """, + 'tweet_4': + """ + With 🤗 Transformers integration there comes zero-shot-image-classification pipeline, makes SigLIP super easy to use! + """, + 'tweet_5': + """ + What to use SigLIP for? 🧐 + Honestly the possibilities are endless, but you can use it for image/text retrieval, zero-shot classification, training multimodal models! + I have made a [GitHub repository](https://t.co/Ah1CrHVuPY) with notebooks and applications that are also hosted on Spaces. + I have built ["Draw to Search Art"](https://t.co/DcmQWMc1qd) where you can input image (upload one or draw) and search among 10k images in wikiart! + I've also built apps to [compare](https://t.co/m699TMvuW9) CLIP and SigLIP outputs. + """, + 'ressources': + """ + Ressources: + [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023) + [GitHub](https://github.com/google-research/big_vision) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/siglip) + """ + }, +'fr': { + 'title': 'SigLIP', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1745476609686089800) (en anglais) (11 janvier 2024) + """, + 'tweet_1': + """ + SigLIP vient d'être ajouté à 🤗 Transformers et il est super facile à utiliser ! + Pour fêter cela, j'ai créé un dépôt sur différents projets utilisant SigLIP ! + Mais qu'est-ce que c'est et comment ça marche ? + SigLIP est une technique de pré-entraînement vision-texte basée sur l'apprentissage contrastif. On entraîne conjointement un encodeur d'image et un encodeur de texte de telle sorte que le produit scalaire des enchâssements soit le plus similaire possible pour les paires texte-image liées. + L'image ci-dessous est tirée de CLIP, où ce pré-entraînement contrastif est effectué avec une fonction Softmax, là où SigLIP utilise à la place une fonction Sigmoïde. 📎 + """, + 'tweet_2': + """ + Principaux faits✨ + 🖼️📝 Les auteurs ont utilisé un ViT B/16 pour l'encodeur d'images et un transformer B pour l'encodeur de texte + 😍 Plus performant que CLIP en zéro-shot + 🗣️ Les auteurs ont également entraîné un modèle multilingue ! + ⚡️ Super efficace, la sigmoïde permet de traiter jusqu'à 1M d'éléments par batch, mais les auteurs ont opté pour 32k (voir la saturation sur les performances ci-dessous) + """, + 'tweet_3': + """ + Vous trouverez ci-dessous les performances des modèles CLIP et SigLIP pour différentes tailles d'encodeurs d'images et leurs performances sur différents jeux de données 👇🏻 + """, + 'tweet_4': + """ + Avec l'intégration dans 🤗 Transformers, il est possible d'utiliser SigLIP très simplement via le pipeline de classification d'images en zéro-shot ! + """, + 'tweet_5': + """ + Pourquoi utiliser SigLIP ? 🧐 + Honnêtement, les possibilités sont infinies, mais vous pouvez l'utiliser pour la recherche d'images/de textes, la classification zéro-shot, l'entraînement de modèles multimodaux ! + J'ai créé un [dépôt GitHub]((https://t.co/Ah1CrHVuPY)) contenant des notebooks et des applications. 
+ Par exemple ["Draw to Search Art"](https://t.co/DcmQWMc1qd) où l'on peut saisir une image (en charger une ou bien la dessiner) et effectuer une recherche parmi les 10 000 images de wikiart ! + Ou encore une application pour [comparer](https://t.co/m699TMvuW9) les sorties CLIP et SigLIP. + """, + 'ressources': + """ + Ressources : + [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) de Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023) + [GitHub](https://github.com/google-research/big_vision) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/siglip) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown("""SigLIP just got merged to 🤗 Transformers and it's super easy to use! -To celebrate this, I have created a repository on various SigLIP based projects! -But what is it and how does it work? -SigLIP an vision-text pre-training technique based on contrastive learning. It jointly trains an image encoder and text encoder such that the dot product of embeddings are most similar for the appropriate text-image pairs. -The image below is taken from CLIP, where this contrastive pre-training takes place with softmax, but SigLIP replaces softmax with sigmoid. 📎 -""") +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/SigLIP/image_1.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Highlights✨ -🖼️📝 Authors used medium sized B/16 ViT for image encoder and B-sized transformer for text encoder -😍 More performant than CLIP on zero-shot -🗣️ Authors trained a multilingual model too! -⚡️ Super efficient, sigmoid is enabling up to 1M items per batch, but the authors chose 32k (see saturation on perf below) -""") +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/SigLIP/image_2.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Below you can find prior CLIP models and SigLIP across different image encoder sizes and their performance on different datasets 👇🏻 -""") +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/SigLIP/image_3.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -With 🤗 Transformers integration there comes zero-shot-image-classification pipeline, makes SigLIP super easy to use! -""") +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/SigLIP/image_4.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -What to use SigLIP for? 🧐 -Honestly the possibilities are endless, but you can use it for image/text retrieval, zero-shot classification, training multimodal models! -I have made a repository with notebooks and applications that are also hosted on [Spaces](https://t.co/Ah1CrHVuPY). 
-I have built ["Draw to Search Art"](https://t.co/DcmQWMc1qd) where you can input image (upload one or draw) and search among 10k images in wikiart! -I've also built apps to [compare](https://t.co/m699TMvuW9) CLIP and SigLIP outputs. -""") + +with st.expander ("Code"): + st.code(""" + from transformers import pipeline + + # pipeline + image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-256-multilingual") + + # inference + outputs = image_classifier( image, candidate_labels=["2 cats", "a plane", "a remote"]) + outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs] + print(outputs) + + # [{'score': 0.2157, 'label': '2 cats'}, {'score': 0.0001, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}] + """) +st.markdown(""" """) + + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/SigLIP/image_5.jpg", use_column_width=True) st.markdown(""" """) -st.info(""" -Ressources: -[Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) -by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer (2023) -[GitHub](https://github.com/google-research/big_vision) -[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/siglip)""", icon="📚") +st.info(translations[lang]["ressources"], icon="📚") + st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("DINOv2") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("DINOv2") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("DINOv2") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("OWLv2") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("OWLv2") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("OWLv2") \ No newline at end of file diff --git a/pages/6_OWLv2.py b/pages/6_OWLv2.py index 3fb749dacc41e7dd5213e1daf77dd2d64338703d..6a44bba3066889b71bb5ee1361d049c8fa1e9d32 100644 --- a/pages/6_OWLv2.py +++ b/pages/6_OWLv2.py @@ -1,87 +1,197 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("OWLv2") -st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1748411972675150040) (January 19, 2024)""", icon="ℹ️") -st.markdown(""" """) - -st.markdown("""Explaining the 👑 of zero-shot open-vocabulary object detection: OWLv2 🦉🧶""") +translations = { +'en': {'title': 'OWLv2', + 'original_tweet': + """ + [Original tweet](https://twitter.com/mervenoyann/status/1748411972675150040) (January 19, 2024) + """, + 'tweet_1': + """ + Explaining the 👑 of zero-shot open-vocabulary object detection: OWLv2 🦉🧶 + """, + 'tweet_2': + """ + OWLv2 is scaled version of a model called OWL-ViT, so let's take a look at that first 📝 + OWLViT is an open vocabulary object detector, meaning, it can detect objects it didn't explicitly see during the training 👀 + What's cool is that it can take both image and text queries! 
This is thanks to how the image and text features aren't fused together. + """, + 'tweet_3': + """ + Taking a look at the architecture, the authors firstly do contrastive pre-training of a vision and a text encoder (just like CLIP). + They take that model, remove the final pooling layer and attach a lightweight classification and box detection head and fine-tune. + """, + 'tweet_4': + """ + During fine-tuning for object detection, they calculate the loss over bipartite matches. + Simply put, loss is calculated over the predicted objects against ground truth objects and the goal is to find a perfect match of these two sets where each object is matched to one object in ground truth. +
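+ Before getting to scaling, here is what those free-form text queries look like in practice: a minimal sketch with the 🤗 zero-shot object detection pipeline (the checkpoint name, image URL and labels are just examples):
+ ```python
+ import requests
+ from PIL import Image
+ from transformers import pipeline
+
+ # an OWLv2 checkpoint used through the zero-shot object detection pipeline
+ detector = pipeline(task="zero-shot-object-detection", model="google/owlv2-base-patch16-ensemble")
+
+ # placeholder image (two cats and two remotes from COCO)
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ # the labels are free-form text: the model never saw an explicit class list during training
+ predictions = detector(image, candidate_labels=["cat", "remote control", "dog"])
+ for pred in predictions:
+     print(pred["label"], round(pred["score"], 3), pred["box"])
+ ```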
+ OWL-ViT is very scalable. + One can easily scale most language models or vision-language models because they require no supervision, but this isn't the case for object detection: you still need supervision. + Moreover, only scaling the encoders creates a bottleneck after a while. + """, + 'tweet_5': + """ + The authors wanted to scale OWL-ViT with more data, so they used OWL-ViT for labelling to train a better detector, "self-train" a new detector on the labels, and fine-tune the model on human-annotated data. + """, + 'tweet_6': + """ + Thanks to this, OWLv2 scaled very well and is tops leaderboards on open vocabulary object detection 👑 + """, + 'tweet_7': + """ + Want to try OWL models? + I've created a [notebook](https://t.co/ick5tA6nyx) for you to see how to use it with 🤗 Transformers. + If you want to play with it directly, you can use this [Space](https://t.co/oghdLOtoa5). + All the models and the applications of OWL-series is in this [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df). + """, + 'ressources': + """ + Ressources: + [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023) + [GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/owlv2) + """ + }, +'fr': { + 'title': 'OWLv2', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1748411972675150040) (en anglais) (19 janvier 2024) + """, + 'tweet_1': + """ + Explication du 👑 de la détection d'objets en zéro-shot à vocabulaire ouvert : OWLv2 🦉🧶 + """, + 'tweet_2': + """ + OWLv2 est une version passée à l'échelle d'un modèle appelé OWL-ViT, que nous allons donc examiner d'abord 📝 + OWLViT est un détecteur d'objets à vocabulaire ouvert, ce qui signifie qu'il peut détecter des objets qu'il n'a pas explicitement vus pendant l'entraînement 👀 + Ce qui est génial, c'est qu'il peut répondre à des requêtes d'images et de texte ! Cela est dû au fait que les caractéristiques de l'image et du texte ne sont pas fusionnées. + """, + 'tweet_3': + """ + Si l'on examine l'architecture, les auteurs procèdent tout d'abord à un pré-entraînement contrastif d'encodeurs de vision et de texte (comme pour CLIP). + Ils prennent ce modèle, suppriment la couche de pooling finale et ajoutent une tête de classification et de détection de boîtes, puis procèdent à un finetuning. """, + 'tweet_4': + """ + Lors du finetuning pour la détection d'objets, ils calculent la perte sur les correspondances bipartites. + Plus simplement, la perte est calculée sur les objets prédits par rapport aux objets de la vérité terrain et l'objectif est de trouver une correspondance parfaite entre ces deux ensembles où chaque objet correspond à un objet de la vérité terrain. +
+ OWL-ViT est fortement passable à l'échelle. + La plupart des modèles de langage ou des modèles vision-langage sont facilement extensibles car ils ne nécessitent pas de supervision, mais ce n'est pas le cas pour la détection d'objets : une supervision est toujours nécessaire. + De plus, la seule mise à l'échelle des encodeurs crée un goulot d'étranglement au bout d'un certain temps. + """, + 'tweet_5': + """ + Les auteurs souhaitaient faire passer à l'échelle OWL-ViT avec davantage de données. Ils l'ont donc utilisé pour labéliser des données afin d'entraîner un meilleur détecteur sur ces labels. Puis ils ont finetuné le modèle sur des données annotées par des humains. + """, + 'tweet_6': + """ + Grâce à cela, OWLv2 est en tête des classements sur la détection d'objets à vocabulaire ouvert 👑 + """, + 'tweet_7': + """ + Vous voulez essayer les modèles OWL ? + J'ai créé un [notebook](https://t.co/ick5tA6nyx) pour que vous puissiez voir comment l'utiliser avec 🤗 Transformers. + Si vous voulez jouer avec directement, vous pouvez utiliser ce [Space](https://t.co/oghdLOtoa5). + Tous les modèles et les applications de la série OWL se trouvent dans cette [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df). + """, + 'ressources' : + """ + Ressources: + [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) de Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023) + [GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/owlv2) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OWLv2/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -OWLv2 is scaled version of a model called OWL-ViT, so let's take a look at that first 📝 -OWLViT is an open vocabulary object detector, meaning, it can detect objects it didn't explicitly see during the training 👀 -What's cool is that it can take both image and text queries! This is thanks to how the image and text features aren't fused together. -""") +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OWLv2/image_2.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""Taking a look at the architecture, the authors firstly do contrastive pre-training of a vision and a text encoder (just like CLIP). -They take that model, remove the final pooling layer and attach a lightweight classification and box detection head and fine-tune. -""") +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OWLv2/image_3.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""During fine-tuning for object detection, they calculate the loss over bipartite matches. 
-Simply put, loss is calculated over the predicted objects against ground truth objects and the goal is to find a perfect match of these two sets where each object is matched to one object in ground truth. - -OWL-ViT is very scalable. -One can easily scale most language models or vision-language models because they require no supervision, but this isn't the case for object detection: you still need supervision. -Moreover, only scaling the encoders creates a bottleneck after a while. -""") +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OWLv2/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -The authors wanted to scale OWL-ViT with more data, so they used OWL-ViT for labelling to train a better detector, "self-train" a new detector on the labels, and fine-tune the model on human-annotated data. -""") +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OWLv2/image_4.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Thanks to this, OWLv2 scaled very well and is tops leaderboards on open vocabulary object detection 👑 -""") +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/OWLv2/image_5.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Want to try OWL models? -I've created a [notebook](https://t.co/ick5tA6nyx) for you to see how to use it with 🤗 Transformers. -If you want to play with it directly, you can use this [Space](https://t.co/oghdLOtoa5). -All the models and the applications of OWL-series is in this [collection](https://huggingface.co/collections/merve/owl-series-65aaac3114e6582c300544df). -""") +st.markdown(translations[lang]["tweet_7"], unsafe_allow_html=True) st.markdown(""" """) -st.info(""" -Ressources: -[Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) -by Matthias Minderer, Alexey Gritsenko, Neil Houlsby (2023) -[GitHub](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit) -[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/owlv2)""", icon="📚") + +st.info(translations[lang]["ressources"], icon="📚") + st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("SigLIP") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("SigLIP") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("SigLIP") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("Backbone") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Backbone") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Backbone") \ No newline at end of file diff --git a/pages/7_Backbone.py b/pages/7_Backbone.py index a3c4db92e123f3651f8c9f2969a6dcedcbc7a4a4..5c7d5fcd2d747731db6703c84ac9636e42307b79 100644 --- a/pages/7_Backbone.py +++ b/pages/7_Backbone.py @@ -1,63 +1,233 @@ import streamlit as st from 
streamlit_extras.switch_page_button import switch_page -st.title("Backbone") -st.success("""[Original tweet](https://x.com/mervenoyann/status/1749841426177810502) (January 23, 2024)""", icon="ℹ️") +translations = { +'en': {'title': 'Backbone', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1749841426177810502) (January 23, 2024) + """, + 'tweet_1': + """ + Many cutting-edge computer vision models consist of multiple stages: + ➰ backbone extracts the features, + ➰ neck refines the features, + ➰ head makes the detection for the task. + Implementing this is cumbersome, so 🤗 Transformers has an API for this: Backbone! + """, + 'tweet_2': + """ + Let's see an example of such model. + Assuming we would like to initialize a multi-stage instance segmentation model with ResNet backbone and MaskFormer neck and a head, you can use the backbone API like following (left comments for clarity) 👇 + """, + 'tweet_3': + """ + One can also use a backbone just to get features from any stage. You can initialize any backbone with `AutoBackbone` class. + See below how to initialize a backbone and getting the feature maps at any stage 👇 + """, + 'tweet_4': + """ + Backbone API also supports any timm backbone of your choice! Check out a variation of timm backbones [here](https://t.co/Voiv0QCPB3). + """, + 'tweet_5': + """ + Leaving some links 🔗 + 📖 I've created a [notebook](https://t.co/PNfmBvdrtt) for you to play with it + 📒 [Backbone API docs](https://t.co/Yi9F8qAigO) + 📓 [AutoBackbone docs](https://t.co/PGo9oILHDw) (all written with love by me!💜) + """ + }, +'fr': { + 'title': 'Backbone', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1749841426177810502) (en anglais) (23 janvier 2024) + """, + 'tweet_1': + """ + De nombreux modèles de vision par ordinateur de pointe se composent de plusieurs étapes : + ➰ le backbone extrayant les caractéristiques, + ➰ le cou affinant les caractéristiques, + ➰ la tête effectuant la détection pour la tâche. + L'implémentation est lourde, c'est pourquoi 🤗 Transformers dispose d'une API pour faire tout cela : Backbone ! + """, + 'tweet_2': + """ + Voyons un exemple de ce type de modèle. + En supposant que nous souhaitions initialiser un modèle de segmentation d'instance à plusieurs étapes avec un ResNet comme backbone et un MaskFormer pour le cou et une tête, vous pouvez utiliser l'API Backbone comme suit (j'ai laissé des commentaires pour plus de clarté) 👇 """, + 'tweet_3': + """ + Il est également possible d'utiliser un backbone pour obtenir des fonctionnalités à partir de n'importe quelle étape. + Vous pouvez initialiser n'importe quel backbone avec la classe `AutoBackbone`. + Voir ci-dessous comment initialiser un backbone et obtenir les cartes de caractéristiques à n'importe quel étape 👇 + """, + 'tweet_4': + """ + L'API Backbone prend également en charge n'importe quel backbone de la bibliotque Timm ! Découvrez la liste des backbones disponibles dans Timm [ici](https://t.co/Voiv0QCPB3). """, + 'tweet_5': + """ + Quelques liens utiles (rédigés avec amour par moi !💜) 🔗 + 📖 J'ai créé un [notebook](https://t.co/PNfmBvdrtt) pour que vous puissiez jouer avec. 
+ 📒 [La documentation de l'API Backbone API](https://t.co/Yi9F8qAigO) + 📓 [La documentation AutoBackbone](https://t.co/PGo9oILHDw) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown("""Many cutting-edge computer vision models consist of multiple stages: -➰ backbone extracts the features, -➰ neck refines the features, -➰ head makes the detection for the task. -Implementing this is cumbersome, so 🤗 Transformers has an API for this: Backbone! -""") +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Backbone/image_1.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Let's see an example of such model. -Assuming we would like to initialize a multi-stage instance segmentation model with ResNet backbone and MaskFormer neck and a head, you can use the backbone API like following (left comments for clarity) 👇 -""") +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Backbone/image_2.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown("""One can also use a backbone just to get features from any stage. You can initialize any backbone with `AutoBackbone` class. -See below how to initialize a backbone and getting the feature maps at any stage 👇 -""") +with st.expander ("Code"): + if lang == "en": + st.code(""" +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig + +# initialize backbone config +backbone_config = ResNetConfig. from_pretrained("microsoft/resnet-50") + +# initialize neck config with backbone config +config = MaskFormerConfig(backbone_config=backbone_config) + +# initialize the head using combined config +model = MaskFormerForInstanceSegmentation(config) + """) + else: + st.code(""" +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig + +# initialiser la configuration du backbone +backbone_config = ResNetConfig. from_pretrained("microsoft/resnet-50") + +# initialiser la configuration du cou avec la configuration du backbone +config = MaskFormerConfig(backbone_config=backbone_config) + +# initialiser la tête avec la configuration combinée +model = MaskFormerForInstanceSegmentation(config) + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Backbone/image_3.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Backbone API also supports any timm backbone of your choice! Check out a variation of timm backbones [here](https://t.co/Voiv0QCPB3). -""") +with st.expander ("Code"): + if lang == "en": + st.code(""" +from transformers import AutoImageProcessor, AutoBackbone +import torch + +# initialize backbone and processor +processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") +model = AutoBackbone. 
from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(0,1,2)) + +# pass inputs through the processor and model +inputs = processor(image, return_tensors="pt") +outputs = model(**inputs ) +feature_maps = outputs.feature_maps + +# get feature maps from stem +list(feature_maps[0].shape) +# >>> [1, 96, 56, 56] + +# get feature maps of first stage +list(feature_maps[1].shape) +# >>> [1, 96, 56, 56] + """) + else: + st.code(""" +from transformers import AutoImageProcessor, AutoBackbone +import torch + +# initialiser le backbone et le processeur +processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") +model = AutoBackbone. from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(0,1,2)) + +# passer les entrées par le processeur et le modèle +inputs = processor(image, return_tensors="pt") +outputs = model(**inputs ) +feature_maps = outputs.feature_maps + +# obtenir des cartes de caractéristiques [0] +list(feature_maps[0].shape) +# >>> [1, 96, 56, 56] + +# obtenir des cartes de caractéristiques [1] +list(feature_maps[1].shape) +# >>> [1, 96, 56, 56] + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Backbone/image_4.jpeg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -Leaving some links 🔗 -📖 I've created a [notebook](https://t.co/PNfmBvdrtt) for you to play with it -📒 [Backbone API docs](https://t.co/Yi9F8qAigO) -📓 [AutoBackbone docs](https://t.co/PGo9oILHDw) (all written with love by me!💜)""") +with st.expander ("Code"): + st.code(""" +from transformers import TimmBackboneConfig, TimmBackbone + +backbone_config = TimmBackboneConfig("resnet50") +model = TimmBackbone(config=backbone_config) + """) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) + st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("OWLv2") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("OWLv2") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("OWLv2") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("Depth Anything") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("Depth Anything") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("Depth Anything") \ No newline at end of file diff --git a/pages/8_Depth_Anything.py b/pages/8_Depth_Anything.py index fd944ba0c6d0bfdb34b3a9410ec55dcf056f7c4f..28fcea1a4ed4fccbf02828adb12857d172e80a77 100644 --- a/pages/8_Depth_Anything.py +++ b/pages/8_Depth_Anything.py @@ -1,100 +1,370 @@ import streamlit as st from streamlit_extras.switch_page_button import switch_page -st.title("Depth Anything") -st.success("""[Original tweet](https://twitter.com/mervenoyann/status/1750531698008498431) (January 25, 2024)""", icon="ℹ️") -st.markdown(""" """) +translations = { +'en': {'title': 'Depth Anything', + 'original_tweet': + """ + [Original 
tweet](https://twitter.com/mervenoyann/status/1750531698008498431) (January 25, 2024) + """, + 'tweet_1': + """ + Explaining a new state-of-the-art monocular depth estimation model: Depth Anything ✨🧶 + It has just been integrated in 🤗 Transformers for super-easy use. + We compared it against DPTs and benchmarked it as well! You can find the usage, benchmark, demos and more below 👇 + """, + 'tweet_2': + """ + The paper starts by highlighting previous depth estimation methods and their limitations regarding data coverage. 👀 + The model's success heavily depends on unlocking the use of unlabeled datasets, although initially the authors used self-training and failed. +
+ What the authors have done: + ➰ Train a teacher model on a labelled dataset. + ➰ Guide the student using the teacher, and also use unlabelled datasets pseudo-labelled by the teacher. However, this was the cause of the failure: since both architectures were similar, the outputs were the same. + """, + 'tweet_3': + """ + So the authors have added a more difficult optimization target for the student to learn additional knowledge from unlabeled images that went through color jittering, distortions, Gaussian blurring and spatial distortion, so it can learn more invariant representations from them. +
+ The architecture consists of a DINOv2 encoder to extract the features, followed by a DPT decoder. At first, they train the teacher model on labelled images, and then they jointly train the student model and add in the dataset pseudo-labelled by ViT-L. + """, + 'tweet_4': + """Thanks to this, Depth Anything performs very well! I have also benchmarked the inference duration of the model against different models here. I also ran `torch.compile` benchmarks across them and got nice speed-ups 🚀 +
+ On a T4 GPU, mean of 30 inferences for each model. Inferred using `pipeline` (pre-processing and post-processing included with model inference). +
+ <table>
+ <thead>
+ <tr><th>Model/Batch Size</th><th>16</th><th>4</th><th>1</th></tr>
+ </thead>
+ <tbody>
+ <tr><td>intel/dpt-large</td><td>2709.652</td><td>667.799</td><td>172.617</td></tr>
+ <tr><td>facebook/dpt-dinov2-small-nyu</td><td>2534.854</td><td>654.822</td><td>159.754</td></tr>
+ <tr><td>facebook/dpt-dinov2-base-nyu</td><td>4316.8733</td><td>1090.824</td><td>266.699</td></tr>
+ <tr><td>Intel/dpt-beit-large-512</td><td>7961.386</td><td>2036.743</td><td>497.656</td></tr>
+ <tr><td>depth-anything-small</td><td>1692.368</td><td>415.915</td><td>143.379</td></tr>
+ </tbody>
+ </table>
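+
+ As a rough illustration (not the exact benchmark script), the single-image timing could be reproduced along these lines; the GPU index, warm-up count and image path below are placeholders:
+
+ ```python
+ import time
+ from PIL import Image
+ from transformers import pipeline
+
+ # hypothetical example for one of the checkpoints above
+ pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf", device=0)
+ image = Image.open("example.jpg")  # placeholder image path
+
+ for _ in range(3):  # warm-up runs
+     _ = pipe(image)
+
+ start = time.perf_counter()
+ for _ in range(30):  # mean of 30 inferences
+     _ = pipe(image)
+ print(f"{(time.perf_counter() - start) / 30 * 1000:.3f} ms per image")
+ ```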
+ """, + 'tweet_5': + """ + `torch.compile`’s benchmarks with reduce-overhead mode: we have compiled the model and loaded it to the pipeline for the benchmarks to be fair. -st.markdown("""Explaining a new state-of-the-art monocular depth estimation model: Depth Anything ✨🧶 -It has just been integrated in transformers for super-easy use. -We compared it against DPTs and benchmarked it as well! You can find the usage, benchmark, demos and more below 👇 -""") -st.markdown(""" """) +
+ <table>
+ <thead>
+ <tr><th>Model/Batch Size</th><th>16</th><th>4</th><th>1</th></tr>
+ </thead>
+ <tbody>
+ <tr><td>intel/dpt-large</td><td>2556.668</td><td>645.750</td><td>155.153</td></tr>
+ <tr><td>facebook/dpt-dinov2-small-nyu</td><td>2415.25</td><td>610.967</td><td>148.526</td></tr>
+ <tr><td>facebook/dpt-dinov2-base-nyu</td><td>4057.909</td><td>1035.672</td><td>245.692</td></tr>
+ <tr><td>Intel/dpt-beit-large-512</td><td>7417.388</td><td>1795.882</td><td>426.546</td></tr>
+ <tr><td>depth-anything-small</td><td>1664.025</td><td>384.688</td><td>97.865</td></tr>
+ </tbody>
+ </table>
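+
+ One way the `torch.compile` runs could be set up (a sketch, assuming the compiled model is simply placed back into the same pipeline):
+
+ ```python
+ import torch
+ from transformers import pipeline
+
+ pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf", device=0)
+ # compile the underlying model with reduce-overhead mode, then time pipe(image) as before
+ pipe.model = torch.compile(pipe.model, mode="reduce-overhead")
+ ```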
+ """, + 'tweet_6': + """ + You can use Depth Anything easily thanks to 🤗 Transformers with three lines of code! ✨ + We have also built an app for you to [compare different depth estimation models](https://t.co/6uq4osdwWG) 🐝 🌸 + See all the available Depth Anything checkpoints [here](https://t.co/Ex0IIyx7XC). + """, + 'ressources': + """ + Ressources: + [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) + by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) + [GitHub](https://github.com/LiheYoung/Depth-Anything) + [Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything) + """ + }, +'fr': { + 'title': 'Depth Anything', + 'original_tweet': + """ + [Tweet de base](https://twitter.com/mervenoyann/status/1750531698008498431) (en anglais) (25 janvier 2024) + """, + 'tweet_1': + """ + Explication d'un nouveau modèle à l'état de l'art pour l'estimation de la profondeur monoculaire : Depth Anything ✨🧶 + Il vient d'être intégré dans 🤗 Transformers pour une utilisation super-facile. + Nous l'avons comparé aux DPTs et l'avons benchmarké ! Vous pouvez trouver l'utilisation, le benchmark, les démos et plus encore ci-dessous 👇 + """, + 'tweet_2': + """ + Le papier commence par aborder les points forts et faibles des précédentes méthodes d'estimation de la profondeur. 👀 + Le succès du modèle dépend fortement de l'utilisation de jeux de données non étiquetés, bien qu'initialement les auteurs aient utilisé l'auto-apprentissage et aient échoué. +
+ Ce que les auteurs ont fait : + ➰ Entraîner un modèle enseignant sur un jeu de données étiquetées. + ➰ Guider le modèle étudiant à l'aide de l'enseignant ainsi qu'utiliser des jeux de données non étiquetés pseudo-étiquetés par l'enseignant. Cependant, il s'avère que c'est la cause de l'échec : les deux architectures étant similaires, les sorties étaient les mêmes. """, + 'tweet_3': + """ + Les auteurs ont donc ajouté un objectif d'optimisation plus difficile pour que l'étudiant apprenne des connaissances supplémentaires sur des images non étiquetées qui ont subi des changements de couleur, des distorsions, un flou gaussien et des distorsions spatiales, afin qu'il puisse apprendre des représentations davantage invariantes à partir de ces images. +
+ L'architecture consiste en un encodeur DINOv2 pour extraire les caractéristiques, suivi d'un décodeur DPT. Dans un premier temps, ils entraînent le modèle enseignant sur des images étiquetées, puis ils entraînent conjointement le modèle étudiant et ajoutent le jeu de données pseudo-étiqueté par ViT-L. + """, + 'tweet_4': + """ + Grâce à cela, le modèle Depth Anything fonctionne très bien ! J'ai également comparé la durée d'inférence du modèle avec d'autres modèles (avec et sans `torch.compile` qui permet de belles accélérations) 🚀 +
+ Sur un GPU T4, moyenne de 30 inférences pour chaque modèle. Inféré en utilisant `pipeline` (pré-traitement et post-traitement inclus avec l'inférence du modèle). +
+ <table>
+ <thead>
+ <tr><th>Modèle/Taille du batch</th><th>16</th><th>4</th><th>1</th></tr>
+ </thead>
+ <tbody>
+ <tr><td>intel/dpt-large</td><td>2709.652</td><td>667.799</td><td>172.617</td></tr>
+ <tr><td>facebook/dpt-dinov2-small-nyu</td><td>2534.854</td><td>654.822</td><td>159.754</td></tr>
+ <tr><td>facebook/dpt-dinov2-base-nyu</td><td>4316.8733</td><td>1090.824</td><td>266.699</td></tr>
+ <tr><td>Intel/dpt-beit-large-512</td><td>7961.386</td><td>2036.743</td><td>497.656</td></tr>
+ <tr><td>depth-anything-small</td><td>1692.368</td><td>415.915</td><td>143.379</td></tr>
+ </tbody>
+ </table>
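+
+ À titre d'illustration (ce n'est pas le script exact du benchmark), le chronométrage pour une seule image peut être reproduit à peu près ainsi ; l'index du GPU, le nombre d'itérations de chauffe et le chemin de l'image sont des valeurs fictives :
+
+ ```python
+ import time
+ from PIL import Image
+ from transformers import pipeline
+
+ # exemple hypothétique pour l'un des checkpoints ci-dessus
+ pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf", device=0)
+ image = Image.open("exemple.jpg")  # chemin d'image fictif
+
+ for _ in range(3):  # itérations de chauffe
+     _ = pipe(image)
+
+ start = time.perf_counter()
+ for _ in range(30):  # moyenne de 30 inférences
+     _ = pipe(image)
+ print(f"{(time.perf_counter() - start) / 30 * 1000:.3f} ms par image")
+ ```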
+ """, + 'tweet_5': + """ + Les benchmarks de `torch.compile` avec le mode reduce-overhead : nous avons compilé le modèle et l'avons chargé dans le pipeline pour que les benchmarks soient équitables. -st.video("pages/Depth Anything/video_1.mp4", format="video/mp4") -st.markdown(""" """) +
+ <table>
+ <thead>
+ <tr><th>Modèle/Taille du batch</th><th>16</th><th>4</th><th>1</th></tr>
+ </thead>
+ <tbody>
+ <tr><td>intel/dpt-large</td><td>2556.668</td><td>645.750</td><td>155.153</td></tr>
+ <tr><td>facebook/dpt-dinov2-small-nyu</td><td>2415.25</td><td>610.967</td><td>148.526</td></tr>
+ <tr><td>facebook/dpt-dinov2-base-nyu</td><td>4057.909</td><td>1035.672</td><td>245.692</td></tr>
+ <tr><td>Intel/dpt-beit-large-512</td><td>7417.388</td><td>1795.882</td><td>426.546</td></tr>
+ <tr><td>depth-anything-small</td><td>1664.025</td><td>384.688</td><td>97.865</td></tr>
+ </tbody>
+ </table>
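+
+ Une façon de mettre en place les mesures avec `torch.compile` (esquisse, en supposant que le modèle compilé est simplement replacé dans le même pipeline) :
+
+ ```python
+ import torch
+ from transformers import pipeline
+
+ pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf", device=0)
+ # compiler le modèle sous-jacent en mode reduce-overhead, puis chronométrer pipe(image) comme avant
+ pipe.model = torch.compile(pipe.model, mode="reduce-overhead")
+ ```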
+ """, + 'tweet_6': + """ + Vous pouvez utiliser Depth Anything facilement grâce à 🤗 Transformers avec trois lignes de code ! ✨ + Nous avons également créé une application pour vous permettre de [comparer différents modèles d'estimation de la profondeur](https://t.co/6uq4osdwWG) 🐝 🌸 + Tous les checkpoints de Depth Anything sont disponibles [ici](https://t.co/Ex0IIyx7XC). + """, + 'ressources': + """ + Ressources : + [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) + de Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) + [GitHub](https://github.com/LiheYoung/Depth-Anything) + [Documentation d'Hugging Face](https://huggingface.co/docs/transformers/model_doc/depth_anything) + """ + } +} -st.markdown(""" -The paper starts with highlighting previous depth estimation methods and the limitations regarding the data coverage. 👀 -The model's success heavily depends on unlocking the use of unlabeled datasets, although initially the authors used self-training and failed. -What the authors have done: -➰ Train a teacher model on labelled dataset -➰ Guide the student using teacher and also use unlabelled datasets pseudolabelled by the teacher. However, this was the cause of the failure, as both architectures were similar, the outputs were the same. -""") -st.markdown(""" """) +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' -st.image("pages/Depth Anything/image_1.jpg", use_column_width=True) +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") st.markdown(""" """) -st.markdown(""" -So the authors have added a more difficult optimization target for student to learn additional knowledge on unlabeled images that went through color jittering, distortions, Gaussian blurring and spatial distortion, so it can learn more invariant representations from them. +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) -The architecture consists of DINOv2 encoder to extract the features followed by DPT decoder. At first, they train the teacher model on labelled images, and then they jointly train the student model and add in the dataset pseudo-labelled by ViT-L. -""", unsafe_allow_html=True) +st.video("pages/Depth Anything/video_1.mp4", format="video/mp4") +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Depth Anything/image_1.jpg", use_column_width=True) st.markdown(""" """) -st.markdown("""Thanks to this, Depth Anything performs very well! I have also benchmarked the inference duration of the model against different models here. I also ran `torch.compile` benchmarks across them and got nice speed-ups 🚀 - -On T4 GPU, mean of 30 inferences for each. Inferred using `pipeline` (pre-processing and post-processing included with model inference). 
- -| Model/Batch Size | 16 | 4 | 1 | -| ----------------------------- | --------- | -------- | ------- | -| intel/dpt-large | 2709.652 | 667.799 | 172.617 | -| facebook/dpt-dinov2-small-nyu | 2534.854 | 654.822 | 159.754 | -| facebook/dpt-dinov2-base-nyu | 4316.8733 | 1090.824 | 266.699 | -| Intel/dpt-beit-large-512 | 7961.386 | 2036.743 | 497.656 | -| depth-anything-small | 1692.368 | 415.915 | 143.379 | +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) -`torch.compile`’s benchmarks with reduce-overhead mode: we have compiled the model and loaded it to the pipeline for the benchmarks to be fair. +st.image("pages/Depth Anything/image_1.jpg", use_column_width=True) +st.markdown(""" """) -| Model/Batch Size | 16 | 4 | 1 | -| ----------------------------- | -------- | -------- | ------- | -| intel/dpt-large | 2556.668 | 645.750 | 155.153 | -| facebook/dpt-dinov2-small-nyu | 2415.25 | 610.967 | 148.526 | -| facebook/dpt-dinov2-base-nyu | 4057.909 | 1035.672 | 245.692 | -| Intel/dpt-beit-large-512 | 7417.388 | 1795.882 | 426.546 | -| depth-anything-small | 1664.025 | 384.688 | 97.865 | +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) -""") +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Depth Anything/image_2.jpg", use_column_width=True) st.markdown(""" """) -st.markdown(""" -You can use Depth Anything easily thanks to 🤗 Transformers with three lines of code! ✨ -We have also built an app for you to [compare different depth estimation models](https://t.co/6uq4osdwWG) 🐝 🌸 -See all the available Depth Anything checkpoints [here](https://t.co/Ex0IIyx7XC). -""") +st.markdown(translations[lang]["tweet_6"], unsafe_allow_html=True) st.markdown(""" """) st.image("pages/Depth Anything/image_3.jpg", use_column_width=True) st.markdown(""" """) -st.info(""" -Ressources: -[Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) -by Lihe Yang, Bingyi Kang, Zilong Huang, Xiaogang Xu, Jiashi Feng, Hengshuang Zhao (2024) -[GitHub](https://github.com/LiheYoung/Depth-Anything) -[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/depth_anything)""", icon="📚") +with st.expander ("Code"): + st.code(""" + from transformers import pipeline + + pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf") + depth = pipe(image)["depth"] + """) +st.markdown(""" """) +st.info(translations[lang]["ressources"], icon="📚") + st.markdown(""" """) st.markdown(""" """) st.markdown(""" """) -col1, col2, col3 = st.columns(3) +col1, col2, col3= st.columns(3) with col1: - if st.button('Previous paper', use_container_width=True): - switch_page("Backbone") + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Backbone") + else: + if st.button('Papier précédent', use_container_width=True): + switch_page("Backbone") with col2: - if st.button('Home', use_container_width=True): - switch_page("Home") + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") with col3: - if st.button('Next paper', use_container_width=True): - switch_page("LLaVA-NeXT") \ No newline at end of file + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("UDOP") + else: + if st.button("Papier suivant", use_container_width=True): + 
switch_page("UDOP") \ No newline at end of file diff --git a/pages/9_UDOP.py b/pages/9_UDOP.py new file mode 100644 index 0000000000000000000000000000000000000000..82f8127609d9d4a7dbfc0e5557638f8d9bbc9f89 --- /dev/null +++ b/pages/9_UDOP.py @@ -0,0 +1,172 @@ +import streamlit as st +from streamlit_extras.switch_page_button import switch_page + + +translations = { +'en': {'title': 'UDOP', + 'original_tweet': + """ + [Original tweet](https://x.com/mervenoyann/status/1767200350530859321) (March 11, 2024) + """, + 'tweet_1': + """ + New foundation model on document understanding and generation in 🤗 Transformers 🤩 + UDOP by Microsoft is a bleeding-edge model that is capable of many tasks, including question answering, document editing and more! 🤯 + Check out the [demo](https://huggingface.co/spaces/merve/UDOP). + Technical details 🧶 + """, + 'tweet_2': + """ + UDOP is a model that combines vision, text and layout. 📝 + This model is very interesting because the input representation truly captures the nature of the document modality: text, where the text is, and the layout of the document matters! + +
+ If you know T5, it resembles that: it's pre-trained on both self-supervised and supervised objectives over text, image and layout. + To switch between tasks, one simply needs to change the task specific prompt at the beginning, e.g. for QA, one prepends with Question answering. + """, + 'tweet_3': + """ + As for the architecture, it's like T5, except it has a single encoder that takes in text, image and layout, and two decoders (text-layout and vision decoders) combined into one. + The vision decoder is a masked autoencoder (thus the capabilities of document editing). + """, + 'tweet_4': + """ + For me, the most interesting capability is document reconstruction, document editing and layout re-arrangement (see below 👇) + This decoder isn't released though because it could be used maliciously to fake document editing. + """, + 'tweet_5': + """ + Overall, the model performs very well on document understanding benchmark (DUE) and also information extraction (FUNSD, CORD) and classification (RVL-CDIP) for vision, text, layout modalities 👇 + """, + 'ressources': + """ + Resources: + [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) + by Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal (2022) + [GitHub](https://github.com/microsoft/UDOP) + [Hugging Face models](https://huggingface.co/microsoft/udop-large) + [Hugging Face documentation](https://huggingface.co/docs/transformers/en/model_doc/udop) + """ + }, +'fr': { + 'title': 'UDOP', + 'original_tweet': + """ + [Tweet de base](https://x.com/mervenoyann/status/1767200350530859321) (en anglais) (11 mars 2024) + """, + 'tweet_1': + """ + Un nouveau modèle de compréhension de documents et de génération est disponible dans 🤗 Transformers 🤩 + UDOP de Microsoft est un modèle de pointe capable d'effectuer de nombreuses tâches, notamment répondre à des questions, éditer des documents et bien plus encore ! 🤯 + Consultez la [démo](https://huggingface.co/spaces/merve/UDOP). + Détails techniques 🧶 + """, + 'tweet_2': + """ + UDOP est un modèle qui combine la vision, le texte et la mise en page. 📝 + Ce modèle est très intéressant car la représentation en entrée capture véritablement la nature de la modalité du document : le texte, l'endroit où se trouve le texte et la mise en page du document comptent !
+ Si vous connaissez le T5, cela y ressemble : il est pré-entraîné sur des objectifs autosupervisés et supervisés sur le texte, l'image et la mise en page. + Pour passer d'une tâche à l'autre, il suffit de modifier le prompt spécifique à la tâche au début, par exemple, pour le QA, on ajoute "Question answering". + """, + 'tweet_3': + """ + En ce qui concerne l'architecture, elle est similaire à celle du T5, à l'exception d'un seul encodeur qui prend en charge le texte, l'image et la mise en page, et de deux décodeurs (décodeur texte/mise en page et décodeur de vision) combinés en un seul. + Le décodeur de vision est un autoencodeur masqué (d'où les possibilités d'édition de documents). + """, + 'tweet_4': + """ + Pour moi, la capacité la plus intéressante est la reconstruction de documents, l'édition de documents et le réarrangement de la mise en page (voir ci-dessous 👇). + Ce décodeur n'est pas publié car il pourrait être utilisé de manière malveillante pour falsifier l'édition d'un document. + """, + 'tweet_5': + """ + Dans l'ensemble, le modèle est très performant pour la compréhension de documents (DUE) ainsi que pour l'extraction d'informations (FUNSD, CORD) et la classification (RVL-CDIP) pour les modalités de vision, de texte et de mise en page 👇 """, + 'ressources': + """ + Resources : + [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623) + de Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal (2022) + [GitHub](https://github.com/microsoft/UDOP) + [Modèles sur Hugging Face](https://huggingface.co/microsoft/udop-large) + [Hugging Face documentation](https://huggingface.co/docs/transformers/en/model_doc/udop) + """ + } +} + + +def language_selector(): + languages = {'EN': '🇬🇧', 'FR': '🇫🇷'} + selected_lang = st.selectbox('', options=list(languages.keys()), format_func=lambda x: languages[x], key='lang_selector') + return 'en' if selected_lang == 'EN' else 'fr' + +left_column, right_column = st.columns([5, 1]) + +# Add a selector to the right column +with right_column: + lang = language_selector() + +# Add a title to the left column +with left_column: + st.title(translations[lang]["title"]) + +st.success(translations[lang]["original_tweet"], icon="ℹ️") +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_1"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/UDOP/image_1.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_2"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/UDOP/image_2.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_3"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/UDOP/image_3.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_4"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/UDOP/image_4.jpg", use_column_width=True) +st.image("pages/UDOP/image_5.jpg", use_column_width=True) +st.markdown(""" """) + +st.markdown(translations[lang]["tweet_5"], unsafe_allow_html=True) +st.markdown(""" """) + +st.image("pages/UDOP/image_6.jpg", use_column_width=True) +st.markdown(""" """) + +st.info(translations[lang]["ressources"], icon="📚") + +st.markdown(""" """) +st.markdown(""" """) +st.markdown(""" """) +col1, col2, col3= st.columns(3) +with col1: + if lang == "en": + if st.button('Previous paper', use_container_width=True): + switch_page("Depth Anything") + else: + 
if st.button('Papier précédent', use_container_width=True): + switch_page("Depth Anything") +with col2: + if lang == "en": + if st.button("Home", use_container_width=True): + switch_page("Home") + else: + if st.button("Accueil", use_container_width=True): + switch_page("Home") +with col3: + if lang == "en": + if st.button("Next paper", use_container_width=True): + switch_page("LLaVA-NeXT") + else: + if st.button("Papier suivant", use_container_width=True): + switch_page("LLaVA-NeXT") \ No newline at end of file diff --git a/pages/Aria/image_0.png b/pages/Aria/image_0.png new file mode 100644 index 0000000000000000000000000000000000000000..29ec4c6d889380a59946f053f8924d5e38668d71 --- /dev/null +++ b/pages/Aria/image_0.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f3bfcaac960618859d998466ae695d3dab3ee8b51a8f5d188cc247e099b3bff +size 1612291 diff --git a/pages/Aria/image_1.png b/pages/Aria/image_1.png new file mode 100644 index 0000000000000000000000000000000000000000..0b6c4c699dbc553ffdf4dfe8f03c7f62826d3ca8 Binary files /dev/null and b/pages/Aria/image_1.png differ diff --git a/pages/Aria/image_2.png b/pages/Aria/image_2.png new file mode 100644 index 0000000000000000000000000000000000000000..774f859010e11b9873e686d214e0c5ab436b9232 --- /dev/null +++ b/pages/Aria/image_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a49f0b152b3785861be507da6ae95ca9872e727c1cd907fd061f7356bb2145 +size 16925075 diff --git a/pages/Aria/image_3.png b/pages/Aria/image_3.png new file mode 100644 index 0000000000000000000000000000000000000000..e93ad322b42313a61d3b9dad697597d8a6f70f25 Binary files /dev/null and b/pages/Aria/image_3.png differ diff --git a/pages/Aria/image_4.png b/pages/Aria/image_4.png new file mode 100644 index 0000000000000000000000000000000000000000..ce0f2cfa67a5aa2cbfe07cc7856d5bd1efe11fbb Binary files /dev/null and b/pages/Aria/image_4.png differ diff --git a/pages/Aria/video_1.mp4 b/pages/Aria/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..1c23cebf9761b0a86f7c37b884ca3c5d35ae6ad1 Binary files /dev/null and b/pages/Aria/video_1.mp4 differ diff --git a/pages/ColPali/image_1.jpg b/pages/ColPali/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cf82dca534c43d78fa2c262d95ea918f29922dc6 Binary files /dev/null and b/pages/ColPali/image_1.jpg differ diff --git a/pages/ColPali/image_2.jpg b/pages/ColPali/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..81503325aebf03e28c0adffe62cae82451bbc440 Binary files /dev/null and b/pages/ColPali/image_2.jpg differ diff --git a/pages/ColPali/image_3.jpg b/pages/ColPali/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..932ed335cddedae0669528b4e0c66dc3d6d4d3c8 Binary files /dev/null and b/pages/ColPali/image_3.jpg differ diff --git a/pages/ColPali/image_4.jpg b/pages/ColPali/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d3a19aecdd03d6552331e4125f4e71938f908ba3 Binary files /dev/null and b/pages/ColPali/image_4.jpg differ diff --git a/pages/ColPali/image_5.jpg b/pages/ColPali/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e4fd7bcd647b2d668b07a111e3db140fc9f6b0d7 Binary files /dev/null and b/pages/ColPali/image_5.jpg differ diff --git a/pages/GOT/image_1.png b/pages/GOT/image_1.png new file mode 100644 index 0000000000000000000000000000000000000000..719c8db5fbf83299063170c53c56549c285505bd Binary files /dev/null 
and b/pages/GOT/image_1.png differ diff --git a/pages/GOT/image_2.png b/pages/GOT/image_2.png new file mode 100644 index 0000000000000000000000000000000000000000..7d251ffb0f020702b5d6037eb6d29b1a5545c0cc Binary files /dev/null and b/pages/GOT/image_2.png differ diff --git a/pages/GOT/image_3.png b/pages/GOT/image_3.png new file mode 100644 index 0000000000000000000000000000000000000000..e82889eca966e0d4f896e29a6b6de3781a8facb0 Binary files /dev/null and b/pages/GOT/image_3.png differ diff --git a/pages/GOT/image_4.png b/pages/GOT/image_4.png new file mode 100644 index 0000000000000000000000000000000000000000..d60d41ee38b9d74fc683951ff223c00d276a17b3 Binary files /dev/null and b/pages/GOT/image_4.png differ diff --git a/pages/GOT/image_5.png b/pages/GOT/image_5.png new file mode 100644 index 0000000000000000000000000000000000000000..5d04c8a12477deb24f9b641017eca4cc7ba5c054 Binary files /dev/null and b/pages/GOT/image_5.png differ diff --git a/pages/Grounding_DINO/image_9.jpeg b/pages/Grounding_DINO/image_9.jpeg index 1ba01db76dff84a7d467f05295a6d370d7ac659d..ebd7c3d3454322baa65163a4ff2df8c7f8937ff4 100644 Binary files a/pages/Grounding_DINO/image_9.jpeg and b/pages/Grounding_DINO/image_9.jpeg differ diff --git a/pages/KOSMOS-2/image_1.jpg b/pages/KOSMOS-2/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..65c2f42e060910001c3953a40d756b68d8d9fa33 Binary files /dev/null and b/pages/KOSMOS-2/image_1.jpg differ diff --git a/pages/KOSMOS-2/video_1.mp4 b/pages/KOSMOS-2/video_1.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..791066e1f50fcf36ad8fd8f010cdc92043e2315f --- /dev/null +++ b/pages/KOSMOS-2/video_1.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6335308f16e6de15b2a3e4229258ee23475d7af01bff69a1481b2d656d2054c3 +size 2909151 diff --git a/pages/MiniGemini/image_1.jpg b/pages/MiniGemini/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8201d4c54d6ea633b907e4394e6418fac2710b4d Binary files /dev/null and b/pages/MiniGemini/image_1.jpg differ diff --git a/pages/MiniGemini/image_2.jpg b/pages/MiniGemini/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e766b2ea0912e7da67b880f8dcc0c8a326d0dfc6 Binary files /dev/null and b/pages/MiniGemini/image_2.jpg differ diff --git a/pages/MiniGemini/image_3.jpg b/pages/MiniGemini/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..06483cc981183f6c48f1ef6f44543fc05b3ae0c0 Binary files /dev/null and b/pages/MiniGemini/image_3.jpg differ diff --git a/pages/MiniGemini/image_4.jpg b/pages/MiniGemini/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3b059379ef76fe22fa05bd174721a654c46c7f0b Binary files /dev/null and b/pages/MiniGemini/image_4.jpg differ diff --git a/pages/MiniGemini/image_5.jpg b/pages/MiniGemini/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..125a564cc23ff39708693f253c9703d2bc56b133 Binary files /dev/null and b/pages/MiniGemini/image_5.jpg differ diff --git a/pages/MiniGemini/image_6.jpg b/pages/MiniGemini/image_6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..de8ec4067f8c0fc08fb6aac907c2f28c5e79e445 Binary files /dev/null and b/pages/MiniGemini/image_6.jpg differ diff --git a/pages/NVEagle/image_1.jpg b/pages/NVEagle/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5395b3218c75e61bf8cd6fef01989bd515ccc0a2 Binary files /dev/null and 
b/pages/NVEagle/image_1.jpg differ diff --git a/pages/NVEagle/image_2.jpg b/pages/NVEagle/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7ab7eea98dc52ed6468590ba7902f0b7fa837146 Binary files /dev/null and b/pages/NVEagle/image_2.jpg differ diff --git a/pages/NVEagle/image_3.jpg b/pages/NVEagle/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bf7fd0bbb2d31ac89aee0afdff880483be654bc6 Binary files /dev/null and b/pages/NVEagle/image_3.jpg differ diff --git a/pages/NVEagle/image_4.jpg b/pages/NVEagle/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a471eaaf9a2aad32012745217539bddde475f124 Binary files /dev/null and b/pages/NVEagle/image_4.jpg differ diff --git a/pages/NVLM/image_1.png b/pages/NVLM/image_1.png new file mode 100644 index 0000000000000000000000000000000000000000..2ec3a2c2f8a52821f0b84e35b2ad91f3758ef489 Binary files /dev/null and b/pages/NVLM/image_1.png differ diff --git a/pages/NVLM/image_2.png b/pages/NVLM/image_2.png new file mode 100644 index 0000000000000000000000000000000000000000..d27a2733e6a996f6da59a1b376639819b0c47343 Binary files /dev/null and b/pages/NVLM/image_2.png differ diff --git a/pages/NVLM/image_3.png b/pages/NVLM/image_3.png new file mode 100644 index 0000000000000000000000000000000000000000..0d9296d5fd492023bba2c1a1ad8631e146560426 Binary files /dev/null and b/pages/NVLM/image_3.png differ diff --git a/pages/NVLM/image_4.png b/pages/NVLM/image_4.png new file mode 100644 index 0000000000000000000000000000000000000000..a3afb63f3af1e3e7e5d06ead359ddea92fde48b5 Binary files /dev/null and b/pages/NVLM/image_4.png differ diff --git a/pages/UDOP/image_1.jpg b/pages/UDOP/image_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a9507b954ccfeb26b4ac92fcc5ce51b2d58157c0 Binary files /dev/null and b/pages/UDOP/image_1.jpg differ diff --git a/pages/UDOP/image_2.jpg b/pages/UDOP/image_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bc961f4e90e66eb3395302eedc7c0a5d9f8d46db Binary files /dev/null and b/pages/UDOP/image_2.jpg differ diff --git a/pages/UDOP/image_3.jpg b/pages/UDOP/image_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..89b938cdc393fc6cbcbc42cee7b7c618ccda3b72 Binary files /dev/null and b/pages/UDOP/image_3.jpg differ diff --git a/pages/UDOP/image_4.jpg b/pages/UDOP/image_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c427b7bcdd521ed3dc60df4518ddf3df7db7b057 Binary files /dev/null and b/pages/UDOP/image_4.jpg differ diff --git a/pages/UDOP/image_5.jpg b/pages/UDOP/image_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..271c0c303b173bed2994be3cb1d976a3f50073d5 Binary files /dev/null and b/pages/UDOP/image_5.jpg differ diff --git a/pages/UDOP/image_6.jpg b/pages/UDOP/image_6.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1650f888073b193e2cb5eb4383a569ede00257c5 Binary files /dev/null and b/pages/UDOP/image_6.jpg differ