"""Streamlit page summarizing the UDOP paper (vision-text-layout document foundation model).

Renders a linear article — title, tweet link, explanatory markdown sections
interleaved with figures — followed by a resources box and Previous/Home/Next
navigation buttons that jump to sibling pages via switch_page.
"""

import streamlit as st
from streamlit_extras.switch_page_button import switch_page


def _spacer() -> None:
    """Insert an empty markdown block for vertical spacing between sections."""
    st.markdown(""" """)


st.title("UDOP")

st.success("""[Original tweet](https://x.com/mervenoyann/status/1767200350530859321) (Mar 11, 2024)""", icon="ℹ️")
_spacer()

st.markdown("""New foundation model on document understanding and generation in transformers 🤩
UDOP by MSFT is a bleeding-edge model that is capable of many tasks, including question answering, document editing and more! 🤯
Check out the [demo](https://huggingface.co/spaces/merve/UDOP).
Technical details 🧶""")
_spacer()

# NOTE(review): use_column_width is deprecated in recent Streamlit in favor of
# use_container_width; kept as-is to avoid changing the API surface used.
st.image("pages/UDOP/image_1.jpeg", use_column_width=True)
_spacer()

st.markdown("""
UDOP is a model that combines vision, text and layout. 📝
This model is very interesting because the input representation truly captures the nature of the document modality: text, where the text is, and the layout of the document matters!""")
_spacer()

st.markdown("""
If you know T5, it resembles that: it's pre-trained on both self-supervised and supervised objectives over text, image and layout.
To switch between tasks, one simply needs to change the task specific prompt at the beginning, e.g. for QA, one prepends with Question answering.""")
_spacer()

st.image("pages/UDOP/image_2.png", use_column_width=True)
_spacer()

st.markdown("""
As for the architecture, it's like T5, except it has a single encoder that takes in text, image and layout, and two decoders (text-layout and vision decoders) combined into one.
The vision decoder is a masked autoencoder (thus the capabilities of document editing).
""")
# Fixed: the original rendered image_3.jpeg twice in a row (copy-paste duplicate).
st.image("pages/UDOP/image_3.jpeg", use_column_width=True)
_spacer()

st.markdown("""
For me, the most interesting capability is document reconstruction, document editing and layout re-arrangement (see below 👇 ) this decoder isn't released though because it could be used maliciously to fake document editing.
""")
_spacer()

st.image("pages/UDOP/image_4.jpeg", use_column_width=True)
_spacer()

st.markdown("""
Overall, the model performs very well on document understanding benchmark (DUE) and also information extraction (FUNSD, CORD) and classification (RVL-CDIP) for vision, text, layout modalities 👇
""")
_spacer()

st.image("pages/UDOP/image_5.jpeg", use_column_width=True)
_spacer()

st.info("""
Resources:
- [Unifying Vision, Text, and Layout for Universal Document Processing](https://arxiv.org/abs/2212.02623)
Zineng Tang, Ziyi Yang, Guoxin Wang, Yuwei Fang, Yang Liu, Chenguang Zhu, Michael Zeng, Cha Zhang, Mohit Bansal (2022)
- [GitHub](https://github.com/microsoft/UDOP)
- [Hugging Face Models](https://huggingface.co/microsoft/udop-large)
- [Hugging Face documentation](https://huggingface.co/docs/transformers/en/model_doc/udop)""", icon="📚")
_spacer()
_spacer()
_spacer()

# Bottom navigation: previous paper / home / next paper.
col1, col2, col3 = st.columns(3)
with col1:
    if st.button('Previous paper', use_container_width=True):
        switch_page("SAMv2")
with col2:
    if st.button('Home', use_container_width=True):
        switch_page("Home")
with col3:
    if st.button('Next paper', use_container_width=True):
        switch_page("MiniGemini")