huangzhii committed
Commit 1bf0164 · Parent(s): 7987133

update
Files changed:
- .gitattributes +3 -0
- .gitignore +2 -0
- app.py +2 -1
- image2image.py +93 -16
- introduction.md +3 -1
- text2image.py +93 -34
- tweet_eval_embeddings.npy +0 -3
- tweet_eval_retrieval.tsv (deleted)
- tweet_eval_retrieval_twlnk.tsv (deleted)
- zeroshot.py (deleted)
.gitattributes CHANGED
@@ -32,3 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.asset filter=lfs diff=lfs merge=lfs -text
+twitter.asset filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+/__pycache__
+*.pyc
app.py CHANGED
@@ -5,8 +5,9 @@ import streamlit as st
 
 
 
+#st.set_page_config(layout="wide")
 
-st.sidebar.title("
+st.sidebar.title("Multi-task Vision–Language AI for Pathology")
 
 PAGES = {
     "Introduction": home,
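For context on the PAGES mapping visible in this hunk: app.py follows the common Streamlit pattern of dispatching a sidebar selection to per-page modules, each exposing an app() entry point. A minimal sketch of that pattern, with assumptions marked (only the sidebar title and the PAGES dict appear in the diff; the radio selector and the page function here are placeholders):

```python
# Sketch of the multi-page dispatch pattern app.py appears to use.
# Assumption: in the repo the PAGES values are imported modules (e.g.
# `home`) that each define app(); plain functions behave the same way.
import streamlit as st

def introduction_page():
    st.write("Introduction page content goes here.")  # placeholder

st.sidebar.title("Multi-task Vision–Language AI for Pathology")

PAGES = {
    "Introduction": introduction_page,
}

# Dispatch the sidebar choice to the selected page's entry point.
selection = st.sidebar.radio("Go to", list(PAGES.keys()))
PAGES[selection]()
```

Run with `streamlit run app.py`; adding a page is then just another entry in PAGES.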
image2image.py CHANGED
@@ -5,7 +5,11 @@ import numpy as np
 from PIL import Image
 import requests
 import tokenizers
+import os
 from io import BytesIO
+import pickle
+import base64
+
 import torch
 from transformers import (
     VisionTextDualEncoderModel,
@@ -15,6 +19,7 @@ from transformers import (
     AutoProcessor
 )
 import streamlit.components.v1 as components
+from st_clickable_images import clickable_images  # pip install st-clickable-images
 
 
 def embed_images(model, images, processor):
@@ -42,51 +47,123 @@ def load_path_clip():
     processor = AutoProcessor.from_pretrained("vinid/plip")
     return model, processor
 
+def init():
+    with open('data/twitter.asset', 'rb') as f:
+        data = pickle.load(f)
+    meta = data['meta'].reset_index(drop=True)
+    image_embedding = data['embedding']
+    print(meta.shape, image_embedding.shape)
+    validation_subset_index = meta['source'].values == 'Val_Tweets'
+    return meta, image_embedding, validation_subset_index
 
 def app():
-    st.title('PLIP Image Search')
+    st.title('Image to Image Retrieval')
+    st.markdown('#### A pathology image search engine that correlates images with images.')
 
+    meta, image_embedding, validation_subset_index = init()
     model, processor = load_path_clip()
 
+    st.markdown('Click one of the following examples:')
+    example_path = 'data/example_images'
+    list_of_examples = [os.path.join(example_path, v) for v in os.listdir(example_path)]
+    example_imgs = []
+    for file in list_of_examples:
+        with open(file, "rb") as image:
+            encoded = base64.b64encode(image.read()).decode()
+        example_imgs.append(f"data:image/jpeg;base64,{encoded}")
+    clicked = clickable_images(
+        example_imgs,
+        titles=[f"Image #{str(i)}" for i in range(len(example_imgs))],
+        div_style={"display": "flex", "justify-content": "center", "flex-wrap": "wrap"},
+        img_style={"margin": "5px", "height": "70px"},
+    )
+    isExampleClicked = False
+    if clicked > -1:
+        image = Image.open(list_of_examples[clicked])
+        isExampleClicked = True
+
+    data_options = ["All twitter data (2006-03-21 — 2023-01-15)",
+                    "Twitter validation data (2022-11-16 — 2023-01-15)"]
+    st.radio(
+        "Or choose dataset for image retrieval 👉",
+        key="datapool",
+        options=data_options,
+    )
+
-    query = st.file_uploader("Choose a file")
+    col1, col2 = st.columns(2)
+    with col1:
+        query = st.file_uploader("Choose a file to upload")
 
+    proceed = False
     if query:
         image = Image.open(query)
+        proceed = True
+    elif isExampleClicked:
+        proceed = True
+
+    if proceed:
+        with col2:
+            st.image(image, caption='Your upload')
+
         single_image = embed_images(model, [image], processor)[0].detach().cpu().numpy()
 
         single_image = single_image/np.linalg.norm(single_image)
 
         # Sort IDs by cosine-similarity from high to low
         similarity_scores = single_image.dot(image_embedding.T)
-        id_sorted = np.argsort(similarity_scores)[::-1]
 
+        topn = 5
+        if st.session_state.datapool == data_options[0]:
+            # Use all twitter data
+            id_sorted = np.argsort(similarity_scores)[::-1]
+            best_ids = id_sorted[:topn]
+            best_scores = similarity_scores[best_ids]
+            target_weblinks = meta["weblink"].values[best_ids]
+        else:
+            # Use validation twitter data
+            similarity_scores = similarity_scores[validation_subset_index]
+            # Sort IDs by cosine-similarity from high to low
+            id_sorted = np.argsort(similarity_scores)[::-1]
+            best_ids = id_sorted[:topn]
+            best_scores = similarity_scores[best_ids]
+            target_weblinks = meta["weblink"].values[validation_subset_index][best_ids]
+        # TODO: avoid duplicated IDs
+
+        topk_options = ['1st', '2nd', '3rd', '4th', '5th']
+        st.radio(
+            "Choose the most similar 👉",
+            key="top_k",
+            options=topk_options,
+            horizontal=True
+        )
+        topn_txt = st.session_state.top_k
+        topn_value = int(st.session_state.top_k[0]) - 1
+        st.caption(f'The {topn_txt} most relevant image (similarity = {best_scores[topn_value]:.4f})')
         components.html('''
         <blockquote class="twitter-tweet">
         <a href="%s"></a>
         </blockquote>
         <script async src="https://platform.twitter.com/widgets.js" charset="utf-8">
         </script>
-        ''' %
-        height=
+        ''' % target_weblinks[topn_value],
+        height=800)
 
+    st.markdown('Disclaimer')
+    st.caption('Please be advised that this function has been developed in compliance with the Twitter policy of data usage and sharing. It is important to note that the results obtained from this function are not intended to constitute medical advice or replace consultation with a qualified medical professional. The use of this function is solely at your own risk and should be consistent with applicable laws, regulations, and ethical considerations. We do not warrant or guarantee the accuracy, completeness, suitability, or usefulness of this function for any particular purpose, and we hereby disclaim any liability arising from any reliance placed on this function or any results obtained from its use. If you wish to review the original Twitter post, you should access the source page directly on Twitter.')
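The retrieval core added above is a three-step computation: L2-normalize the query embedding, take dot products against the precomputed image embeddings (cosine similarity, assuming the stored embeddings are already unit-normalized), and argsort descending for the top-n hits. A self-contained sketch with dummy data, not the Space's code:

```python
import numpy as np

# Dummy stand-in for data['embedding']: 1000 images, 512-dim embeddings,
# L2-normalized so dot products are cosine similarities (an assumption
# about how the asset file stores them).
rng = np.random.default_rng(0)
image_embedding = rng.standard_normal((1000, 512)).astype(np.float32)
image_embedding /= np.linalg.norm(image_embedding, axis=1, keepdims=True)

# Unit-normalize the query embedding, as the diff does.
query = rng.standard_normal(512).astype(np.float32)
query /= np.linalg.norm(query)

similarity_scores = query.dot(image_embedding.T)   # cosine similarities
topn = 5
best_ids = np.argsort(similarity_scores)[::-1][:topn]  # highest first
best_scores = similarity_scores[best_ids]
print(best_ids, best_scores)
```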
introduction.md CHANGED
@@ -1,2 +1,4 @@
 
-# 
+# AI-enabled Multi-task Vision–Language Modeling for Pathology from Large-Scale Public Social Network Knowledge
+
+Our understanding of heterogeneous pathology images has been limited by the shortage of well-annotated, publicly available image–text datasets. In this study, we collected 208,414 well-annotated pathology samples, each pairing an image with a text description; this collection is, to date, the largest public dataset of pathology images. By jointly learning visual and linguistic representations of the data, we propose a multi-task AI for pathology that achieves superior performance across multiple benchmarks and generalizes to previously unseen data. In addition, the framework allows image retrieval from text inputs: serving as an image search engine, the ability to retrieve relevant images can be a powerful educational tool. In summary, this large-scale, crowdsourced, spontaneous, and interactive public social network knowledge enabled us to establish a generic AI for pathology capable of handling multiple tasks, greatly enhancing our ability to understand and interact with the enormous amount of pathology data available.
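The hunks elsewhere in this commit show which checkpoint the Space loads ("vinid/plip" via VisionTextDualEncoderModel and AutoProcessor). A hedged, self-contained sketch of the zero-shot scoring such a dual encoder enables; the image path and candidate captions are placeholders, not part of the commit:

```python
# Sketch of zero-shot caption scoring with the checkpoint the Space uses.
# "example.jpg" and the candidate texts are placeholder assumptions.
from PIL import Image
import torch
from transformers import VisionTextDualEncoderModel, AutoProcessor

model = VisionTextDualEncoderModel.from_pretrained("vinid/plip")
processor = AutoProcessor.from_pretrained("vinid/plip")

image = Image.open("example.jpg")
texts = ["an H&E image of breast tissue", "an H&E image of colon tissue"]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# Image-text similarity logits, softmaxed over the candidate captions.
probs = outputs.logits_per_image.softmax(dim=-1)
print(probs)
```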
text2image.py CHANGED
@@ -4,6 +4,7 @@ from plip_support import embed_text
 import numpy as np
 from PIL import Image
 import requests
+import pickle
 import tokenizers
 from io import BytesIO
 import torch
@@ -45,50 +46,106 @@ def load_path_clip():
     processor = AutoProcessor.from_pretrained("vinid/plip")
     return model, processor
 
+def init():
+    with open('data/twitter.asset', 'rb') as f:
+        data = pickle.load(f)
+    meta = data['meta'].reset_index(drop=True)
+    image_embedding = data['embedding']
+    print(meta.shape, image_embedding.shape)
+    validation_subset_index = meta['source'].values == 'Val_Tweets'
+    return meta, image_embedding, validation_subset_index
 
 def app():
-    st.title('PLIP Image Search')
-
-    plip_imgURL = pd.read_csv("tweet_eval_retrieval.tsv", sep="\t")
-    plip_weblink = pd.read_csv("tweet_eval_retrieval_twlnk.tsv", sep="\t")
+    st.title('Text to Image Retrieval')
+    st.markdown('#### A pathology image search engine that correlates texts directly with images.')
+    st.caption('Note: the search query matches images only; the tweet text is not used for searching.')
 
+    meta, image_embedding, validation_subset_index = init()
     model, processor = load_path_clip()
 
+    data_options = ["All twitter data (2006-03-21 — 2023-01-15)",
+                    "Twitter validation data (2022-11-16 — 2023-01-15)"]
+    st.radio(
+        "Choose dataset for image retrieval 👉",
+        key="datapool",
+        options=data_options,
+    )
+
+    col1, col2 = st.columns(2)
+    #query = st.text_input('Search Query', '')
+    col1_submit = False
+    show = False
+    with col1:
+        # Create selectbox
+        examples = ['Breast tumor surrounded by fat',
+                    'HER2+ breast tumor',
+                    'Colorectal cancer tumor on epithelium',
+                    'An image of endometrium epithelium',
+                    'Breast cancer DCIS',
+                    'Papillary carcinoma in breast tissue',
+                    ]
+        query_1 = st.selectbox("Please select an example query", options=examples)
+        #st.info(f":white_check_mark: The written option is {query_1} ")
+        col1_submit = True
+        show = True
 
+    with col2:
+        form = st.form(key='my_form')
+        query_2 = form.text_input(label='Or input your custom query:')
+        submit_button = form.form_submit_button(label='Submit')
+
+    if submit_button:
+        col1_submit = False
+        show = True
+
+    if col1_submit:
+        query = query_1
+    else:
+        query = query_2
+
+    text_embedding = embed_texts(model, [query], processor)[0].detach().cpu().numpy()
+    text_embedding = text_embedding/np.linalg.norm(text_embedding)
+
+    similarity_scores = text_embedding.dot(image_embedding.T)
+
+    topn = 5
+    if st.session_state.datapool == data_options[0]:
+        # Use all twitter data
+        id_sorted = np.argsort(similarity_scores)[::-1]
+        best_ids = id_sorted[:topn]
+        best_scores = similarity_scores[best_ids]
+        target_weblinks = meta["weblink"].values[best_ids]
+    else:
+        # Use validation twitter data
+        similarity_scores = similarity_scores[validation_subset_index]
         # Sort IDs by cosine-similarity from high to low
-        similarity_scores = text_embedding.dot(image_embedding.T)
         id_sorted = np.argsort(similarity_scores)[::-1]
+        best_ids = id_sorted[:topn]
+        best_scores = similarity_scores[best_ids]
+        target_weblinks = meta["weblink"].values[validation_subset_index][best_ids]
+    # TODO: avoid duplicated IDs
 
+    topk_options = ['1st', '2nd', '3rd', '4th', '5th']
+    st.radio(
+        "Choose the most similar 👉",
+        key="top_k",
+        options=topk_options,
+        horizontal=True
+    )
+    topn_txt = st.session_state.top_k
+    topn_value = int(st.session_state.top_k[0]) - 1
+    st.caption(f'The {topn_txt} most relevant image (similarity = {best_scores[topn_value]:.4f})')
+    components.html('''
+    <blockquote class="twitter-tweet">
+    <a href="%s"></a>
+    </blockquote>
+    <script async src="https://platform.twitter.com/widgets.js" charset="utf-8">
+    </script>
-    ''' % target_weblink,
-    height=600)
+    ''' % target_weblinks[topn_value],
+    height=800)
@@ -100,6 +157,8 @@ def app():
 
 
 
+    st.markdown('Disclaimer')
+    st.caption('Please be advised that this function has been developed in compliance with the Twitter policy of data usage and sharing. It is important to note that the results obtained from this function are not intended to constitute medical advice or replace consultation with a qualified medical professional. The use of this function is solely at your own risk and should be consistent with applicable laws, regulations, and ethical considerations. We do not warrant or guarantee the accuracy, completeness, suitability, or usefulness of this function for any particular purpose, and we hereby disclaim any liability arising from any reliance placed on this function or any results obtained from its use. If you wish to review the original Twitter post, you should access the source page directly on Twitter.')
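One subtlety in the validation branch above (and its twin in image2image.py): the same boolean mask must be applied to both the score vector and the metadata before argsorting, or row positions fall out of alignment. A small sketch of that invariant with dummy data; the column names follow the diff, the values are made up:

```python
import numpy as np
import pandas as pd

# Dummy metadata mirroring the 'weblink' / 'source' columns in the diff.
meta = pd.DataFrame({
    "weblink": [f"https://twitter.com/i/status/{i}" for i in range(6)],
    "source":  ["Train", "Val_Tweets", "Train", "Val_Tweets", "Val_Tweets", "Train"],
})
similarity_scores = np.array([0.9, 0.2, 0.8, 0.7, 0.4, 0.1])

validation_subset_index = meta["source"].values == "Val_Tweets"

# Mask scores and metadata with the SAME boolean index before sorting,
# so positions within the subset stay aligned.
subset_scores = similarity_scores[validation_subset_index]
id_sorted = np.argsort(subset_scores)[::-1]
target_weblinks = meta["weblink"].values[validation_subset_index][id_sorted]
print(target_weblinks[0])  # most similar validation tweet
```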
tweet_eval_embeddings.npy DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:36e445b069b1d937a0a780ddeab9239df5fd13264e8cd1f6cf033be3210352e1
-size 2401408
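The deleted file above is a Git LFS pointer, not the array itself: three `key value` lines giving the spec version, content hash, and byte size (this is why the new *.asset pattern was added to .gitattributes). A hedged sketch of reading such a pointer in Python; the helper name is mine, not part of the repo:

```python
def parse_lfs_pointer(path: str) -> dict:
    """Parse a Git LFS pointer file into its key/value fields."""
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

# For the pointer deleted above this would yield:
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:36e445b0...', 'size': '2401408'}
```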
tweet_eval_retrieval.tsv DELETED
The diff for this file is too large to render.

tweet_eval_retrieval_twlnk.tsv DELETED
The diff for this file is too large to render.

zeroshot.py DELETED
File without changes.