neonwatty commited on
Commit
db14014
1 Parent(s): bc066dc

Upload 25 files

Browse files
data/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data/dbs/memes.db ADDED
Binary file (28.7 kB). View file
 
data/dbs/memes.faiss ADDED
Binary file (393 kB). View file
 
data/dbs/placeholder ADDED
File without changes
data/input/test_meme_1.jpg ADDED
data/input/test_meme_2.jpg ADDED
data/input/test_meme_3.jpg ADDED
data/input/test_meme_4.jpg ADDED
data/input/test_meme_5.jpg ADDED
data/input/test_meme_6.jpg ADDED
data/input/test_meme_7.jpg ADDED
data/input/test_meme_8.jpg ADDED
data/input/test_meme_9.jpg ADDED
meme_search/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import os

# location of this package on disk and of the repository root one level up
base_dir = os.path.dirname(os.path.abspath(__file__))
meme_search_root_dir = os.path.dirname(base_dir)
abs_dir = "."

# on-disk locations of the faiss vector index and the sqlite chunk lookup table
vector_db_path = f"{meme_search_root_dir}/data/dbs/memes.faiss"
sqlite_db_path = f"{meme_search_root_dir}/data/dbs/memes.db"
meme_search/app.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
from meme_search import base_dir
from meme_search.utilities.query import complete_query
from meme_search.utilities.create import process
import streamlit as st

# Single-page Streamlit app: a search box over the indexed memes plus a
# button that re-syncs the index with the input directory.
st.set_page_config(page_title="Meme Search")


# search bar taken from --> https://discuss.streamlit.io/t/creating-a-nicely-formatted-search-field/1804/2
def local_css(file_name):
    # inject the contents of a local css file into the page
    with open(file_name) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


def remote_css(url):
    # inject a remote stylesheet link into the page
    st.markdown(f'<link href="{url}" rel="stylesheet">', unsafe_allow_html=True)


local_css(base_dir + "/style.css")
remote_css("https://fonts.googleapis.com/icon?family=Material+Icons")

# icon("search")
with st.container():
    with st.container(border=True):
        input_col, button_col = st.columns([6, 2])

        with button_col:
            st.empty()
            # process() returns True only when images were added/removed,
            # so the two branches below show "updated" vs "no refresh needed"
            refresh_index_button = st.button("refresh index", type="primary")
            if refresh_index_button:
                process_start = st.warning("refreshing...")
                val = process()
                if val:
                    process_start.empty()
                    success = st.success("index updated!")
                    time.sleep(2)
                    process_start.empty()
                    success.empty()
                else:
                    process_start.empty()
                    warning = st.warning("no refresh needed!")
                    time.sleep(2)
                    warning.empty()

        selected = input_col.text_input(label="meme search", placeholder="search for your meme", label_visibility="collapsed")
        if selected:
            # one entry per unique image, best (closest) chunk first
            results = complete_query(selected)
            img_paths = [v["img_path"] for v in results]  # NOTE(review): unused - candidate for removal
            with st.container(border=True):
                for result in results:
                    with st.container(border=True):
                        st.image(
                            result["img_path"],
                            output_format="auto",
                            caption=f'{result["full_description"]} (query distance = {result["distance"]})',
                        )
meme_search/style.css ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* streamlit theme overrides for the meme search page */
body {
    color: #fff;
    background-color: #4F8BF9;
}

/* .stButton>button {
    color: #4F8BF9;
    border-radius: 50%;
    height: 3em;
    width: 3em;
} */

/* match the search input's text color to the page accent color */
.stTextInput>div>div>input {
    color: #4F8BF9;
}
meme_search/utilities/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from sentence_transformers import SentenceTransformer

# shared sentence-embedding model used by both indexing (add) and querying
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# directory layout: <root>/meme_search/utilities/__init__.py
utilities_base_dir = os.path.dirname(os.path.abspath(__file__))
meme_search_dir = os.path.dirname(utilities_base_dir)
meme_search_root_dir = os.path.dirname(meme_search_dir)

# canonical data locations used throughout the utilities package
img_dir = f"{meme_search_root_dir}/data/input/"
vector_db_path = f"{meme_search_root_dir}/data/dbs/memes.faiss"
sqlite_db_path = f"{meme_search_root_dir}/data/dbs/memes.db"
meme_search/utilities/add.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import faiss
4
+ from meme_search.utilities import model
5
+ from meme_search.utilities.text_extraction import extract_text_from_imgs
6
+ from meme_search.utilities.chunks import create_all_img_chunks
7
+
8
+
9
def add_to_chunk_db(img_chunks: list, sqlite_db_path: str) -> None:
    """Append (img_path, chunk) rows to the chunks_reverse_lookup table.

    Args:
        img_chunks: dicts with "img_path" and "chunk" keys, one per text chunk.
        sqlite_db_path: path to the sqlite database file (created if missing).
    """
    # Create a lookup table for chunks
    conn = sqlite3.connect(sqlite_db_path)
    cursor = conn.cursor()

    # Create the table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS chunks_reverse_lookup (
            img_path TEXT,
            chunk TEXT
        );
    """)

    # Batch-insert all rows at once; the original issued one execute per row
    # and carried an unused enumerate index.
    cursor.executemany(
        "INSERT INTO chunks_reverse_lookup (img_path, chunk) VALUES (?, ?)",
        [(entry["img_path"], entry["chunk"]) for entry in img_chunks],
    )

    conn.commit()
    conn.close()
33
+
34
+
35
def add_to_vector_db(chunks: list, vector_db_path: str) -> None:
    """Embed the text chunks and append them to the faiss index on disk."""
    # encode all chunks into an (n_chunks, dim) embedding matrix
    embeddings = model.encode(chunks)

    # start a fresh L2 index when none exists yet, otherwise append to the
    # index already on disk
    if not os.path.exists(vector_db_path):
        index = faiss.IndexFlatL2(embeddings.shape[1])
    else:
        index = faiss.read_index(vector_db_path)

    index.add(embeddings)
    faiss.write_index(index, vector_db_path)
47
+
48
+
49
def add_to_dbs(img_chunks: list, sqlite_db_path: str, vector_db_path: str) -> None:
    """Store img_chunks in both the sqlite lookup table and the faiss index."""
    try:
        print("STARTING: add_to_dbs")

        # sqlite side: reverse lookup rows (img_path, chunk)
        add_to_chunk_db(img_chunks, sqlite_db_path)

        # faiss side: one embedding per chunk, row ids align with sqlite rowids
        chunk_texts = [entry["chunk"] for entry in img_chunks]
        add_to_vector_db(chunk_texts, vector_db_path)

        print("SUCCESS: add_to_dbs succeeded")
    except Exception as e:
        print(f"FAILURE: add_to_dbs failed with exception {e}")
62
+
63
+
64
def add(new_imgs_to_be_indexed: list, sqlite_db_path: str, vector_db_path: str) -> None:
    """Describe, chunk, and index a batch of newly discovered images."""
    # 1) image -> text via moondream, 2) text -> overlapping chunks, 3) persist
    descriptions = extract_text_from_imgs(new_imgs_to_be_indexed)
    chunk_entries = create_all_img_chunks(new_imgs_to_be_indexed, descriptions)
    add_to_dbs(chunk_entries, sqlite_db_path, vector_db_path)
meme_search/utilities/chunks.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
def clean_word(text: str) -> str:
    """Lowercase *text*, keep only [a-z0-9,.!?'] characters, collapse spaces."""
    lowered = text.lower().strip()
    # swap every disallowed character for a space, then squeeze repeated spaces
    filtered = re.sub(r"[^a-z0-9,.!?']", " ", lowered)
    return re.sub(r" +", " ", filtered)
7
+
8
+
9
def chunk_text(text: str) -> list:
    """Split *text* into overlapping word windows for embedding.

    Windows are 4 words wide with a 2-word overlap. When more than one
    window is produced, the full lowercased text is prepended as an extra
    "chunk" so whole-description matches are indexed too.
    """
    # split and clean input text
    text_split = clean_word(text).split(" ")
    text_split = [v for v in text_split if len(v) > 0]

    # two-pointer sliding window over the word list
    chunk_size = 4
    overlap_size = 2

    # slide the window until the right pointer runs off the end of the word list
    left_pointer = 0
    right_pointer = chunk_size - 1
    chunks = []

    if right_pointer >= len(text_split):
        # text is shorter than a single window - the whole text is the only chunk
        chunks = [" ".join(text_split)]
    else:
        while right_pointer < len(text_split):
            # take the current window
            chunk = text_split[left_pointer : right_pointer + 1]

            # advance the window by the stride (chunk_size - overlap_size)
            left_pointer += chunk_size - overlap_size

            right_pointer += chunk_size - overlap_size

            # store chunk
            chunks.append(" ".join(chunk))

        # any trailing words that never filled a full window form a final chunk
        # (it overlaps the previous window, matching the overlap scheme)
        if len(text_split[left_pointer:]) > 0:
            last_chunk = text_split[left_pointer:]
            chunks.append(" ".join(last_chunk))

    # prepend the full lowercased (uncleaned) text so exact phrasing is indexed
    if len(chunks) > 1:
        chunks.insert(0, text.lower())
    return chunks
49
+
50
+
51
+ # loop over each meme's moondream based text descriptor and create a short dict containing its full and chunked text
52
+ def create_all_img_chunks(img_paths: list, answers: list) -> list:
53
+ try:
54
+ print("STARTING: create_all_img_chunks")
55
+ img_chunks = []
56
+ for ind, img_path in enumerate(img_paths):
57
+ moondream_meme_text = answers[ind]
58
+ moondream_chunks = chunk_text(moondream_meme_text)
59
+ for chunk in moondream_chunks:
60
+ entry = {}
61
+ entry["img_path"] = img_path
62
+ entry["chunk"] = chunk
63
+ img_chunks.append(entry)
64
+ print("SUCCESS: create_all_img_chunks ran successfully")
65
+ return img_chunks
66
+ except Exception as e:
67
+ print(f"FAILURE: create_all_img_chunks failed with exception {e}")
68
+ raise e
meme_search/utilities/create.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from meme_search.utilities.status import get_input_directory_status
2
+ from meme_search.utilities.remove import remove
3
+ from meme_search.utilities.add import add
4
+ from meme_search.utilities import img_dir, sqlite_db_path, vector_db_path
5
+
6
+
7
def process() -> bool:
    """Sync the dbs with the input directory; True when anything changed."""
    old_imgs, new_imgs = get_input_directory_status(img_dir, sqlite_db_path)

    # index already matches the directory contents - nothing to do
    if not old_imgs and not new_imgs:
        return False

    if old_imgs:
        remove(old_imgs, sqlite_db_path, vector_db_path)
    if new_imgs:
        add(new_imgs, sqlite_db_path, vector_db_path)
    return True
16
+
17
+
18
# allow running the index refresh directly: python meme_search/utilities/create.py
if __name__ == "__main__":
    process()
meme_search/utilities/imgs.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
# image extensions (lowercase) accepted for indexing
allowable_extensions = ["jpg", "jpeg", "png"]


def collect_img_paths(img_dir: str) -> list:
    """Return sorted "./data/input/<name>" stubs for every image in img_dir.

    Files are kept when their extension matches jpg/jpeg/png, compared
    case-insensitively. Re-raises any listing error after logging it.
    """
    try:
        print("STARTING: collect_img_paths")

        # fix: lowercase the extension so "photo.JPG" is indexed too
        all_img_paths = [
            os.path.join(img_dir, name)
            for name in os.listdir(img_dir)
            if name.split(".")[-1].lower() in allowable_extensions
        ]
        all_img_paths = sorted(all_img_paths)
        # normalize to the repo-relative stub stored in the sqlite db;
        # os.path.basename is separator-safe, unlike split("/")
        all_img_paths = ["./data/input/" + os.path.basename(v) for v in all_img_paths]

        print(f"SUCCESS: collect_img_paths ran successfully - image paths loaded from '{img_dir}'")
        return all_img_paths
    except Exception as e:
        print(f"FAILURE: collect_img_paths failed with img_dir {img_dir} with exception {e}")
        raise e
meme_search/utilities/query.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import faiss
2
+ import sqlite3
3
+ import numpy as np
4
+ from typing import Tuple, Union
5
+ import argparse
6
+ from meme_search.utilities import model
7
+ from meme_search.utilities import vector_db_path, sqlite_db_path
8
+
9
+
10
def query_vector_db(query: str, db_file_path: str, k: int = 10) -> Tuple[list, list]:
    """Embed *query* and return (distances, indices) of its k nearest chunks."""
    # load the on-disk faiss index
    faiss_index = faiss.read_index(db_file_path)

    # model.encode returns a 1-d vector; faiss expects an (n_queries, dim) batch
    encoded_query = np.expand_dims(model.encode(query), axis=0)

    # nearest-neighbour search; flatten the single-query result rows to lists
    distances, indices = faiss_index.search(encoded_query, k)
    return distances.tolist()[0], indices.tolist()[0]
22
+
23
+
24
def query_for_indices(indices: list, db_path: str = None) -> list:
    """Fetch chunk rows by sqlite rowid, ordered to match *indices*.

    Each returned dict has "index", "img_path", "chunk", and
    "full_description" (the first chunk stored for that image, which is the
    complete moondream description).

    Args:
        indices: sqlite rowids, typically from the faiss search.
        db_path: sqlite db file; defaults to the module-level sqlite_db_path.
    """
    if db_path is None:
        db_path = sqlite_db_path
    if not indices:
        return []
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # fix: parameterized query - the old f"... IN {tuple(indices)}" produced
    # invalid SQL ("IN (5,)") for a single index and interpolated raw values
    placeholders = ",".join("?" * len(indices))
    cursor.execute(
        f"SELECT rowid, * FROM chunks_reverse_lookup WHERE rowid IN ({placeholders})",
        indices,
    )
    rows = [{"index": row[0], "img_path": row[1], "chunk": row[2]} for row in cursor.fetchall()]
    rows = sorted(rows, key=lambda x: indices.index(x["index"]))  # re-sort rows according to input indices
    for row in rows:
        # the lowest rowid for an image holds its full description chunk
        cursor.execute(
            "SELECT chunk FROM chunks_reverse_lookup WHERE rowid=(SELECT MIN(rowid) FROM chunks_reverse_lookup WHERE img_path=?)",
            (row["img_path"],),
        )
        row["full_description"] = cursor.fetchone()[0]
    conn.close()
    return rows
39
+
40
+
41
def query_for_all(db_path: str = None) -> list:
    """Return every chunks_reverse_lookup row as a dict with index/img_path/chunk.

    Args:
        db_path: sqlite db file; defaults to the module-level sqlite_db_path.
    """
    if db_path is None:
        db_path = sqlite_db_path
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT rowid, * FROM chunks_reverse_lookup")
    rows = [{"index": row[0], "img_path": row[1], "chunk": row[2]} for row in cursor.fetchall()]
    conn.close()  # fix: the original never closed this connection
    return rows
49
+
50
+
51
def complete_query(query: str, k: int = 10) -> Union[list, None]:
    """Semantic search over indexed memes.

    Returns one dict per unique image (closest chunk first, with a rounded
    "distance" field), or None when the stripped query is <= 1 character.
    """
    try:
        if len(query.strip()) > 1:
            print("STARTING: complete_query")

            # nearest chunks in embedding space
            distances, indices = query_vector_db(query, vector_db_path, k=k)

            # hydrate chunk rows from sqlite, ordered to match `indices`
            img_chunks = query_for_indices(indices)

            # keep only the first (best) hit per image
            imgs_seen = []
            unique_img_entries = []
            for ind, entry in enumerate(img_chunks):
                if entry["img_path"] in imgs_seen:
                    continue
                entry["distance"] = round(distances[ind], 2)
                unique_img_entries.append(entry)
                imgs_seen.append(entry["img_path"])
            print("SUCCESS: complete_query succeeded")
            return unique_img_entries
    except Exception as e:
        print(f"FAILURE: complete_query failed with exception {e}")
        raise e
77
+
78
+
79
# CLI entry point: python meme_search/utilities/query.py --query "..."
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--query", dest="query", type=str, help="Add query")
    args = parser.parse_args()
    query = args.query
    # fix: complete_query takes (query, k) - the old call passed vector_db_path
    # and sqlite_db_path positionally, raising TypeError at runtime
    results = complete_query(query)
meme_search/utilities/remove.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import faiss
3
+ import numpy as np
4
+
5
+
6
def collect_removal_rowids(old_imgs_to_be_removed: list, sqlite_db_path: str) -> list:
    """Return the rowids of chunks belonging to any of the given image paths.

    Raises:
        ValueError: wrapping any underlying sqlite failure.
    """
    try:
        if len(old_imgs_to_be_removed) > 0:
            conn = sqlite3.connect(sqlite_db_path)
            cursor = conn.cursor()
            # fix: parameterized query - the original interpolated raw,
            # double-quoted paths into the SQL string
            placeholders = ",".join("?" * len(old_imgs_to_be_removed))
            cursor.execute(
                f"SELECT rowid FROM chunks_reverse_lookup WHERE img_path IN ({placeholders})",
                old_imgs_to_be_removed,
            )
            rowids = [v[0] for v in cursor.fetchall()]
            conn.close()
            return rowids
        else:
            return []
    except Exception as e:
        raise ValueError(f"FAILURE: collect_removal_rowids failed with exception {e}")
21
+
22
+
23
def delete_removal_rowids_from_reverse_lookup(rowids: list, sqlite_db_path: str) -> None:
    """Delete the given rowids from chunks_reverse_lookup, then VACUUM.

    No-op when rowids is empty. Raises ValueError on any sqlite failure.
    """
    try:
        if len(rowids) > 0:
            conn = sqlite3.connect(sqlite_db_path)
            cursor = conn.cursor()
            # fix: one parameterized statement handles one or many rowids;
            # the original special-cased len == 1 and interpolated values
            placeholders = ",".join("?" * len(rowids))
            cursor.execute(
                f"DELETE FROM chunks_reverse_lookup WHERE rowid IN ({placeholders})",
                rowids,
            )
            conn.commit()
            conn.close()

            # reclaim disk space; VACUUM runs on a fresh connection so no
            # transaction is open
            conn = sqlite3.connect(sqlite_db_path)
            cursor = conn.cursor()
            cursor.execute("VACUUM;")
            conn.commit()
            conn.close()
    except Exception as e:
        raise ValueError(f"FAILURE: delete_removal_rowids failed with exception {e}")
43
+
44
+
45
def delete_removal_rowids_from_vector_db(rowids: list, vector_db_path: str) -> None:
    """Drop the embeddings whose ids match *rowids* from the on-disk faiss index."""
    try:
        if len(rowids) == 0:
            return
        # faiss ids mirror the sqlite rowids, so removal keeps the dbs in sync
        index = faiss.read_index(vector_db_path)
        index.remove_ids(np.array(rowids, dtype=np.int64))
        faiss.write_index(index, vector_db_path)
    except Exception as e:
        raise ValueError(f"FAILURE: delete_removal_rowids failed with exception {e}")
54
+
55
+
56
def remove(old_imgs_to_be_removed: list, sqlite_db_path: str, vector_db_path: str) -> None:
    """Purge de-indexed images from both the sqlite lookup table and the faiss index."""
    # find the rowids once, then delete them from both stores
    stale_rowids = collect_removal_rowids(old_imgs_to_be_removed, sqlite_db_path)
    delete_removal_rowids_from_reverse_lookup(stale_rowids, sqlite_db_path)
    delete_removal_rowids_from_vector_db(stale_rowids, vector_db_path)
meme_search/utilities/status.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ from meme_search.utilities.imgs import collect_img_paths
3
+
4
+
5
def get_current_indexed_img_names(sqlite_db_path: str):
    """Return the distinct img_path values currently stored in the lookup table.

    Raises:
        ValueError: wrapping any underlying sqlite failure (e.g. missing table).
    """
    try:
        print("STARTING: collecting currently indexed names")
        conn = sqlite3.connect(sqlite_db_path)
        cursor = conn.cursor()
        # plain string literal - the original used an f-string with nothing
        # to interpolate
        query = "SELECT DISTINCT(img_path) FROM chunks_reverse_lookup"
        cursor.execute(query)
        rows = cursor.fetchall()
        rows = [v[0] for v in rows]
        conn.close()
        print("SUCCESS: get_current_indexed_img_names ran successfully")
        return rows
    except Exception as e:
        raise ValueError(f"FAILURE: get_current_indexed_img_names failed with exception {e}")
19
+
20
+
21
def get_input_directory_status(img_dir: str, sqlite_db_path: str):
    """Diff the input directory against the index.

    Returns (old_imgs_to_be_removed, new_imgs_to_be_indexed) as lists of
    "./data/input/<name>" stubs.
    """
    all_img_paths = collect_img_paths(img_dir)
    # normalize to the "./data/input/<name>" stub form stored in the db
    all_img_paths_stubs = ["./" + "/".join(v.split("/")[-3:]).strip() for v in all_img_paths]
    current_indexed_names = get_current_indexed_img_names(sqlite_db_path)

    indexed = set(current_indexed_names)
    on_disk = set(all_img_paths_stubs)
    old_imgs_to_be_removed = list(indexed - on_disk)
    new_imgs_to_be_indexed = list(on_disk - indexed)
    return old_imgs_to_be_removed, new_imgs_to_be_indexed
meme_search/utilities/text_extraction.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+ from PIL import Image
3
+ import transformers
4
+
5
+ transformers.logging.set_verbosity_error()
6
+
7
+
8
# cache the loaded model + tokenizer so repeated calls don't re-download /
# re-load the full weights for every single image
_moondream_cache: dict = {}


def prompt_moondream(img_path: str, prompt: str) -> str:
    """Ask the moondream2 vision-language model *prompt* about the image at *img_path*."""
    # copied from moondream demo readme --> https://github.com/vikhyat/moondream/tree/main
    model_id = "vikhyatk/moondream2"
    revision = "2024-05-20"

    # fix: the original re-loaded the model and tokenizer on every call;
    # load them once and reuse (revision is pinned, so output is unchanged)
    if "model" not in _moondream_cache:
        _moondream_cache["model"] = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            revision=revision,
        )
        _moondream_cache["tokenizer"] = AutoTokenizer.from_pretrained(model_id, revision=revision)

    model = _moondream_cache["model"]
    tokenizer = _moondream_cache["tokenizer"]

    image = Image.open(img_path)
    enc_image = model.encode_image(image)
    moondream_response = model.answer_question(enc_image, prompt, tokenizer)
    return moondream_response
22
+
23
+
24
def extract_text_from_imgs(img_paths: list) -> list:
    """Generate a moondream text description for every image path given."""
    try:
        print("STARTING: extract_text_from_imgs")
        prompt = "Describe this image."
        answers = []
        for img_path in img_paths:
            # one moondream round-trip per image
            print(f"INFO: prompting moondream for a description of image: '{img_path}'")
            answers.append(prompt_moondream(img_path, prompt))
            print("DONE!")
        print("SUCCESS: extract_text_from_imgs succeeded")
        return answers
    except Exception as e:
        print(f"FAILURE: extract_text_from_imgs failed with exception {e}")
        raise e