Spaces:
Running
FoodDesert committed on
Commit • 90290aa
1 Parent(s): e2d3b05
Upload 2 files
- app.py +105 -19
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from sklearn.metrics.pairwise import cosine_similarity
+from scipy.sparse import csr_matrix
 import numpy as np
 from joblib import load
 import h5py
@@ -11,6 +12,8 @@ import compress_fasttext
 from collections import OrderedDict
 from lark import Lark
 from lark import Token
+from lark.exceptions import ParseError
+
 
 
 
@@ -69,12 +72,12 @@ You can read more about TF-IDF on its [Wikipedia page](https://en.wikipedia.org/
 
 ## How does the tag corrector work?
 
-We
+We collect the tag sets from over 4 million e621 posts, treating the tag set from each image as an individual document.
 We then randomly replace about 10% of the tags in each document with a randomly selected alias from e621's list of aliases for the tag
 (e.g. "canine" gets replaced with one of {k9,canines,mongrel,cannine,cnaine,feral_canine,anthro_canine}).
 We then train a FastText (https://fasttext.cc/) model on the documents. The result of this training is a function that maps arbitrary words to vectors such that
 the vector for a tag and the vectors for its aliases are all close together (because the model has seen them in similar contexts).
-Since the lists of aliases contain misspellings and rephrasings of tags, the model should be robust to these kinds of problems.
+Since the lists of aliases contain misspellings and rephrasings of tags, the model should be robust to these kinds of problems as long as they are not too dissimilar from the alias lists.
 """
 
 
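The description in that hunk is the whole training trick, so a minimal sketch may help readers who want to reproduce it. Everything below is illustrative only: the function name, the alias table, and the 10% rate as a constant are assumptions standing in for the Space's real training pipeline, which is not part of this commit.

import random

# Hypothetical sketch of the augmentation described above: each post's tag set
# is one document, and roughly 10% of its tags are swapped for a randomly
# chosen alias before the FastText model is trained on the documents.
def augment_document(tags, aliases, replace_prob=0.10):
    augmented = []
    for tag in tags:
        if tag in aliases and random.random() < replace_prob:
            augmented.append(random.choice(aliases[tag]))  # e.g. "canine" -> "k9"
        else:
            augmented.append(tag)
    return augmented

aliases = {"canine": ["k9", "canines", "mongrel", "cannine", "cnaine", "feral_canine", "anthro_canine"]}
print(augment_document(["canine", "outside", "detailed_background"], aliases))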
@@ -92,6 +95,9 @@ plain: /([^,\\\[\]():|]|\\.)+/
 parser = Lark(grammar, start='start')
 
 
+special_tags = ["score:0", "score:1", "score:2", "score:3", "score:4", "score:5", "score:6", "score:7", "score:8", "score:9"]
+
+
 # Function to extract tags
 def extract_tags(tree):
     tags = []
@@ -107,21 +113,43 @@ def extract_tags(tree):
 
 
 # Load the model and data once at startup
-with h5py.File('
-# Deserialize the vectorizer
+with h5py.File('pca_reduced_artist_data.hdf5', 'r') as f:
     vectorizer_bytes = f['vectorizer'][()].tobytes()
+    # Use io.BytesIO to convert bytes back to a file-like object for joblib to load
     vectorizer_buffer = BytesIO(vectorizer_bytes)
     vectorizer = load(vectorizer_buffer)
 
-    #
-
-
-
+    # Assuming you've saved the PCA mean, components, and the transformed X_artist matrix in the file
+    pca_mean = f['pca_mean'][:]
+    pca_components = f['pca_components'][:]
+    X_artist_reduced = f['X_artist_reduced'][:]
     artist_names = [name.decode() for name in f['artist_names'][:]]
+    # Recreate PCA transformation (not the exact PCA object but its transformation ability)
+    def pca_transform(X):
+        return (X - pca_mean) @ pca_components.T
+
+
+with h5py.File('conditional_tag_probabilities_matrix.h5', 'r') as f:
+    # Reconstruct the sparse co-occurrence matrix
+    conditional_co_occurrence_matrix = csr_matrix(
+        (f['co_occurrence_data'][:], f['co_occurrence_indices'][:], f['co_occurrence_indptr'][:]),
+        shape=f['co_occurrence_shape'][:]
+    )
+
+    # Reconstruct the vocabulary
+    conditional_words = f['vocabulary_words'][:]
+    conditional_indices = f['vocabulary_indices'][:]
+    conditional_vocabulary = {key.decode('utf-8'): value for key, value in zip(conditional_words, conditional_indices)}
+
+    # Load the document count
+    conditional_doc_count = f['doc_count'][()]
+    conditional_smoothing = 100. / conditional_doc_count
+
 
 def clean_tag(tag):
     return ''.join(char for char in tag if ord(char) < 128)
 
+
 #Normally returns tag to aliases, but when reverse=True, returns alias to tags
 def build_aliases_dict(filename, reverse=False):
     aliases_dict = {}
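The hunk above rebuilds a scipy csr_matrix from its raw data/indices/indptr/shape arrays stored in HDF5. For anyone regenerating conditional_tag_probabilities_matrix.h5, here is a minimal round trip of that pattern; the toy matrix and the example file name are assumptions, and only the dataset names mirror the commit.

import h5py
import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[1, 0, 2], [0, 3, 0]]))

# Write the CSR components as plain arrays (illustrative file name)
with h5py.File('example_sparse.h5', 'w') as f:
    f.create_dataset('co_occurrence_data', data=m.data)
    f.create_dataset('co_occurrence_indices', data=m.indices)
    f.create_dataset('co_occurrence_indptr', data=m.indptr)
    f.create_dataset('co_occurrence_shape', data=m.shape)

# Read them back and reassemble the matrix, as the startup code above does
with h5py.File('example_sparse.h5', 'r') as f:
    m2 = csr_matrix(
        (f['co_occurrence_data'][:], f['co_occurrence_indices'][:], f['co_occurrence_indptr'][:]),
        shape=f['co_occurrence_shape'][:]
    )

assert (m != m2).nnz == 0  # same contents after the round trip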
@@ -138,7 +166,52 @@ def build_aliases_dict(filename, reverse=False):
     return aliases_dict
 
 
-def find_similar_tags(test_tags):
+#Imagine we are adding smoothing_value to the number of times word_j occurs in each document for smoothing.
+#Note the intention is that sum_i(P(word_i|word_j)) =(approx) # of words in a document rather than 1.
+def conditional_probability(word_i, word_j, co_occurrence_matrix, vocabulary, doc_count, smoothing_value=0.01):
+    word_i_index = vocabulary.get(word_i)
+    word_j_index = vocabulary.get(word_j)
+
+    if word_i_index is not None and word_j_index is not None:
+        # Directly access the sparse matrix elements
+        word_j_count = co_occurrence_matrix[word_j_index, word_j_index]
+        smoothed_word_j_count = word_j_count + (smoothing_value * doc_count)
+
+        word_i_count = co_occurrence_matrix[word_i_index, word_i_index]
+
+        co_occurrence_count = co_occurrence_matrix[word_i_index, word_j_index]
+        smoothed_co_occurrence_count = co_occurrence_count + (smoothing_value * word_i_count)
+
+        # Calculate the conditional probability with smoothing
+        conditional_prob = smoothed_co_occurrence_count / smoothed_word_j_count
+
+        return conditional_prob
+    elif word_i_index is None:
+        return 0
+    else:
+        return None
+
+
+#geometric_mean_given_words(target_word, context_words, conditional_co_occurrence_matrix, conditional_vocabulary, conditional_doc_count, smoothing_value=conditional_smoothing):
+def geometric_mean_given_words(target_word, context_words, co_occurrence_matrix, vocabulary, doc_count, smoothing_value=0.01):
+    probabilities = []
+
+    # Collect the conditional probabilities of the target word given each context word, ignoring None values
+    for context_word in context_words:
+        prob = conditional_probability(target_word, context_word, co_occurrence_matrix, vocabulary, doc_count, smoothing_value)
+        if prob is not None:
+            probabilities.append(prob)
+
+    # Compute the geometric mean of the probabilities, avoiding division by zero
+    if probabilities: # Check if the list is not empty
+        geometric_mean = np.prod(probabilities) ** (1.0 / len(probabilities))
+    else:
+        geometric_mean = 0.5 # Or assign some default value if all probabilities are None
+
+    return geometric_mean
+
+
+def find_similar_tags(test_tags, similarity_weight):
 
     #Initialize stuff
     if not hasattr(find_similar_tags, "fasttext_small_model"):
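To make the smoothing in conditional_probability concrete, here is a toy walk-through; every count below is invented for illustration, and the diagonal of the co-occurrence matrix is assumed to hold per-word document counts, as the code above implies.

# Invented toy counts, following conditional_probability above
doc_count = 1000
smoothing_value = 0.01
word_j_count = 200          # documents containing word_j (diagonal entry)
word_i_count = 50           # documents containing word_i (diagonal entry)
co_occurrence_count = 20    # documents containing both

smoothed_word_j_count = word_j_count + smoothing_value * doc_count              # 200 + 10 = 210
smoothed_co_occurrence = co_occurrence_count + smoothing_value * word_i_count   # 20 + 0.5 = 20.5
p_i_given_j = smoothed_co_occurrence / smoothed_word_j_count                    # ~0.098

# geometric_mean_given_words then takes the geometric mean of such probabilities
# over the context tags, e.g. with a second context tag contributing 0.2:
geometric_mean = (p_i_given_j * 0.2) ** (1 / 2)                                  # ~0.14

Since the app passes smoothing_value=conditional_smoothing, which is set to 100. / conditional_doc_count at load time, the added mass in the denominator works out to a flat 100 pseudo-documents regardless of corpus size.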
@@ -149,12 +222,16 @@ def find_similar_tags(test_tags):
     if not hasattr(find_similar_tags, "alias2tags"):
         find_similar_tags.alias2tags = build_aliases_dict(tag_aliases_file, reverse=True)
 
-
+    transformed_tags = [tag.replace(' ', '_') for tag in test_tags]
+
     # Find similar tags and prepare data for dataframe.
     results_data = []
     for tag in test_tags:
+        if tag in special_tags:
+            continue
+
         modified_tag_for_search = tag.replace(' ','_')
-        similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search)
+        similar_words = find_similar_tags.fasttext_small_model.most_similar(modified_tag_for_search, topn = 100)
         result, seen = [], set()
 
         if modified_tag_for_search in find_similar_tags.tag2aliases:
@@ -176,7 +253,15 @@ def find_similar_tags(test_tags):
             result.append((similar_tag.replace('_', ' '), round(similarity, 3)))
             seen.add(similar_tag)
 
+        #Adjust score based on context
+        for i in range(len(result)):
+            word, score = result[i] # Unpack the tuple
+            geometric_mean = geometric_mean_given_words(word.replace(' ','_'), [context_tag for context_tag in transformed_tags if context_tag != word and context_tag != tag], conditional_co_occurrence_matrix, conditional_vocabulary, conditional_doc_count, smoothing_value=conditional_smoothing)
+            adjusted_score = (similarity_weight * geometric_mean) + ((1-similarity_weight)*score) # Apply the adjustment function
+            result[i] = (word, adjusted_score) # Update the tuple with the adjusted score
+
         # Append tag and formatted similar tags to results_data
+        result = sorted(result, key=lambda x: x[1], reverse=True)[:10]
         first_entry_for_tag = True
         for word, sim in result:
             if first_entry_for_tag:
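The adjusted score above is a plain linear blend controlled by the new "Similarity weight" slider. With invented numbers:

similarity_weight = 0.5
score = 0.8            # FastText similarity of the candidate tag
geometric_mean = 0.1   # contextual probability given the other prompt tags
adjusted_score = (similarity_weight * geometric_mean) + ((1 - similarity_weight) * score)  # 0.45

So sliding the weight toward 1 favors candidates that actually co-occur with the rest of the prompt, while 0 falls back to raw embedding similarity; only the top 10 blended scores are kept per tag.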
@@ -191,7 +276,7 @@ def find_similar_tags(test_tags):
 
     return results_data # Return list of lists for Dataframe
 
-def find_similar_artists(new_tags_string, top_n):
+def find_similar_artists(new_tags_string, top_n, similarity_weight):
     try:
         new_tags_string = new_tags_string.lower()
         # Parse the prompt
@@ -201,17 +286,17 @@ def find_similar_artists(new_tags_string, top_n):
         new_image_tags = [tag.replace('_', ' ').strip() for tag in new_image_tags]
 
         ###unseen_tags = list(set(OrderedDict.fromkeys(new_image_tags)) - set(vectorizer.vocabulary_.keys())) #We may want this line again later. These are the tags that were not used to calculate the artists list.
-        unseen_tags_data = find_similar_tags(new_image_tags)
+        unseen_tags_data = find_similar_tags(new_image_tags, similarity_weight)
+
+        X_new_image_transformed = pca_transform(vectorizer.transform([','.join(new_image_tags)]))
+        similarities = cosine_similarity(np.asarray(X_new_image_transformed), np.asarray(X_artist_reduced))[0]
 
-        X_new_image = vectorizer.transform([','.join(new_image_tags)])
-        similarities = cosine_similarity(X_new_image, X_artist)[0]
-
         top_artist_indices = np.argsort(similarities)[-top_n:][::-1]
         top_artists = [(artist_names[i], similarities[i]) for i in top_artist_indices]
-
+
         top_artists_str = "\n".join([f"{rank+1}. {artist[3:]} ({score:.4f})" for rank, (artist, score) in enumerate(top_artists)])
         dynamic_prompts_formatted_artists = "{" + "|".join([artist for artist, _ in top_artists]) + "}"
-
+
         return unseen_tags_data, top_artists_str, dynamic_prompts_formatted_artists
     except ParseError as e:
         return [], "Parse Error: Check for mismatched parentheses or something", ""
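One step in the hunk above deserves a note: pca_transform hand-rolls the projection instead of unpickling a PCA object, which works because scikit-learn's PCA.transform (with whiten=False) is just (X - mean_) @ components_.T. A quick self-contained check on random data, purely illustrative:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(100, 20)
pca = PCA(n_components=5).fit(X)

# Same projection the commit applies to the prompt's TF-IDF vector
manual = (X - pca.mean_) @ pca.components_.T
assert np.allclose(manual, pca.transform(X))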
@@ -221,7 +306,8 @@ iface = gr.Interface(
     fn=find_similar_artists,
     inputs=[
         gr.Textbox(label="Enter image tags", placeholder="e.g. fox, outside, detailed background, ..."),
-        gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of artists")
+        gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of artists"),
+        gr.Slider(minimum=0, maximum=1, value=0.5, step=0.1, label="Similarity weight")
     ],
     outputs=[
         gr.Dataframe(label="Unseen Tags", headers=["Tag", "Similar Tags", "Similarity"]),
requirements.txt
CHANGED
@@ -5,3 +5,4 @@ h5py==3.8.0
 joblib==1.2.0
 compress-fasttext
 lark-parser
+scipy