Update app.py
app.py CHANGED

@@ -1,8 +1,10 @@
-
+import spaces
 import torch
 import torch.nn.functional as F
 from torch import Tensor
 from transformers import AutoTokenizer, AutoModel
+import threading
+import queue
 import gradio as gr
 import os

@@ -33,6 +35,12 @@ tasks = {
     'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
 }

+
+# Global queue for embedding requests
+embedding_request_queue = queue.Queue()
+embedding_response_queue = queue.Queue()
+
+
 tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
 model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct', torch_dtype=torch.float16, device_map=device)

@@ -56,15 +64,33 @@ def load_corpus_from_json(file_path):
     with open(file_path, 'r') as file:
         data = json.load(file)
     return data
-
-
+
+
+def embedding_worker():
+    while True:
+        # Wait for an item in the queue
+        item = embedding_request_queue.get()
+        if item is None:
+            break
+        selected_task, input_text = item
+        embeddings = compute_embeddings(selected_task, input_text)
+        formatted_response = format_response(embeddings)
+
+        embedding_response_queue.put(formatted_response)
+        embedding_request_queue.task_done()
+        clear_cuda_cache()
+
+threading.Thread(target=embedding_worker, daemon=True).start()
+
+
+@spaces.GPU
 def compute_embeddings(selected_task, input_text):
     try:
         task_description = tasks[selected_task]
     except KeyError:
         print(f"Selected task not found: {selected_task}")
         return f"Error: Task '{selected_task}' not found. Please select a valid task."
-    max_length =
+    max_length = 2048
     processed_texts = [f'Instruct: {task_description}\nQuery: {input_text}']

     batch_dict = tokenizer(processed_texts, max_length=max_length - 1, return_attention_mask=False, padding=False, truncation=True)

@@ -75,9 +101,20 @@ def compute_embeddings(selected_task, input_text):
     embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
     embeddings = F.normalize(embeddings, p=2, dim=1)
     embeddings_list = embeddings.detach().cpu().numpy().tolist()
+    clear_cuda_cache()
     return embeddings_list

-
+@spaces.GPU
+def decode_embedding(embedding_str):
+    try:
+        embedding = [float(num) for num in embedding_str.split(',')]
+        embedding_tensor = torch.tensor(embedding, dtype=torch.float16, device=device)
+        decoded_embedding = tokenizer.decode(embedding_tensor[0], skip_special_tokens=True)
+        return decoded_embedding.cpu().numpy().tolist()
+    except Exception as e:
+        return f"Error in decoding: {str(e)}"
+
+@spaces.GPU
 def compute_similarity(selected_task, sentence1, sentence2, extra_sentence1, extra_sentence2):
     try:
         task_description = tasks[selected_task]

@@ -105,17 +142,20 @@ def compute_similarity(selected_task, sentence1, sentence2, extra_sentence1, ext
     free_memory(embeddings1, embeddings2, embeddings3, embeddings4)

     similarity_scores = {"Similarity 1-2": similarity1, "Similarity 1-3": similarity2, "Similarity 1-4": similarity3}
+    clear_cuda_cache()
     return similarity_scores

-
+@spaces.GPU
 def compute_cosine_similarity(emb1, emb2):
     tensor1 = torch.tensor(emb1).to(device).half()
     tensor2 = torch.tensor(emb2).to(device).half()
     similarity = F.cosine_similarity(tensor1, tensor2).item()
     free_memory(tensor1, tensor2)
+    clear_cuda_cache()
     return similarity


+@spaces.GPU
 def compute_embeddings_batch(input_texts):
     max_length = 2042
     processed_texts = [f'Instruct: {task_description}\nQuery: {text}' for text in input_texts]

@@ -127,6 +167,7 @@ def compute_embeddings_batch(input_texts):
     outputs = model(**batch_dict)
     embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
     embeddings = F.normalize(embeddings, p=2, dim=1)
+    clear_cuda_cache()
     return embeddings.detach().cpu().numpy()

 def semantic_search(query_embedding, corpus_embeddings, top_k=5):

@@ -140,6 +181,31 @@ def search_similar_sentences(input_question, corpus_sentences, corpus_embeddings
     results = [(corpus_sentences[i], top_k_scores[i]) for i in top_k_indices]
     return results

+# openai response object formatting
+def format_response(embeddings):
+    return {
+        "data": [
+            {
+                "embedding": embeddings,
+                "index": 0,
+                "object": "embedding"
+            }
+        ],
+        "model": "e5-mistral",
+        "object": "list",
+        "usage": {
+            "prompt_tokens": 17,
+            "total_tokens": 17
+        }
+    }
+
+def generate_and_format_embeddings(selected_task, input_text):
+    embedding_request_queue.put((selected_task, input_text))
+    response = embedding_response_queue.get()
+    embedding_response_queue.task_done()
+    clear_cuda_cache()
+    return response
+

 def app_interface():
     corpus_sentences = []

@@ -210,6 +276,31 @@ def app_interface():
             outputs=search_results_output
         )

+        with gr.Tab("Connector-like Embeddings"):
+            with gr.Row():
+                input_text_box_connector = gr.Textbox(label="Input Text", placeholder="Enter text or array of texts")
+                model_dropdown_connector = gr.Dropdown(label="Model", choices=["ArguAna", "ClimateFEVER", "DBPedia", "FEVER", "FiQA2018", "HotpotQA", "MSMARCO", "NFCorpus", "NQ", "QuoraRetrieval", "SCIDOCS", "SciFact", "Touche2020", "TRECCOVID"], value="text-embedding-ada-002")
+                encoding_format_connector = gr.Radio(label="Encoding Format", choices=["float", "base64"], value="float")
+                user_connector = gr.Textbox(label="User", placeholder="Enter user identifier (optional)")
+                submit_button_connector = gr.Button("Generate Embeddings")
+                output_display_connector = gr.JSON(label="Embeddings Output")
+            submit_button_connector.click(
+                fn=generate_and_format_embeddings,
+                inputs=[model_dropdown_connector, input_text_box_connector],
+                outputs=output_display_connector
+            )
+
+        # with gr.Tab("Decode Embedding"):
+        #     embedding_input = gr.Textbox(label="Enter Embedding (comma-separated floats)")
+        #     decode_button = gr.Button("Decode")
+        #     decoded_output = gr.Textbox(label="Decoded Embedding")
+        #
+        #     decode_button.click(
+        #         fn=decode_embedding,
+        #         inputs=embedding_input,
+        #         outputs=decoded_output
+        #     )
+
         with gr.Row():
             with gr.Column():
                 input_text_box

@@ -219,5 +310,5 @@ def app_interface():

     return demo

-
+app_interface().queue()
 app_interface().launch()
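
A note on the new code: `generate_and_format_embeddings` and `embedding_worker` hand work off through the two module-level queues, so the Gradio callback never touches the model directly. A minimal, self-contained sketch of that handshake (a stub stands in for the real `compute_embeddings`; all names below are illustrative, not from the commit):

    import threading
    import queue

    request_q = queue.Queue()
    response_q = queue.Queue()

    def worker():
        while True:
            item = request_q.get()
            if item is None:              # sentinel: shut the worker down
                request_q.task_done()
                break
            task, text = item
            response_q.put(f"embedding for {task!r}: {text!r}")  # stub for the model call
            request_q.task_done()

    threading.Thread(target=worker, daemon=True).start()

    request_q.put(("STS", "hello world"))
    print(response_q.get())               # blocks until the worker responds
    request_q.put(None)                   # stop the worker

Two caveats with the pattern as committed: the queues are global and unkeyed, so concurrent sessions can receive each other's responses; and `app_interface().queue()` followed by `app_interface().launch()` builds two separate Blocks instances, meaning the queued one is never launched (chaining `app_interface().queue().launch()` keeps both on the same instance). `decode_embedding` as written would also raise at runtime, since `tokenizer.decode` returns a `str`, which has no `.cpu()`, which is presumably why the "Decode Embedding" tab stays commented out.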