johannoriel committed on
Commit 645a356
1 Parent(s): d7dc2a6
Files changed (2):
  1. plugins/ragllm.py +76 -19
  2. requirements.txt +1 -1
plugins/ragllm.py CHANGED
@@ -10,10 +10,18 @@ from typing import List, Dict, Any
 import requests
 import torch
 from transformers import AutoTokenizer, AutoModel
+from huggingface_hub import InferenceClient
+from langchain_huggingface import HuggingFaceEmbeddings

 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 MAX_LENGTH = 512
-CHUNK_SIZE = 200 # Nombre de mots par chunk
+CHUNK_SIZE = 200
+
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output[0]
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+

 def mean_pooling(model_output, attention_mask):
     token_embeddings = model_output[0]
@@ -43,6 +51,7 @@ translations["en"].update({
     "rag_error_fetching_models_ollama": "Error fetching Ollama models: ",
     "rag_error_calling_llm": "Error calling LLM: ",
     "rag_processing" : "Processing...",
+    "rag_hf_api_key": "HuggingFace API Token",
 })

 translations["fr"].update({
@@ -67,28 +76,36 @@ translations["fr"].update({
     "rag_error_fetching_models_ollama": "Erreur lors de la récupération des modèles Ollama : ",
     "rag_error_calling_llm": "Erreur lors de l'appel au LLM : ",
     "rag_processing" : "En cours de traitement...",
+    "rag_hf_api_key": "Token API HuggingFace",
 })

 class RagllmPlugin(Plugin):
     def __init__(self, name: str, plugin_manager):
         super().__init__(name, plugin_manager)
-        self.config = self.load_llm_config()
+        try:
+            self.config = self.load_llm_config()
+        except:
+            self.config = {}
         self.embeddings = None
         self.chunks = None
+        self.hf_client = None

     def load_llm_config(self) -> Dict:
-        with open('.llm-config.yml', 'r') as file:
-            return yaml.safe_load(file)
+        try:
+            with open('.llm-config.yml', 'r') as file:
+                return yaml.safe_load(file)
+        except:
+            return {}

     def get_tabs(self):
         return [{"name": "RAG", "plugin": "ragllm"}]

     def get_config_fields(self):
-        return {
+        fields = {
             "provider": {
                 "type": "select",
                 "label": t("rag_model_provider"),
-                "options": [("ollama", "Ollama"), ("groq", "Groq")],
+                "options": [("ollama", "Ollama"), ("groq", "Groq"), ("huggingface", "HuggingFace")],
                 "default": "ollama"
             },
             "llm_model": {
@@ -132,6 +149,15 @@ class RagllmPlugin(Plugin):
                 "default": 3
             }
         }
+        # Add HuggingFace API key field if provider is huggingface
+        if 'provider' in self.config and self.config.get('provider') == 'huggingface':
+            fields["hf_api_key"] = {
+                "type": "password",
+                "label": t("rag_hf_api_key"),
+                "default": ""
+            }
+
+        return fields

     def get_config_ui(self, config):
         updated_config = {}
@@ -201,6 +227,8 @@
             return ["ollama/qwen2"]
         elif provider == 'groq':
             return ["groq/llama3-70b-8192", "groq/mixtral-8x7b-32768"]
+        elif provider == 'huggingface':
+            return ["HuggingFaceH4/zephyr-7b-beta"]
         else:
             return ["none"]

@@ -211,12 +239,23 @@
         self.embeddings = np.vstack([self.get_embedding(c, embedder) for c in self.chunks])

     def get_embedding(self, text: str, model: str) -> np.ndarray:
-        tokenizer = AutoTokenizer.from_pretrained(model)
-        model = AutoModel.from_pretrained(model, trust_remote_code=True).to(DEVICE)
-        inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(DEVICE)
-        with torch.no_grad():
-            model_output = model(**inputs)
-        return mean_pooling(model_output, inputs['attention_mask']).cpu().numpy()
+        if self.config.get('provider') == 'huggingface':
+            if not hasattr(self, 'hf_embeddings'):
+                self.hf_embeddings = HuggingFaceEmbeddings(
+                    model_name=model,
+                    task="feature-extraction",
+                    encode_kwargs={'normalize': True}
+                )
+            embedding = self.hf_embeddings.embed_query(text)
+            return np.array(embedding).reshape(1, -1)
+        else:
+            # Original embedding logic
+            tokenizer = AutoTokenizer.from_pretrained(model)
+            model = AutoModel.from_pretrained(model, trust_remote_code=True).to(DEVICE)
+            inputs = tokenizer(text, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(DEVICE)
+            with torch.no_grad():
+                model_output = model(**inputs)
+            return mean_pooling(model_output, inputs['attention_mask']).cpu().numpy()

     def calculate_similarity(self, query_embedding: np.ndarray, method: str) -> np.ndarray:
         if method == 'cosine':
@@ -238,13 +277,31 @@
     def call_llm(self, prompt: str, sysprompt: str) -> str:
         try:
             llm_model = st.session_state.ragllm_llm_model
-            #print(f"---------------------------------------\nCalling LLM {llm_model} \n with sysprompt {sysprompt} \n and prompt {prompt} \n and context len of {len(context)}")
-            messages = [
-                {"role": "system", "content": sysprompt},
-                {"role": "user", "content": prompt}
-            ]
-            response = completion(model=llm_model, messages=messages)
-            return response['choices'][0]['message']['content']
+            if self.config.get('provider') == 'huggingface':
+                if not self.hf_client:
+                    self.hf_client = InferenceClient(token=self.config.get('hf_api_key'))
+
+                messages = [
+                    {"role": "system", "content": sysprompt},
+                    {"role": "user", "content": prompt}
+                ]
+
+                response = self.hf_client.text_generation(
+                    model=llm_model,
+                    prompt=prompt,
+                    max_new_tokens=512,
+                    temperature=0.7,
+                    stream=False
+                )
+                return response
+            else:
+                messages = [
+                    {"role": "system", "content": sysprompt},
+                    {"role": "user", "content": prompt}
+                ]
+                response = completion(model=llm_model, messages=messages)
+                return response['choices'][0]['message']['content']
+
         except Exception as e:
             return f"{t('rag_error_calling_llm')}{str(e)}"
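For reference, a minimal standalone sketch of the two HuggingFace paths this commit wires in: embeddings through langchain_huggingface and text generation through huggingface_hub's InferenceClient. The model names and the HF_TOKEN environment variable below are illustrative assumptions, not values taken from the plugin.

import os
import numpy as np
from huggingface_hub import InferenceClient
from langchain_huggingface import HuggingFaceEmbeddings

# Local embedding model (assumed name); sentence-transformers normalizes the vectors here.
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    encode_kwargs={"normalize_embeddings": True},
)
doc_vec = np.array(embedder.embed_query("RAG retrieves relevant chunks before calling the LLM.")).reshape(1, -1)
query_vec = np.array(embedder.embed_query("What does RAG do?")).reshape(1, -1)
print("cosine similarity:", float(doc_vec @ query_vec.T))  # dot product equals cosine for normalized vectors

# Hosted text generation (token assumed to be exported as HF_TOKEN).
client = InferenceClient(token=os.environ["HF_TOKEN"])
answer = client.text_generation(
    "Explain retrieval-augmented generation in one sentence.",
    model="HuggingFaceH4/zephyr-7b-beta",
    max_new_tokens=128,
    temperature=0.7,
)
print(answer)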
requirements.txt CHANGED
@@ -9,4 +9,4 @@ PyDictionary
 matplotlib
 litellm
 sentencepiece
-
+langchain_huggingface
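The plugin also reads a .llm-config.yml next to the app (loaded with yaml.safe_load, now falling back to an empty dict when the file is missing). The commit does not define the file's schema; the sketch below is hypothetical and only uses the two keys the new code actually reads, 'provider' and 'hf_api_key', with a placeholder token.

import yaml

example_config = """\
provider: huggingface
hf_api_key: hf_xxxxxxxxxxxxxxxx  # placeholder token, not a real credential
"""

# Write the hypothetical config, then load it the same way load_llm_config does.
with open(".llm-config.yml", "w") as f:
    f.write(example_config)

with open(".llm-config.yml", "r") as f:
    config = yaml.safe_load(f) or {}

print(config.get("provider"), "token set:", bool(config.get("hf_api_key")))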