Uhhy committed on
Commit b560d3f
1 Parent(s): 87928b2

Update app.py

Files changed (1)
  1. app.py +64 -47
app.py CHANGED
@@ -7,43 +7,54 @@ import uvicorn
 from dotenv import load_dotenv
 from difflib import SequenceMatcher
 import re
-from spaces import GPU
-import httpx
+import spaces

-# Load environment variables
 load_dotenv()

-# Initialize the FastAPI application
 app = FastAPI()

-# Global dictionary to store the models
 global_data = {
     'models': []
 }

-# Model configuration (including the new ones)
 model_configs = [
     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
-    # Other models omitted for space
+    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
+    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
+    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
+    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
+    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
+    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
+    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
+    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
+    {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
+    {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
+    {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
     {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
+    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
+    {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
+    {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
     {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
 ]

-# Class for managing the models
 class ModelManager:
     def __init__(self):
         self.models = []
+        self.loaded = False

     def load_model(self, model_config):
         print(f"Loading model: {model_config['name']}...")
         return {"model": Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename']), "name": model_config['name']}

-    @GPU(duration=0)
     def load_all_models(self):
+        if self.loaded:
+            print("Models are already loaded; there is no need to load them again.")
+            return self.models
+
         print("Starting to load models...")
-        with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
+        with ThreadPoolExecutor() as executor:
             futures = [executor.submit(self.load_model, config) for config in model_configs]
             models = []
             for future in tqdm(as_completed(futures), total=len(model_configs), desc="Loading models", unit="model"):
@@ -53,21 +64,23 @@ class ModelManager:
                     print(f"Model loaded successfully: {model['name']}")
                 except Exception as e:
                     print(f"Error loading model: {e}")
+
+        self.models = models
+        self.loaded = True
         print("All models have been loaded.")
-        return models
+        return self.models

-# Instantiate ModelManager and load the models only once
 model_manager = ModelManager()
+
 global_data['models'] = model_manager.load_all_models()

-# Global model for the chat request
 class ChatRequest(BaseModel):
     message: str
     top_k: int = 50
     top_p: float = 0.95
     temperature: float = 0.7

-# Function to generate chat responses
+@spaces.GPU(duration=0)
 def generate_chat_response(request, model_data):
     try:
         user_input = normalize_input(request.message)
@@ -104,40 +117,44 @@ def remove_repetitive_responses(responses):
             unique_responses.append(response)
     return unique_responses

-# Error handling during model initialization (trace mentioned in the error)
-def handle_initialization_error(allow_token):
-    try:
-        client = httpx.Client()
-        pid = 0  # Variable that simulates the current process
-        assert client.allow(allow_token=allow_token, pid=pid) == httpx.codes.OK
-    except AssertionError:
-        raise HTTPException(status_code=500, detail="Error initializing the Spaces client")
-
-# Route to generate chat across multiple models
-@app.post("/chat/")
-async def chat(request: ChatRequest):
-    try:
-        # Simulation of the `AssertionError` during initialization
-        allow_token = "test_token"
-        handle_initialization_error(allow_token)
-
-        with ThreadPoolExecutor() as executor:
-            futures = [executor.submit(generate_chat_response, request, model) for model in global_data['models']]
-            responses = [future.result() for future in as_completed(futures)]
-            unique_responses = remove_repetitive_responses(responses)
-            return {"responses": unique_responses}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error processing the request: {str(e)}")
-
-# Use of the `chat_template.default` template
-chat_template = """
-User: {message}
-Bot: {response}
-"""
+def select_best_response(responses):
+    print("Filtering responses...")
+    responses = remove_repetitive_responses(responses)
+    responses = [remove_duplicates(response['response']) for response in responses]
+    unique_responses = list(dict.fromkeys(responses))
+    sorted_responses = sorted(unique_responses, key=lambda r: len(r), reverse=True)
+    return sorted_responses[0]
+
+@app.post("/generate_chat")
+async def generate_chat(request: ChatRequest):
+    if not request.message.strip():
+        raise HTTPException(status_code=400, detail="The message cannot be empty.")
+
+    print(f"Processing request: {request.message}")
+
+    responses = []
+    num_models = len(global_data['models'])
+
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(generate_chat_response, request, model_data) for model_data in global_data['models']]
+        for future in tqdm(as_completed(futures), total=num_models, desc="Generating responses", unit="model"):
+            try:
+                response = future.result()
+                responses.append(response)
+            except Exception as exc:
+                print(f"Error generating a response: {exc}")
+
+    if not responses:
+        raise HTTPException(status_code=500, detail="Error: no responses were generated.")
+
+    best_response = select_best_response(responses)
+
+    print(f"Best response selected: {best_response}")

-# Chat response template
-def render_chat_template(message, response):
-    return chat_template.format(message=message, response=response)
+    return {
+        "best_response": best_response,
+        "all_responses": responses
+    }

 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=8000)
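
For quick verification of the /generate_chat route added in this commit, here is a minimal client sketch. It assumes the app is running locally on the port configured in uvicorn.run (8000); the request fields mirror the ChatRequest model above, and the prompt text is purely illustrative.

import json
from urllib.request import Request, urlopen

# Hypothetical local call to the /generate_chat endpoint added in this commit.
payload = {
    "message": "Summarize what this API does.",  # illustrative prompt
    "top_k": 50,          # ChatRequest defaults, shown explicitly
    "top_p": 0.95,
    "temperature": 0.7
}
req = Request(
    "http://localhost:8000/generate_chat",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urlopen(req) as resp:
    body = json.load(resp)

print(body["best_response"])        # single answer chosen by select_best_response
print(len(body["all_responses"]))   # one entry per model that responded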