Uhhy committed on
Commit aec004b
1 Parent(s): 95ed60f

Update app.py

Files changed (1)
  1. app.py +24 -31
app.py CHANGED
@@ -21,15 +21,15 @@ global_data = {
 
 # Configuración de los modelos
 model_configs = [
-    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf"},
-    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf"}
+    {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
+    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
+    {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
+    {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
+    {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
+    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
+    {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
+    {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
+    {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"}
 ]
 
 # Clase para gestionar modelos
@@ -38,8 +38,8 @@ class ModelManager:
         self.models = []
 
     def load_model(self, model_config):
-        print(f"Cargando modelo {model_config['repo_id']}...")
-        return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
+        print(f"Cargando modelo: {model_config['name']}...")
+        return {"model": Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename']), "name": model_config['name']}
 
     def load_all_models(self):
         print("Iniciando carga de modelos...")
@@ -50,7 +50,7 @@ class ModelManager:
             try:
                 model = future.result()
                 models.append(model)
-                print(f"Modelo cargado exitosamente: {model_configs[len(models)-1]['repo_id']}")
+                print(f"Modelo cargado exitosamente: {model['name']}")
             except Exception as e:
                 print(f"Error al cargar el modelo: {e}")
         print("Todos los modelos han sido cargados.")
@@ -68,9 +68,10 @@ class ChatRequest(BaseModel):
     temperature: float = 0.7
 
 # Función para generar respuestas de chat
-def generate_chat_response(request, llm):
+def generate_chat_response(request, model_data):
     try:
         user_input = normalize_input(request.message)
+        llm = model_data['model']
         response = llm.create_chat_completion(
             messages=[{"role": "user", "content": user_input}],
             top_k=request.top_k,
@@ -78,54 +79,46 @@ def generate_chat_response(request, llm):
             temperature=request.temperature
         )
         reply = response['choices'][0]['message']['content']
-        return {"response": reply, "literal": user_input}
+        return {"response": reply, "literal": user_input, "model_name": model_data['name']}
     except Exception as e:
-        return {"response": f"Error: {str(e)}", "literal": user_input}
+        return {"response": f"Error: {str(e)}", "literal": user_input, "model_name": model_data['name']}
 
 def normalize_input(input_text):
     return input_text.strip()
 
 def remove_duplicates(text):
-    # Eliminar patrones repetitivos específicos
     text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
     text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
-
-    # Eliminar el marcador [/INST]
     text = text.replace('[/INST]', '')
-
-    # Generaliza la eliminación de duplicados
     lines = text.split('\n')
     unique_lines = list(dict.fromkeys(lines))
     return '\n'.join(unique_lines).strip()
 
 def remove_repetitive_responses(responses):
-    # Filtra respuestas repetitivas
     seen = set()
     unique_responses = []
    for response in responses:
-        normalized_response = remove_duplicates(response)
+        normalized_response = remove_duplicates(response['response'])
         if normalized_response not in seen:
             seen.add(normalized_response)
-            unique_responses.append(normalized_response)
+            unique_responses.append(response)
     return unique_responses
 
 def select_best_response(responses):
     print("Filtrando respuestas...")
     responses = remove_repetitive_responses(responses)
-    responses = [remove_duplicates(response) for response in responses]
+    responses = [remove_duplicates(response['response']) for response in responses]
     unique_responses = list(set(responses))
     coherent_responses = filter_by_coherence(unique_responses)
     best_response = filter_by_similarity(coherent_responses)
     return best_response
 
 def filter_by_coherence(responses):
-    # Ordenar respuestas por longitud y similaridad para coherencia básica
     print("Ordenando respuestas por coherencia...")
     responses.sort(key=len, reverse=True)
     return responses
 
 def filter_by_similarity(responses):
-    # Seleccionar la respuesta más coherente y única
     print("Filtrando respuestas por similitud...")
     responses.sort(key=len, reverse=True)
     best_response = responses[0]
@@ -136,9 +129,9 @@ def filter_by_similarity(responses):
             break
     return best_response
 
-def worker_function(llm, request):
-    print(f"Generando respuesta con el modelo {llm}...")
-    response = generate_chat_response(request, llm)
+def worker_function(model_data, request):
+    print(f"Generando respuesta con el modelo: {model_data['name']}...")
+    response = generate_chat_response(request, model_data)
     return response
 
 @app.post("/generate_chat")
@@ -152,11 +145,11 @@ async def generate_chat(request: ChatRequest):
     num_models = len(global_data['models'])
 
     with ThreadPoolExecutor(max_workers=num_models) as executor:
-        futures = [executor.submit(worker_function, llm, request) for llm in global_data['models']]
+        futures = [executor.submit(worker_function, model_data, request) for model_data in global_data['models']]
        for future in tqdm(as_completed(futures), total=num_models, desc="Generando respuestas", unit="modelo"):
             try:
                 response = future.result()
-                responses.append(response['response'])
+                responses.append(response)
             except Exception as exc:
                 print(f"Error en la generación de respuesta: {exc}")
 
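
For reference, the sketch below illustrates the data flow this commit introduces: each loaded model is now wrapped as {"model": <Llama instance>, "name": <display name>}, and every generated result carries a "model_name" field. It is a minimal sketch, not part of app.py: FakeLlama, DemoRequest, and the __main__ driver are invented stand-ins so it runs without downloading any GGUF weights; only the two dict shapes and the ThreadPoolExecutor fan-out are taken from the diff above.

from concurrent.futures import ThreadPoolExecutor, as_completed


class FakeLlama:
    """Stand-in for a loaded llama_cpp.Llama instance (illustrative only)."""

    def __init__(self, tag):
        self.tag = tag

    def create_chat_completion(self, messages, top_k=40, top_p=0.9, temperature=0.7):
        # Mimic only the response shape that app.py reads:
        # response['choices'][0]['message']['content']
        return {"choices": [{"message": {"content": f"[{self.tag}] echo: {messages[0]['content']}"}}]}


class DemoRequest:
    """Mirrors the ChatRequest fields used by generate_chat_response."""
    message = "Hello there"
    top_k = 40
    top_p = 0.9
    temperature = 0.7


def generate_chat_response(request, model_data):
    # model_data is the new per-model dict: {"model": ..., "name": ...}
    llm = model_data["model"]
    user_input = request.message.strip()  # normalize_input() in app.py
    response = llm.create_chat_completion(
        messages=[{"role": "user", "content": user_input}],
        top_k=request.top_k,
        top_p=request.top_p,
        temperature=request.temperature,
    )
    reply = response["choices"][0]["message"]["content"]
    # Every result now carries the human-readable model name.
    return {"response": reply, "literal": user_input, "model_name": model_data["name"]}


if __name__ == "__main__":
    models = [
        {"model": FakeLlama("gpt2-xl"), "name": "GPT-2 XL"},
        {"model": FakeLlama("qwen2-1.5b"), "name": "Qwen2 1.5B Instruct"},
    ]
    request = DemoRequest()
    with ThreadPoolExecutor(max_workers=len(models)) as executor:
        futures = [executor.submit(generate_chat_response, request, m) for m in models]
        for future in as_completed(futures):
            result = future.result()
            print(f"{result['model_name']} -> {result['response']}")

Note that because the endpoint now appends the full result dict instead of response['response'], downstream filters receive dicts and must index ['response'] themselves, as the updated remove_repetitive_responses and select_best_response do in the diff.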