"""Gradio chat front-end for a local llama.cpp server running NeuralBeagle14-7B.

On start-up the script downloads the GGUF model, launches
``llama_cpp.server`` as a child process, waits until its port accepts
connections, and then serves a Gradio chat UI that forwards every message
to the server's ``/v1/completions`` endpoint.
"""

import json
import socket
import subprocess
import time

import gradio as gr
import requests

MODEL_URL = "https://huggingface.co/mlabonne/NeuralBeagle14-7B-GGUF/resolve/main/neuralbeagle14-7b.Q4_K_M.gguf?download=true"
MODEL_PATH = "./model.gguf"
SERVER_PORT = 2600
# Size of the pieces written to disk while downloading the model.
DOWNLOAD_CHUNK_BYTES = 1024 * 1024
# Prefix that precedes each JSON document when the server answers as an
# event stream instead of a single JSON body.
SSE_DATA_PREFIX = "data: "


def is_server_active(host, port):
    """Return True if a TCP connection to ``host:port`` can be established."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        # connect_ex reports failures as an error code instead of raising.
        return s.connect_ex((host, port)) == 0


# Download the model. The body is streamed to disk chunk by chunk so the
# whole file never has to be held in memory, and an HTTP error aborts the
# script instead of silently saving an error page as the model.
with requests.get(MODEL_URL, stream=True) as download:
    download.raise_for_status()
    with open(MODEL_PATH, mode="wb") as file:
        for chunk in download.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
            file.write(chunk)
print("Model downloaded")

# Launch the LLM server.
command = ["python3", "-m", "llama_cpp.server", "--model", MODEL_PATH, "--host", "0.0.0.0", "--port", str(SERVER_PORT), "--n_threads", "2"]
# Keep the process handle so the server can be terminated later.
server_process = subprocess.Popen(command)
print("Model server starting...")

# Wait until the server accepts connections. The probe targets the loopback
# address: 0.0.0.0 is a bind address and is not connectable on every platform.
while not is_server_active("127.0.0.1", SERVER_PORT):
    if server_process.poll() is not None:
        # The child already exited; waiting any longer would loop forever.
        raise RuntimeError(
            f"Model server exited with code {server_process.returncode} before becoming ready"
        )
    print("Waiting for server to start...")
    time.sleep(5)
print("Model server is ready!")


def _extract_completion_text(payload):
    """Return the concatenated ``choices[0].text`` found in ``payload``.

    ``payload`` is either one JSON document (the non-streaming reply) or a
    sequence of lines, each optionally prefixed with ``data: `` and holding
    one JSON document. Parsing stops at the first malformed entry and the
    text collected up to that point is returned.
    """
    body = payload.strip()
    # A body that starts with "{" is a single JSON document; anything else
    # is examined line by line.
    candidates = [body] if body.startswith("{") else body.splitlines()
    parts = []
    for candidate in candidates:
        candidate = candidate.strip()
        if candidate.startswith(SSE_DATA_PREFIX):
            # Strip only the leading marker; the same characters inside the
            # generated text must be left untouched.
            candidate = candidate[len(SSE_DATA_PREFIX):].strip()
        if not candidate or candidate == "[DONE]":
            continue
        if not candidate.startswith("{"):
            print("Respuesta no JSON:", candidate)
            break
        try:
            parts.append(json.loads(candidate)["choices"][0]["text"])
        except json.JSONDecodeError as e:
            print("Error al decodificar JSON:", e)
            break
        except (KeyError, IndexError, TypeError):
            # Valid JSON without the expected completion structure.
            print("Respuesta no JSON:", candidate)
            break
    return "".join(parts)


def response(message, history):
    """Generate the model's reply to ``message`` for the Gradio chat UI.

    Args:
        message: The text entered by the user.
        history: Earlier conversation turns supplied by Gradio; unused,
            every prompt is sent without context.

    Yields:
        The completion text returned by the server, or an empty string when
        the request fails or the reply cannot be parsed.
    """
    url = f"http://localhost:{SERVER_PORT}/v1/completions"
    body = {"prompt": "[INST]" + message + "[/INST]", "max_tokens": 300, "echo": False, "stream": False}
    response_text = ""
    try:
        # No timeout on purpose: generation can take arbitrarily long.
        with requests.post(url, json=body) as completion:
            completion.raise_for_status()
            # Decode the complete body at once so a multi-byte character or
            # a JSON document can never be split across network chunks.
            payload = completion.content.decode("utf-8")
        print("Respuesta cruda:", payload)  # Raw response, for debugging.
        response_text = _extract_completion_text(payload)
    except requests.exceptions.RequestException as e:
        print(f"Error al realizar la solicitud: {e}")
    except UnicodeDecodeError as e:
        print("Error al decodificar JSON:", e)
    yield response_text


def cleanup_server():
    """Stop the model server child process and wait for it to exit."""
    print("Closing server...")
    server_process.terminate()
    try:
        server_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        # The server ignored the termination request; force it down so the
        # script cannot hang on shutdown.
        server_process.kill()
        server_process.wait()
    print("Server closed.")


# Configure and launch the Gradio interface.
gr_interface = gr.ChatInterface(
    fn=response,
    title="NeuralBeagle 14-7b - By Maxime Labonne ❤️",
    theme='syddharth/gray-minimal'
)

try:
    gr_interface.launch(share=True)
finally:
    # Always shut the server down when the UI stops.
    cleanup_server()