import os
import gc
import io
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
from tqdm import tqdm
from dotenv import load_dotenv
from pydantic import BaseModel
from huggingface_hub import hf_hub_download, login
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import uvicorn
import psutil
import torch

load_dotenv()

app = FastAPI()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN:
    login(token=HUGGINGFACE_TOKEN)

global_data = {
    'model_configs': [
        {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
        {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
        {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
        {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
        {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
        {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
        {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
        {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
        {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
        {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
        {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
        {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
        {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
        {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
        {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
        {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
        {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
        {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
        {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
    ]
}
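
# ModelManager eagerly downloads each GGUF listed in global_data['model_configs']
# from the Hugging Face Hub and keeps the loaded llama.cpp handles in memory;
# entries that fail to download or load are stored as None and skipped at request time.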
class ModelManager:
    def __init__(self):
        self.models = {}
        self.load_models()

    def load_models(self):
        for config in tqdm(global_data['model_configs'], desc="Loading models"):
            model_name = config['name']
            if model_name not in self.models:
                try:
                    model_path = hf_hub_download(
                        repo_id=config['repo_id'],
                        filename=config['filename'],
                        token=HUGGINGFACE_TOKEN,
                    )
                    # n_gpu_layers=0 keeps inference on the CPU; raise it if a
                    # CUDA/Metal build of llama-cpp-python is available.
                    model = Llama(model_path=model_path, n_ctx=512, n_gpu_layers=0)
                    self.models[model_name] = model
                except Exception as e:
                    print(f"Failed to load {model_name}: {e}")
                    self.models[model_name] = None
                finally:
                    gc.collect()

    def get_model(self, model_name: str):
        return self.models.get(model_name)

model_manager = ModelManager()

class ChatRequest(BaseModel):
    message: str

def generate_model_response(model, inputs: str) -> str:
    # Runs synchronously because it is executed on ThreadPoolExecutor workers.
    try:
        if model:
            response = model(inputs, max_tokens=150)
            return response['choices'][0]['text'].strip()
        return "Model not loaded"
    except Exception as e:
        return f"Error: Could not generate a response. Details: {e}"

async def process_message(message: str) -> dict:
    inputs = message.strip()
    responses = {}

    # Map each future back to its model name so results are attributed correctly
    # regardless of the order in which they complete.
    loaded_configs = [c for c in global_data['model_configs'] if model_manager.get_model(c['name'])]
    with ThreadPoolExecutor(max_workers=min(len(global_data['model_configs']), 4)) as executor:
        future_to_name = {
            executor.submit(generate_model_response, model_manager.get_model(config['name']), inputs): config['name']
            for config in loaded_configs
        }
        for future in tqdm(as_completed(future_to_name), total=len(future_to_name), desc="Generating responses"):
            model_name = future_to_name[future]
            try:
                responses[model_name] = future.result()
            except Exception as e:
                responses[model_name] = f"Error processing {model_name}: {e}"

    if not responses:
        return {"best_response": None, "all_responses": {}}

    # Rank the replies by TF-IDF cosine similarity to the original message and
    # report the closest one as the best response.
    nlp = spacy.load("en_core_web_sm")

    def custom_tokenizer(text):
        doc = nlp(text)
        return [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

    vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
    reference_text = message
    response_texts = list(responses.values())
    tfidf_matrix = vectorizer.fit_transform([reference_text] + response_texts)
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    best_response_index = similarities.argmax()
    best_response_model = list(responses.keys())[best_response_index]
    best_response_text = response_texts[best_response_index]

    return {"best_response": {"model": best_response_model, "text": best_response_text}, "all_responses": responses}
# NOTE: the exact route path below is an assumption.
@app.post("/generate_multimodel")
async def api_generate_multimodel(request: Request):
    try:
        data = await request.json()
        message = data.get("message")
        if not message:
            raise HTTPException(status_code=400, detail="Missing message")
        response = await process_message(message)
        return JSONResponse(response)
    except HTTPException as e:
        raise e
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)

# Application lifecycle hooks; the on_event registrations are assumed, since
# without them these handlers would never run.
@app.on_event("startup")
async def startup_event():
    pass

@app.on_event("shutdown")
async def shutdown_event():
    gc.collect()

def release_resources():
    try:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
    except Exception:
        pass
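
# resource_manager runs on a daemon thread (started in the __main__ block below)
# and periodically frees caches or lowers the process priority whenever RAM, CPU,
# or GPU memory usage exceeds the thresholds defined at the top of the function.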
def resource_manager():
    MAX_RAM_PERCENT = 20
    MAX_CPU_PERCENT = 20
    MAX_GPU_PERCENT = 20
    MAX_RAM_MB = 2048

    while True:
        try:
            virtual_mem = psutil.virtual_memory()
            current_ram_percent = virtual_mem.percent
            current_ram_mb = virtual_mem.used / (1024 * 1024)
            if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
                release_resources()

            # cpu_percent(interval=5) blocks for five seconds, which both measures
            # CPU usage over that window and throttles this monitoring loop.
            current_cpu_percent = psutil.cpu_percent(interval=5)
            if current_cpu_percent > MAX_CPU_PERCENT:
                # Lower the process priority (19 is the lowest Unix niceness).
                psutil.Process(os.getpid()).nice(19)

            if torch.cuda.is_available():
                gpu = torch.cuda.current_device()
                total_mem = torch.cuda.get_device_properties(gpu).total_memory
                gpu_mem_percent = torch.cuda.memory_allocated(gpu) / total_mem * 100
                if gpu_mem_percent > MAX_GPU_PERCENT:
                    release_resources()
        except Exception:
            pass

if __name__ == "__main__":
    import threading

    resource_thread = threading.Thread(target=resource_manager)
    resource_thread.daemon = True
    resource_thread.start()

    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)