Uhhy committed
Commit 7fa4c88
1 Parent(s): dcc90f1

Create app.py

Files changed (1)
  1. app.py +157 -0
app.py ADDED
@@ -0,0 +1,157 @@
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from llama_cpp import Llama
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import uvicorn
+ import re
+ from dotenv import load_dotenv
+ import spaces
+
+ load_dotenv()
+
+ app = FastAPI()
+
+ # Shared state: the loaded models plus special-token name placeholders
+ # (the 'tokens' map is not referenced elsewhere in this file).
+ global_data = {
+     'models': {},
+     'tokens': {
+         'eos': 'eos_token',
+         'pad': 'pad_token',
+         'padding': 'padding_token',
+         'unk': 'unk_token',
+         'bos': 'bos_token',
+         'sep': 'sep_token',
+         'cls': 'cls_token',
+         'mask': 'mask_token'
+     }
+ }
+
+ # GGUF model repositories to pull from the Hugging Face Hub.
+ model_configs = [
+     {"repo_id": "Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "filename": "gpt2-xl-q2_k.gguf", "name": "GPT-2 XL"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-8B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "filename": "gemma-2-9b-it-q2_k.gguf", "name": "Gemma 2-9B IT"},
+     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "Gemma 2-27B"},
+     {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "filename": "phi-3-mini-128k-instruct-q2_k.gguf", "name": "Phi-3 Mini 128K Instruct"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "filename": "meta-llama-3.1-8b-q2_k.gguf", "name": "Meta Llama 3.1-8B"},
+     {"repo_id": "Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "filename": "qwen2-7b-instruct-q2_k.gguf", "name": "Qwen2 7B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "filename": "starcoder2-3b-q2_k.gguf", "name": "Starcoder2 3B"},
+     {"repo_id": "Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "filename": "qwen2-1.5b-instruct-q2_k.gguf", "name": "Qwen2 1.5B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "Meta Llama 3.1-70B"},
+     {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"},
+     {"repo_id": "Ffftdtd5dtft/Hermes-3-Llama-3.1-8B-IQ1_S-GGUF", "filename": "hermes-3-llama-3.1-8b-iq1_s-imat.gguf", "name": "Hermes 3 Llama 3.1-8B"},
+     {"repo_id": "Ffftdtd5dtft/Phi-3.5-mini-instruct-Q2_K-GGUF", "filename": "phi-3.5-mini-instruct-q2_k.gguf", "name": "Phi 3.5 Mini Instruct"},
+     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf", "name": "Meta Llama 3.1-70B Instruct"},
+     {"repo_id": "Ffftdtd5dtft/codegemma-2b-IQ1_S-GGUF", "filename": "codegemma-2b-iq1_s-imat.gguf", "name": "Codegemma 2B"},
+     {"repo_id": "Ffftdtd5dtft/Phi-3-mini-128k-instruct-IQ2_XXS-GGUF", "filename": "phi-3-mini-128k-instruct-iq2_xxs-imat.gguf", "name": "Phi 3 Mini 128K Instruct XXS"},
+     {"repo_id": "Ffftdtd5dtft/TinyLlama-1.1B-Chat-v1.0-IQ1_S-GGUF", "filename": "tinyllama-1.1b-chat-v1.0-iq1_s-imat.gguf", "name": "TinyLlama 1.1B Chat"},
+     {"repo_id": "Ffftdtd5dtft/Mistral-NeMo-Minitron-8B-Base-IQ1_S-GGUF", "filename": "mistral-nemo-minitron-8b-base-iq1_s-imat.gguf", "name": "Mistral NeMo Minitron 8B Base"},
+     # NOTE: duplicate of the Mistral Nemo Instruct 2407 entry above; as written it is loaded twice.
+     {"repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF", "filename": "mistral-nemo-instruct-2407-q2_k.gguf", "name": "Mistral Nemo Instruct 2407"}
+ ]
+
+ class ModelManager:
+     def __init__(self):
+         self.loaded = False
+
+     def load_model(self, model_config):
+         try:
+             return {"model": Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename']), "name": model_config['name']}
+         except Exception:
+             # Models that fail to download or initialize are skipped.
+             return None
+
+     def load_all_models(self):
+         if self.loaded:
+             return global_data['models']
+
+         try:
+             with ThreadPoolExecutor() as executor:
+                 futures = [executor.submit(self.load_model, config) for config in model_configs]
+                 models = []
+                 for future in as_completed(futures):
+                     model = future.result()
+                     if model:
+                         models.append(model)
+
+             global_data['models'] = models
+             self.loaded = True
+             return models
+         except Exception:
+             return None
+
+ model_manager = ModelManager()
+ model_manager.load_all_models()
+
+ class ChatRequest(BaseModel):
+     message: str
+     top_k: int = 50
+     top_p: float = 0.95
+     temperature: float = 0.7
+
+ def normalize_input(input_text):
+     return input_text.strip()
+
+ def remove_duplicates(text):
+     # Collapse repeated prompt fragments and instruction markers, then drop repeated lines.
+     text = re.sub(r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]', text)
+     text = re.sub(r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]', text)
+     text = text.replace('[/INST]', '')
+     lines = text.split('\n')
+     unique_lines = []
+     seen_lines = set()
+     for line in lines:
+         if line not in seen_lines:
+             seen_lines.add(line)
+             unique_lines.append(line)
+     return '\n'.join(unique_lines)
+
+ def remove_repetitive_responses(responses):
+     # Keep only the first response whose de-duplicated text has not been seen before.
+     seen = set()
+     unique_responses = []
+     for response in responses:
+         normalized_response = remove_duplicates(response['response'])
+         if normalized_response not in seen:
+             seen.add(normalized_response)
+             unique_responses.append(response)
+     return unique_responses
+
+ def generate_chat_response(request, model_data):
+     model = model_data['model']
+     try:
+         user_input = normalize_input(request.message)
+         output = model(user_input, top_k=request.top_k, top_p=request.top_p, temperature=request.temperature)
+         # llama-cpp-python returns a completion dict; keep only the generated text so the
+         # de-duplication helpers above receive a plain string.
+         response = output['choices'][0]['text']
+         return {"model": model_data['name'], "response": response}
+     except Exception:
+         # A model that fails to generate is skipped; the caller collects whatever succeeded.
+         return None
+
+ # NOTE: a route registration is added so this handler is reachable; the "/generate" path is
+ # an assumption, since the original code never attached the function to the app.
+ @app.post("/generate")
+ @spaces.GPU(duration=0)
+ async def generate(request: ChatRequest):
+     try:
+         responses = []
+         with ThreadPoolExecutor() as executor:
+             futures = [executor.submit(generate_chat_response, request, model_data) for model_data in global_data['models']]
+             for future in as_completed(futures):
+                 try:
+                     response = future.result()
+                     if response:
+                         responses.append(response)
+                 except Exception:
+                     pass
+
+         if not responses:
+             raise HTTPException(status_code=500, detail="Error: No responses generated.")
+
+         responses = remove_repetitive_responses(responses)
+         best_response = responses[0] if responses else {}
+         return {
+             "best_response": best_response,
+             "all_responses": responses
+         }
+     except HTTPException:
+         # Re-raise HTTP errors so FastAPI returns the intended status code; the original bare
+         # `except Exception: pass` would have swallowed them and returned None.
+         raise
+     except Exception:
+         raise HTTPException(status_code=500, detail="Error: Internal Server Error")
+
+ # Catch-all route: any other path or method returns a generic acknowledgement.
+ @app.api_route("/{method_name:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH"])
+ async def handle_request(method_name: str):
+     try:
+         return {"message": "Request handled successfully"}
+     except Exception:
+         raise HTTPException(status_code=500, detail="Error: Internal Server Error")
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
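
For reference, a minimal client-side sketch of how this API might be exercised once the Space is running, assuming the "/generate" POST route noted above and the port from the uvicorn.run call; the base URL and the requests dependency are illustrative placeholders, not part of this commit.

import requests

# Hypothetical base URL; substitute the actual Space or host address.
BASE_URL = "http://localhost:7860"

payload = {
    "message": "Hello there, how are you?",
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.7,
}

resp = requests.post(f"{BASE_URL}/generate", json=payload, timeout=300)
resp.raise_for_status()
data = resp.json()
print(data["best_response"])            # first surviving response after de-duplication
for item in data["all_responses"]:
    print(item["model"], item["response"])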