Daniel Marques committed • Commit 9f067a2
1 Parent(s): b21e4ba

feat: add websocket

Files changed:
- constants.py (+3 -3)
- load_models.py (+4 -5)
constants.py
CHANGED
@@ -32,13 +32,13 @@ CHROMA_SETTINGS = Settings(
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE =
+CONTEXT_WINDOW_SIZE = 4096
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS =
-N_BATCH =
+N_GPU_LAYERS = 40  # Llama-2-70B has 83 layers
+N_BATCH = 1024
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
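As a reading aid, not part of this commit: the buffer comment above refers to how these constants reach the quantized-model loader. Below is a minimal sketch, assuming llama-cpp-python is the loader; the model path and prompt are placeholders, not taken from this repo.

# Sketch only: shows where CONTEXT_WINDOW_SIZE, N_BATCH, and N_GPU_LAYERS
# typically land when a GGML/GGUF model is loaded with llama-cpp-python.
from llama_cpp import Llama

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_BATCH, N_GPU_LAYERS

llm = Llama(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # placeholder path, not from this commit
    n_ctx=CONTEXT_WINDOW_SIZE,  # 4096 after this commit
    n_batch=N_BATCH,            # 1024; halve this first if the "not enough space in the buffer" error appears
    n_gpu_layers=N_GPU_LAYERS,  # 40; layers offloaded to VRAM, halve likewise
)

output = llm("Q: What fits in a 4096-token window? A:", max_tokens=MAX_NEW_TOKENS)
print(output["choices"][0]["text"])

Halving n_batch shrinks the scratch buffers allocated per prompt batch, and halving n_gpu_layers keeps more layers in system RAM; both reduce VRAM pressure, which is why the comment recommends halving until the error stops.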
load_models.py
CHANGED
@@ -215,11 +215,10 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_length=
-        temperature=0.
-        top_p=0.
-
-        repetition_penalty=1.0,
+        max_length=MAX_NEW_TOKENS,
+        temperature=0.2,
+        # top_p=0.95,
+        repetition_penalty=1.15,
         generation_config=generation_config,
     )
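For reference, a self-contained sketch of the pipeline call this hunk edits. It assumes a stand-in model (gpt2), since the real model, tokenizer, and generation_config are built earlier in load_model(); the generation parameters mirror the new side of the diff, with MAX_NEW_TOKENS scaled down to fit gpt2's 1024-token context.

# Sketch only: mirrors the edited pipeline(...) call with a small public model.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

MAX_NEW_TOKENS = 256  # the repo sets this to CONTEXT_WINDOW_SIZE (4096); reduced for gpt2

model_id = "gpt2"  # stand-in; not the model this Space serves
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=MAX_NEW_TOKENS,  # caps prompt + generated tokens together
    do_sample=True,             # needed for temperature to take effect (not in the diff)
    temperature=0.2,            # low temperature -> near-deterministic output
    # top_p=0.95,               # nucleus sampling stays disabled, as in the commit
    repetition_penalty=1.15,    # >1.0 penalizes repeated tokens
    generation_config=generation_config,
)

print(pipe("The context window determines")[0]["generated_text"])

Raising repetition_penalty from the neutral 1.0 to 1.15 and lowering temperature to 0.2 both push the model toward focused, non-repetitive completions, a common tuning for LLaMA-family chat models that loop on short phrases.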