Daniel Marques committed • Commit 9f067a2
1 Parent(s): b21e4ba

feat: add websocket

Files changed:
- constants.py (+3 -3)
- load_models.py (+4 -5)
constants.py
CHANGED
@@ -32,13 +32,13 @@ CHROMA_SETTINGS = Settings(
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE =
+CONTEXT_WINDOW_SIZE = 4096
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
 
-N_GPU_LAYERS =
-N_BATCH =
+N_GPU_LAYERS = 40  # Llama-2-70B has 83 layers
+N_BATCH = 1024
 
 ### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
 # N_GPU_LAYERS = 20
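As a reading aid, not part of this commit: the buffer comment above refers to how these constants reach the quantized-model loader. Below is a minimal sketch, assuming llama-cpp-python is the loader; the model path and prompt are placeholders, not taken from this repo.

# Sketch only: shows where CONTEXT_WINDOW_SIZE, N_BATCH, and N_GPU_LAYERS
# typically land when a GGML/GGUF model is loaded with llama-cpp-python.
from llama_cpp import Llama

from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_BATCH, N_GPU_LAYERS

llm = Llama(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # placeholder path, not from this commit
    n_ctx=CONTEXT_WINDOW_SIZE,  # 4096 after this commit
    n_batch=N_BATCH,            # 1024; halve this first if the "not enough space in the buffer" error appears
    n_gpu_layers=N_GPU_LAYERS,  # 40; layers offloaded to VRAM, halve likewise
)

output = llm("Q: What fits in a 4096-token window? A:", max_tokens=MAX_NEW_TOKENS)
print(output["choices"][0]["text"])

Halving n_batch shrinks the scratch buffers allocated per prompt batch, and halving n_gpu_layers keeps more layers in system RAM; both reduce VRAM pressure, which is why the comment recommends halving until the error stops.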
load_models.py
CHANGED
@@ -215,11 +215,10 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging, stre
         "text-generation",
         model=model,
         tokenizer=tokenizer,
-        max_length=
-        temperature=0.
-        top_p=0.
-
-        repetition_penalty=1.0,
+        max_length=MAX_NEW_TOKENS,
+        temperature=0.2,
+        # top_p=0.95,
+        repetition_penalty=1.15,
         generation_config=generation_config,
     )
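For reference, a self-contained sketch of the pipeline call this hunk edits. It assumes a stand-in model (gpt2), since the real model, tokenizer, and generation_config are built earlier in load_model(); the generation parameters mirror the new side of the diff, with MAX_NEW_TOKENS scaled down to fit gpt2's 1024-token context.

# Sketch only: mirrors the edited pipeline(...) call with a small public model.
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

MAX_NEW_TOKENS = 256  # the repo sets this to CONTEXT_WINDOW_SIZE (4096); reduced for gpt2

model_id = "gpt2"  # stand-in; not the model this Space serves
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=MAX_NEW_TOKENS,  # caps prompt + generated tokens together
    do_sample=True,             # needed for temperature to take effect (not in the diff)
    temperature=0.2,            # low temperature -> near-deterministic output
    # top_p=0.95,               # nucleus sampling stays disabled, as in the commit
    repetition_penalty=1.15,    # >1.0 penalizes repeated tokens
    generation_config=generation_config,
)

print(pipe("The context window determines")[0]["generated_text"])

Raising repetition_penalty from the neutral 1.0 to 1.15 and lowering temperature to 0.2 both push the model toward focused, non-repetitive completions, a common tuning for LLaMA-family chat models that loop on short phrases.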