Spaces:

gokaygokay
/

Gemma-2-llamacpp

Running on Zero

cbensimon HF staff committed on Jul 6

Commit

6b59e6e

•

1 Parent(s): 2ac2a22

(ZeroGPU) Avoid re-loading model when possible (#1)

- ZeroGPU: Avoid re-loading model when possible (b95c7404443836b83e07d05e55e85725598765ed)

Co-authored-by: Charles Bensimon <[email protected]>

Files changed (1) hide show

app.py CHANGED Viewed

@@ -24,6 +24,9 @@ hf_hub_download(
 )
 @spaces.GPU(duration=120)
 def respond(
     message,
@@ -38,13 +41,19 @@ def respond(
 ):
     chat_template = MessagesFormatterType.GEMMA_2
-    llm = Llama(
-        model_path=f"models/{model}",
-        flash_attn=True,
-        n_gpu_layers=81,
-        n_batch=1024,
-        n_ctx=8192,
-    )
     provider = LlamaCppPythonProvider(llm)
     agent = LlamaCppAgent(

 )
+llm = None
+llm_model = None
 @spaces.GPU(duration=120)
 def respond(
     message,
 ):
     chat_template = MessagesFormatterType.GEMMA_2
+    global llm
+    global llm_model
+    if llm is None or llm_model != model:
+        llm = Llama(
+            model_path=f"models/{model}",
+            flash_attn=True,
+            n_gpu_layers=81,
+            n_batch=1024,
+            n_ctx=8192,
+        )
+        llm_model = model
     provider = LlamaCppPythonProvider(llm)
     agent = LlamaCppAgent(