Update app.py
app.py CHANGED
```diff
@@ -10,14 +10,22 @@ from threading import Thread
 import spaces
 import time
 import subprocess
-
+
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
 
 token = os.environ["HF_TOKEN"]
 
 
 model = AutoModelForCausalLM.from_pretrained(
-
-
+    "microsoft/Phi-3-mini-128k-instruct",
+    use_cache=False,
+    attn_implementation="flash_attention_2",
+    token=token,
+    trust_remote_code=True,
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
 terminators = [
```
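The added `subprocess.run` installs flash-attn at startup with `FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE`, so pip takes the prebuilt wheel instead of compiling CUDA kernels on the Space, and `attn_implementation="flash_attention_2"` then selects that backend when the model loads. As a minimal sketch, assuming one wanted a fallback when the wheel is unavailable (the guard below is hypothetical, not part of this commit):

```python
# Hypothetical guard, not in this commit: fall back to the default
# attention backend when flash-attn failed to install.
import importlib.util

attn_impl = (
    "flash_attention_2"
    if importlib.util.find_spec("flash_attn") is not None
    else "eager"  # transformers' built-in implementation
)
```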
```diff
@@ -36,7 +44,7 @@ model = model.to(device)
 
 
 @spaces.GPU(duration=60)
-def chat(message, history, temperature,do_sample, max_tokens):
+def chat(message, history, temperature, do_sample, max_tokens):
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
```
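On ZeroGPU Spaces, `@spaces.GPU(duration=60)` requests a GPU allocation for up to 60 seconds per call of the decorated function, which is why it wraps `chat` rather than the module-level model load. The signature change itself only adds a space after `temperature,`.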
```diff
@@ -56,10 +64,10 @@ def chat(message, history, temperature,do_sample, max_tokens):
         temperature=temperature,
         eos_token_id=terminators,
     )
-
+
     if temperature == 0:
-        generate_kwargs[
-
+        generate_kwargs["do_sample"] = False
+
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
```
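Forcing `do_sample` off when `temperature == 0` avoids the error transformers raises for sampling with a zero temperature, so the slider's zero position now means greedy decoding. For context, a minimal self-contained sketch of the `Thread` + streamer pattern the surrounding lines rely on, assuming a `TextIteratorStreamer` (not visible in this hunk) and with `gpt2` standing in for the real model:

```python
# Sketch of the background-generation pattern used by chat().
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
inputs = tok("Hello", return_tensors="pt")
generate_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=32, do_sample=False)

# generate() blocks until finished, so it runs in a worker thread while
# the caller iterates the streamer and yields partial text to the UI.
Thread(target=model.generate, kwargs=generate_kwargs).start()
partial_text = ""
for new_text in streamer:
    partial_text += new_text
```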
```diff
@@ -68,7 +76,7 @@ def chat(message, history, temperature,do_sample, max_tokens):
         partial_text += new_text
         yield partial_text
 
-    yield partial_text
+    yield partial_text
 
 
 demo = gr.ChatInterface(
```
```diff
@@ -82,7 +90,7 @@ demo = gr.ChatInterface(
         gr.Slider(
             minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
         ),
-        gr.Checkbox(label="Sampling",value=True),
+        gr.Checkbox(label="Sampling", value=True),
         gr.Slider(
             minimum=128,
             maximum=4096,
```
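The `additional_inputs` components are passed to the chat function positionally after `(message, history)`, so the Checkbox must sit between the two Sliders to line up with `chat(message, history, temperature, do_sample, max_tokens)`. A stripped-down sketch of the wiring; the echo body and the second slider's label and value are placeholders, not the Space's real code:

```python
# Minimal sketch of the additional_inputs ordering in gr.ChatInterface.
import gradio as gr

def chat(message, history, temperature, do_sample, max_tokens):
    return f"temperature={temperature}, do_sample={do_sample}, max_tokens={max_tokens}"

demo = gr.ChatInterface(
    chat,
    additional_inputs=[
        gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False),
        gr.Checkbox(label="Sampling", value=True),
        gr.Slider(minimum=128, maximum=4096, value=512, label="Max new tokens", render=False),
    ],
)
```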
```diff
@@ -94,6 +102,6 @@ demo = gr.ChatInterface(
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
-    description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)"
+    description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
 )
 demo.launch()
```