Llama-3.1-8B-Instruct

Runtime error

vilarin committed on Jun 28

Commit

27dc368

•

1 Parent(s): c434f82

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import os
 import threading
 import time
@@ -18,19 +17,16 @@ OLLAMA_SERVICE_THREAD.start()
 print("Giving ollama serve a moment")
 time.sleep(10)
-subprocess.run("~/ollama run gemma2", shell=True)
 import copy
 import gradio as gr
-import spaces
-from llama_index.llms.ollama import Ollama
-import llama_index
-from llama_index.core.llms import ChatMessage
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL_ID = "google/gemma-2-27b-it"
 MODEL_NAME = MODEL_ID.split("/")[-1]
 TITLE = "<h1><center>Chatbox</center></h1>"
@@ -56,33 +52,30 @@ h3 {
     text-align: center;
 }
 """
-@spaces.GPU()
 def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
     print(f'message is - {message}')
     print(f'history is - {history}')
     conversation = []
     for prompt, answer in history:
         conversation.extend([
-            ChatMessage(
-            role="user", content=prompt
-            ),
-            ChatMessage(role="assistant", content=answer),
         ])
-    messages = [ChatMessage(role="user", content=message)]
     print(f"Conversation is -\n{conversation}")
-    llm = Ollama(model="gemma2", request_timeout=60.0)
-    resp = llm.chat(
-        messages = messages,
-        chat_history = conversation,
-        top_p=top_p,
-        top_k=top_k,
-        repeat_penalty=penalty,
-        context_window=context_window,
     )
-    return resp
 chatbot = gr.Chatbot(height=600)

 import os
 import threading
 import time
 # Pause before talking to the Ollama server (the log message below says the
 # wait is for `ollama serve`, which is presumably started on a thread earlier
 # in this script -- that part is outside this excerpt).
 print("Giving ollama serve a moment")
 time.sleep(10)
 # Fetch the model ahead of the first chat request. The command is kept as a
 # shell string because the leading "~" relies on shell expansion.
 subprocess.run("~/ollama pull gemma2", shell=True)
 import copy
 import gradio as gr
 import ollama
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 # The lookup key must be the *string* "MODEL_ID". Passing the bare name
 # MODEL_ID here referenced the variable before it was assigned and raised
 # NameError while the module was still loading.
 MODEL_ID = os.environ.get("MODEL_ID", "google/gemma-2-9b-it")
 # Last path component of the repo id, e.g. "gemma-2-9b-it".
 MODEL_NAME = MODEL_ID.split("/")[-1]
 TITLE = "<h1><center>Chatbox</center></h1>"
     text-align: center;
 }
 """
 def stream_chat(message: str, history: list, temperature: float, context_window: int, top_p: float, top_k: int, penalty: float):
     """Stream a chat reply from the local Ollama ``gemma2`` model.

     Args:
         message: The new user message.
         history: Earlier turns, iterated as ``(prompt, answer)`` pairs.
         temperature: Sampling temperature forwarded to Ollama.
         context_window: Context size forwarded to Ollama as ``num_ctx``.
         top_p: Nucleus-sampling value forwarded to Ollama.
         top_k: Top-k sampling value forwarded to Ollama.
         penalty: Repetition penalty forwarded to Ollama as ``repeat_penalty``.

     Yields:
         ``("", text)`` tuples, where ``text`` is the reply accumulated so far.
     """
     print(f'message is - {message}')
     print(f'history is - {history}')
     conversation = []
     for prompt, answer in history:
         # Each past turn becomes a user/assistant message pair. The previous
         # revision had a stray ")" after the assistant dict, which made the
         # whole file fail to parse.
         conversation.extend([
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": answer},
         ])
     conversation.append({"role": "user", "content": message})
     print(f"Conversation is -\n{conversation}")
     response = ollama.chat(
         model="gemma2",
         messages=conversation,
         stream=True,
         # Forward the generation settings; they were previously accepted by
         # this function but never sent to the model.
         options={
             "temperature": temperature,
             "num_ctx": context_window,
             "top_p": top_p,
             "top_k": top_k,
             "repeat_penalty": penalty,
         },
     )
     # Accumulate into a separate local name instead of overwriting the
     # `message` parameter.
     reply = ""
     for chunk in response:
         reply += chunk["message"]["content"]
         yield "", reply
 chatbot = gr.Chatbot(height=600)