Royrotem100 committed on
Commit b7358fc
1 Parent(s): 14bbdd9

Changes app.py to run as a server

Files changed (1)
  1. app.py +34 -17
app.py CHANGED
@@ -1,12 +1,13 @@
 import os
 import gradio as gr
 from http import HTTPStatus
-import openai
 from typing import Generator, List, Optional, Tuple, Dict
 from urllib.error import HTTPError
 from flask import Flask, request, jsonify
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import threading
+import requests
+
 
 # Load the model and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("./dictalm2.0-instruct")
@@ -31,34 +32,50 @@ def messages_to_history(messages: Messages) -> Tuple[str, History]:
         history.append([q['content'], r['content']])
     return history
 
-def model_chat(query: Optional[str], history: Optional[History]) -> Generator[Tuple[str, History], None, None]:
-    if query is None:
-        query = ''
-    if history is None:
-        history = []
-    if not query.strip():
-        return
-    messages = history_to_messages(history)
-    messages.append({'role': 'user', 'content': query.strip()})
+
+# Flask app setup
+app = Flask(__name__)
+
+@app.route('/predict', methods=['POST'])
+def predict():
+    data = request.json
+    input_text = data.get('text', '')
 
-    # Combine all messages into one formatted input text
-    formatted_text = "<s>" + "".join(f"[INST] {m['content']} [/INST]" for m in messages if m['role'] == 'user')
+    # Format the input text with instruction tokens
+    formatted_text = f"<s>[INST] {input_text} [/INST]"
+
+    # Tokenize the input
     inputs = tokenizer(formatted_text, return_tensors='pt')
 
     # Generate the output
     outputs = model.generate(inputs['input_ids'], max_length=1024, temperature=0.7, top_p=0.9)
-    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-    # Simulate streaming by yielding the response in chunks
-    chunk_size = 20  # You can adjust the chunk size
-    for i in range(0, len(full_response), chunk_size):
-        yield full_response[i:i+chunk_size]
+    # Decode the output
+    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    return jsonify({"prediction": prediction})
 
 def run_flask():
     app.run(host='0.0.0.0', port=5000)
 
+
 # Run Flask in a separate thread
 threading.Thread(target=run_flask).start()
+def model_chat(query: Optional[str], history: Optional[History]) -> Generator[Tuple[str, History], None, None]:
+    if query is None:
+        query = ''
+    if history is None:
+        history = []
+    if not query.strip():
+        return
+
+    response = requests.post("http://127.0.0.1:5000/predict", json={"text": query.strip()})
+    if response.status_code == 200:
+        prediction = response.json().get("prediction", "")
+        history.append((query, prediction))
+        yield prediction, history
+    else:
+        yield "Error: Unable to get a response from the model.", history
 
 
 with gr.Blocks(css='''
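
After this change, the Gradio model_chat handler is only an HTTP client of the local Flask server, so the new /predict route can also be exercised on its own. Below is a minimal sketch of such a call, assuming app.py is already running and listening on 127.0.0.1:5000 as configured above; the prompt text and timeout value are illustrative and not part of the commit.

import requests

# POST to the /predict route added in this commit; per the handler above it
# expects a JSON body {"text": ...} and returns {"prediction": ...}.
resp = requests.post(
    "http://127.0.0.1:5000/predict",
    json={"text": "Example prompt"},  # illustrative prompt, not from the commit
    timeout=120,                      # illustrative timeout; generation can be slow
)
resp.raise_for_status()
print(resp.json().get("prediction", ""))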