import spaces
import gradio as gr
import torch
import subprocess
import aiohttp
import asyncio

# Function to start the ochat server
@spaces.GPU
def start_ochat_server():
    print(f"Is CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

    command = [
        "python", "-m", "ochat.serving.openai_api_server", 
        "--model", "openchat/openchat_3.5"
    ]

    # Start the server in a separate process
    try:
        subprocess.Popen(command)
        return "ochat server started successfully"
    except Exception as e:
        return f"Failed to start ochat server: {e}"


start_ochat_server()
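
# Optional readiness check: the model can take minutes to load, so the API may
# not answer immediately after Popen returns. This is a minimal sketch, under
# the assumption that the ochat OpenAI-compatible server answers GET /v1/models
# (standard for OpenAI-style APIs); `wait_for_server` is a hypothetical helper,
# not part of ochat itself.
import time
import urllib.request

def wait_for_server(url="http://localhost:18888/v1/models", timeout=300):
    """Poll `url` until it returns HTTP 200, or give up after `timeout` seconds."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except Exception:
            pass  # server not up yet; the model may still be loading
        time.sleep(5)
    return False

# wait_for_server()  # uncomment to block until the server is ready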

# Function to send a message to the ochat server and get a response
async def chat_with_ochat(message):
    base_url = "http://localhost:18888"
    chat_url = f"{base_url}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "openchat_3.5",
        "messages": [{"role": "user", "content": message}]
    }

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(chat_url, headers=headers, json=data) as response:
                if response.status == 200:
                    response_data = await response.json()
                    return response_data['choices'][0]['message']['content']
                else:
                    return f"Error: Server responded with status code {response.status}"
        except aiohttp.ClientError as e:
            return f"Error: {e}"

# Create a Gradio Blocks interface; the Chatbot component itself holds the conversation history
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## vLLM OpenChat-3.5 Interface")
    gr.Markdown("### the vLLM server cannot handle concurrent users in spaces. If you get an error, run it on docker.")
    gr.Markdown("This will run better on your own machine: ```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all \
    registry.hf.space/macadeliccc-openchat-3-5-chatbot:latest python app.py```")


    message = gr.Textbox(label="Your Message", placeholder="Type your message here")
    chatbot = gr.Chatbot()
    clear = gr.Button("Clear")


    async def user(message, history):
        # Append the user's message with a placeholder for the bot's reply
        return "", history + [[message, None]]


    async def bot(history):
        if history and history[-1] and history[-1][0]:
            user_message = history[-1][0]
            bot_response = await chat_with_ochat(user_message)
            history[-1][1] = bot_response  # Update the last entry with the bot's response
        return history

    message.submit(user, [message, chatbot], [message, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

app.queue()
app.launch()