# Spaces: Running on Zero
import spaces | |
import gradio as gr | |
import torch | |
import subprocess | |
import aiohttp | |
from gradio import State | |
import asyncio | |
import json | |
import asyncio | |
async def start_ochat_server(*, startup_timeout=600.0, poll_interval=5.0):
    """Make sure the ochat OpenAI-compatible API server is up, then return.

    The server is considered up when ``GET http://localhost:18888/docs``
    answers with HTTP 200. If it is not up yet, a single server process is
    launched and the endpoint is polled until it answers, the process exits,
    or ``startup_timeout`` elapses.

    This coroutine deliberately *returns* instead of supervising the server
    forever: it is driven with ``asyncio.run()`` at import time, so a loop
    that never ends would keep the rest of the script from running.

    Args:
        startup_timeout: Maximum number of seconds to wait for the server to
            become ready after launching it.
        poll_interval: Seconds to sleep between readiness probes.

    Returns:
        True if the server answered the readiness probe, False if it could
        not be launched, exited early, or did not become ready in time.
    """
    cuda_available = torch.cuda.is_available()
    print(f"Is CUDA available: {cuda_available}")
    if cuda_available:
        # get_device_name()/current_device() raise when no CUDA device exists.
        print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

    command = [
        "python", "-m", "ochat.serving.openai_api_server",
        "--model", "openchat/openchat_3.5",
    ]

    async def is_server_running(session):
        """Return True when the server's docs page answers with HTTP 200."""
        try:
            async with session.get("http://localhost:18888/docs") as response:
                return response.status == 200
        except (aiohttp.ClientError, asyncio.TimeoutError):
            # Connection refused / reset / timed out: not ready (yet).
            return False

    # Keep each probe short so one hung connection cannot stall the polling.
    probe_timeout = aiohttp.ClientTimeout(total=5)
    async with aiohttp.ClientSession(timeout=probe_timeout) as session:
        if await is_server_running(session):
            print("ochat server is already running.")
            return True

        try:
            # Start the server in a separate process, exactly once; relaunching
            # while the model is still loading would pile up duplicates.
            process = subprocess.Popen(command)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Failed to start ochat server: {e}")
            return False

        loop = asyncio.get_running_loop()
        deadline = loop.time() + startup_timeout
        while loop.time() < deadline:
            if process.poll() is not None:
                print(f"Failed to start ochat server: process exited with code {process.returncode}")
                return False
            if await is_server_running(session):
                print("ochat server started successfully.")
                return True
            await asyncio.sleep(poll_interval)

    print(f"Failed to start ochat server: not ready after {startup_timeout} seconds")
    return False
# Start the server at import time. asyncio.run() drives the coroutine on a new
# event loop and only returns once the coroutine finishes, so every statement
# below this line waits for it.
asyncio.run(start_ochat_server())
async def chat_with_ochat(message):
    """Send one user message to the local ochat server and return its reply.

    A single-turn chat-completion request is POSTed to
    ``http://localhost:18888/v1/chat/completions`` for model ``openchat_3.5``.

    Args:
        message: Text of the user's message.

    Returns:
        The assistant message content on success. Every failure (non-200
        status, connection error, timeout, invalid or unexpected response
        body) is reported as a string starting with ``"Error: "`` rather than
        raised, so the caller can show it directly in the chat window.
    """
    base_url = "http://localhost:18888"
    chat_url = f"{base_url}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "openchat_3.5",
        "messages": [{"role": "user", "content": message}],
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post(chat_url, headers=headers, json=data) as response:
                if response.status != 200:
                    return f"Error: Server responded with status code {response.status}"
                response_data = await response.json()
    except asyncio.TimeoutError:
        # aiohttp's total-timeout expiry is not an aiohttp.ClientError.
        return "Error: Request to the ochat server timed out"
    except aiohttp.ClientError as e:
        return f"Error: {e}"
    except ValueError:
        # Body was served as JSON but could not be decoded.
        return "Error: Server returned an invalid JSON response"

    try:
        return response_data['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        return "Error: Unexpected response format from the ochat server"
# Create the Gradio Blocks chat interface. The Chatbot component's value is the
# conversation: a list of [user_message, bot_response] rows.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## vLLM OpenChat-3.5 Interface")
    gr.Markdown("### the vLLM server cannot handle concurrent users in spaces. If you get an error, run it on docker.")
    gr.Markdown(
        "This will run better on your own machine: ```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all "
        "registry.hf.space/macadeliccc-openchat-3-5-chatbot:latest python app.py```"
    )
    message = gr.Textbox(label="Your Message", placeholder="Type your message here")
    chatbot = gr.Chatbot()
    clear = gr.Button("Clear")

    async def user(message, history):
        """Append the submitted text as a pending turn and clear the textbox.

        Blank submissions are ignored so they do not leave an empty row in
        the chat window.
        """
        history = history or []
        if not message or not message.strip():
            return "", history
        return "", history + [[message, None]]

    async def bot(history):
        """Fill in the reply for the last turn if it is still unanswered."""
        if not history or not history[-1]:
            return history
        user_message, reply = history[-1][0], history[-1][1]
        # Only answer a pending turn; an already-answered row must not be
        # re-sent (this is the case after an ignored blank submission).
        if user_message and reply is None:
            bot_response = await chat_with_ochat(user_message)
            # Replace the row instead of mutating it, so tuple rows work too.
            history[-1] = [user_message, bot_response]
        return history

    # First record the user's turn, then ask the server for the reply.
    message.submit(user, [message, chatbot], [message, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    # Reset the conversation.
    clear.click(lambda: None, None, chatbot, queue=False)

app.queue()
app.launch()