# xinference/app.py
# Copyright 2022-2023 XProbe Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import os
import urllib.request
import uuid
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

import gradio as gr

from xinference.core.api import SyncSupervisorAPI
from xinference.locale.utils import Locale
from xinference.model import MODEL_FAMILIES, ModelSpec

if TYPE_CHECKING:
from xinference.types import ChatCompletionChunk, ChatCompletionMessage
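
# Index model families by name for dropdown lookups; the "baichuan" family is
# deliberately excluded from the UI.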
MODEL_TO_FAMILIES = dict(
(model_family.model_name, model_family)
for model_family in MODEL_FAMILIES
if model_family.model_name != "baichuan"
)


class GradioApp:
def __init__(
self,
supervisor_address: str,
gladiator_num: int = 2,
max_model_num: int = 2,
use_launched_model: bool = False,
):
self._api = SyncSupervisorAPI(supervisor_address)
self._gladiator_num = gladiator_num
self._max_model_num = max_model_num
self._use_launched_model = use_launched_model
self._locale = Locale()
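
    # Launch a model under a fresh uid; if the running-model cap has been
    # reached, terminate the oldest model first.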
def _create_model(
self,
model_name: str,
model_size_in_billions: Optional[int] = None,
model_format: Optional[str] = None,
quantization: Optional[str] = None,
):
model_uid = str(uuid.uuid1())
models = self._api.list_models()
if len(models) >= self._max_model_num:
self._api.terminate_model(models[0][0])
return self._api.launch_model(
model_uid, model_name, model_size_in_billions, model_format, quantization
)
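
    # Stream a chat completion: rebuild the model-facing history from the
    # Gradio chatbot state (stripping any stop-reason suffix appended below),
    # then yield the response chunk by chunk.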
async def generate(
self,
model: str,
message: str,
chat: List[List[str]],
max_token: int,
temperature: float,
top_p: float,
window_size: int,
show_finish_reason: bool,
):
if not message:
yield message, chat
else:
try:
model_ref = self._api.get_model(model)
except KeyError:
                raise gr.Error(self._locale("Please create model first"))
history: "List[ChatCompletionMessage]" = []
for c in chat:
history.append({"role": "user", "content": c[0]})
out = c[1]
finish_reason_idx = out.find(f"[{self._locale('stop reason')}: ")
if finish_reason_idx != -1:
out = out[:finish_reason_idx]
history.append({"role": "assistant", "content": out})
if window_size != 0:
history = history[-(window_size // 2) :]
                # chatglm only supports an even number of conversation
                # messages; drop the oldest one if the trimmed history is odd.
                if len(history) % 2 != 0:
                    history = history[1:]
generate_config = dict(
max_tokens=max_token,
temperature=temperature,
top_p=top_p,
stream=True,
)
chat += [[message, ""]]
chat_generator = await model_ref.chat(
message,
chat_history=history,
generate_config=generate_config,
)
chunk: Optional["ChatCompletionChunk"] = None
async for chunk in chat_generator:
assert chunk is not None
delta = chunk["choices"][0]["delta"]
if "content" not in delta:
continue
else:
chat[-1][1] += delta["content"]
yield "", chat
            if show_finish_reason and chunk is not None:
                stop_reason = chunk["choices"][0]["finish_reason"]
                chat[-1][1] += f"[{self._locale('stop reason')}: {stop_reason}]"
                yield "", chat
def _build_chatbot(self, model_uid: str, model_name: str):
with gr.Accordion(self._locale("Parameters"), open=False):
max_token = gr.Slider(
128,
1024,
value=128,
step=1,
label=self._locale("Max tokens"),
info=self._locale("The maximum number of tokens to generate."),
)
temperature = gr.Slider(
0.2,
1,
value=0.8,
step=0.01,
label=self._locale("Temperature"),
info=self._locale("The temperature to use for sampling."),
)
top_p = gr.Slider(
0.2,
1,
value=0.95,
step=0.01,
label=self._locale("Top P"),
info=self._locale("The top-p value to use for sampling."),
)
window_size = gr.Slider(
0,
50,
value=10,
step=1,
label=self._locale("Window size"),
info=self._locale("Window size of chat history."),
)
            show_finish_reason = gr.Checkbox(label=self._locale("Show stop reason"))
chat = gr.Chatbot(label=model_name)
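        # Hidden carriers: `text` feeds the submitted message into generate();
        # `model_uid` holds the uid of the model being chatted with.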
text = gr.Textbox(visible=False)
model_uid = gr.Textbox(model_uid, visible=False)
text.change(
self.generate,
[
model_uid,
text,
chat,
max_token,
temperature,
top_p,
window_size,
show_finish_reason,
],
[text, chat],
)
return (
text,
chat,
max_token,
temperature,
top_p,
show_finish_reason,
window_size,
model_uid,
)
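
    # One chat column: model-selection widgets stacked on top of a chatbot pane.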
def _build_chat_column(self):
with gr.Column():
with gr.Row():
model_name = gr.Dropdown(
choices=list(MODEL_TO_FAMILIES.keys()),
label=self._locale("model name"),
scale=2,
)
model_format = gr.Dropdown(
choices=[],
interactive=False,
label=self._locale("model format"),
scale=2,
)
model_size_in_billions = gr.Dropdown(
choices=[],
interactive=False,
label=self._locale("model size in billions"),
scale=1,
)
quantization = gr.Dropdown(
choices=[],
interactive=False,
label=self._locale("quantization"),
scale=1,
)
create_model = gr.Button(value=self._locale("create"))
def select_model_name(model_name: str):
if model_name:
model_family = MODEL_TO_FAMILIES[model_name]
formats = [model_family.model_format]
model_sizes_in_billions = [
str(b) for b in model_family.model_sizes_in_billions
]
quantizations = model_family.quantizations
return (
gr.Dropdown.update(
choices=formats,
interactive=True,
value=model_family.model_format,
),
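                        # note: only the first listed size is exposed as a choice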
gr.Dropdown.update(
choices=model_sizes_in_billions[:1],
interactive=True,
value=model_sizes_in_billions[0],
),
gr.Dropdown.update(
choices=quantizations,
interactive=True,
value=quantizations[0],
),
)
else:
return (
gr.Dropdown.update(),
gr.Dropdown.update(),
gr.Dropdown.update(),
)
model_name.change(
select_model_name,
inputs=[model_name],
outputs=[model_format, model_size_in_billions, quantization],
)
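
            # Build the chatbot with a placeholder uid and name; both are
            # filled in once a model has actually been created.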
components = self._build_chatbot("", "")
model_text = components[0]
chat, model_uid = components[1], components[-1]
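
            # Download the model weights into the local cache if they are not
            # already present and complete, then launch the model and point
            # the chatbot at it.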
def select_model(
_model_name: str,
_model_format: str,
_model_size_in_billions: str,
_quantization: str,
progress=gr.Progress(),
):
model_family = MODEL_TO_FAMILIES[_model_name]
cache_path, meta_path = model_family.generate_cache_path(
int(_model_size_in_billions), _quantization
)
if not (os.path.exists(cache_path) and os.path.exists(meta_path)):
if os.path.exists(cache_path):
os.remove(cache_path)
url = model_family.url_generator(
int(_model_size_in_billions), _quantization
)
                    full_name = (
                        f"{model_family}-{_model_size_in_billions}b-{_quantization}"
                    )
try:
urllib.request.urlretrieve(
url,
cache_path,
reporthook=lambda block_num, block_size, total_size: progress(
block_num * block_size / total_size,
desc=self._locale("Downloading"),
),
)
                        # write a meta file to mark the download as complete
with open(meta_path, "w") as f:
f.write(full_name)
                except Exception:
                    # remove the partial download before surfacing the error
                    if os.path.exists(cache_path):
                        os.remove(cache_path)
                    raise
model_uid = self._create_model(
_model_name, int(_model_size_in_billions), _model_format, _quantization
)
return gr.Chatbot.update(
label="-".join(
[_model_name, _model_size_in_billions, _model_format, _quantization]
),
value=[],
), gr.Textbox.update(value=model_uid)
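
            # Reset the chatbot label and history, and emit a fresh token that
            # lands in the invisible textbox wired up below.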
def clear_chat(
_model_name: str,
_model_format: str,
_model_size_in_billions: str,
_quantization: str,
):
full_name = "-".join(
[_model_name, _model_size_in_billions, _model_format, _quantization]
)
return str(uuid.uuid4()), gr.Chatbot.update(
label=full_name,
value=[],
)
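
            # Clicking "create" runs clear_chat first; the fresh uuid it writes
            # into this invisible textbox then triggers select_model.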
invisible_text = gr.Textbox(visible=False)
create_model.click(
clear_chat,
inputs=[model_name, model_format, model_size_in_billions, quantization],
outputs=[invisible_text, chat],
)
invisible_text.change(
select_model,
inputs=[model_name, model_format, model_size_in_billions, quantization],
outputs=[chat, model_uid],
postprocess=False,
)
return chat, model_text
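
    # Arena mode: several chat columns side by side, all fed from one input box.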
def _build_arena(self):
with gr.Box():
with gr.Row():
chat_and_text = [
self._build_chat_column() for _ in range(self._gladiator_num)
]
chats = [c[0] for c in chat_and_text]
texts = [c[1] for c in chat_and_text]
msg = gr.Textbox(label=self._locale("Input"))
def update_message(text_in: str):
return "", text_in, text_in
msg.submit(update_message, inputs=[msg], outputs=[msg] + texts)
gr.ClearButton(components=[msg] + chats + texts)
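
    # Single-chat mode: one chat column driven by one input box.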
def _build_single(self):
chat, model_text = self._build_chat_column()
msg = gr.Textbox(label=self._locale("Input"))
def update_message(text_in: str):
return "", text_in
msg.submit(update_message, inputs=[msg], outputs=[msg, model_text])
gr.ClearButton(components=[chat, msg, model_text])
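
    # Chat pane bound to an already-launched model, chosen from a dropdown of
    # the models currently running.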
def _build_single_with_launched(
self, models: List[Tuple[str, ModelSpec]], default_index: int
):
uid_to_model_spec: Dict[str, ModelSpec] = dict((m[0], m[1]) for m in models)
choices = [
"-".join(
[
s.model_name,
str(s.model_size_in_billions),
s.model_format,
s.quantization,
]
)
for s in uid_to_model_spec.values()
]
choice_to_uid = dict(zip(choices, uid_to_model_spec.keys()))
model_selection = gr.Dropdown(
label=self._locale("select model"),
choices=choices,
value=choices[default_index],
)
components = self._build_chatbot(
models[default_index][0], choices[default_index]
)
model_text = components[0]
model_uid = components[-1]
chat = components[1]
def select_model(model_name):
uid = choice_to_uid[model_name]
return gr.Chatbot.update(label=model_name), uid
model_selection.change(
select_model, inputs=[model_selection], outputs=[chat, model_uid]
)
return chat, model_text
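
    # Arena over pre-launched models: one chat column per gladiator, all fed
    # from a single input box.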
def _build_arena_with_launched(self, models: List[Tuple[str, ModelSpec]]):
chat_and_text = []
with gr.Row():
for i in range(self._gladiator_num):
with gr.Column():
chat_and_text.append(self._build_single_with_launched(models, i))
chats = [c[0] for c in chat_and_text]
texts = [c[1] for c in chat_and_text]
msg = gr.Textbox(label=self._locale("Input"))
def update_message(text_in: str):
return "", text_in, text_in
msg.submit(update_message, inputs=[msg], outputs=[msg] + texts)
gr.ClearButton(components=[msg] + chats + texts)
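
    # Assemble the full UI. With pre-launched models, the Arena tab is only
    # shown when at least two models are running; otherwise the tabs create
    # models on demand.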
def build(self):
if self._use_launched_model:
models = self._api.list_models()
with gr.Blocks() as blocks:
if len(models) >= 2:
with gr.Tab(self._locale("Arena")):
self._build_arena_with_launched(models)
with gr.Tab(self._locale("Chat")):
chat, model_text = self._build_single_with_launched(models, 0)
msg = gr.Textbox(label=self._locale("Input"))
def update_message(text_in: str):
return "", text_in
msg.submit(update_message, inputs=[msg], outputs=[msg, model_text])
gr.ClearButton(components=[chat, msg, model_text])
else:
with gr.Blocks() as blocks:
with gr.Tab(self._locale("Chat")):
self._build_single()
with gr.Tab(self._locale("Arena")):
self._build_arena()
blocks.queue(concurrency_count=40)
return blocks
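

# Spin up an in-process xinference cluster (supervisor plus worker), launch a
# few demo models, then serve the Gradio app on top of it.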
async def launch_xinference():
import xoscar as xo
from xinference.core.service import SupervisorActor
from xinference.core.api import AsyncSupervisorAPI
from xinference.deploy.worker import start_worker_components
pool = await xo.create_actor_pool(address="0.0.0.0", n_process=0)
supervisor_address = pool.external_address
await xo.create_actor(
SupervisorActor, address=supervisor_address, uid=SupervisorActor.uid()
)
await start_worker_components(
address=supervisor_address, supervisor_address=supervisor_address
)
api = AsyncSupervisorAPI(supervisor_address)
supported_models = ["chatglm2", "chatglm", "vicuna-v1.3", "orca"]
for model in supported_models:
await api.launch_model(str(uuid.uuid4()), model)
gradio_block = GradioApp(supervisor_address, use_launched_model=True).build()
gradio_block.launch()
if __name__ == "__main__":
loop = asyncio.get_event_loop()
task = loop.create_task(launch_xinference())
try:
loop.run_until_complete(task)
except KeyboardInterrupt:
task.cancel()
loop.run_until_complete(task)
# avoid displaying exception-unhandled warnings
task.exception()