"""
Launch an OpenAI API server with multiple model workers.
"""
import argparse
import os
import subprocess
import time
def launch_process(cmd):
    """Launch *cmd* as a background shell process without blocking.

    Replaces the legacy ``os.popen`` call, which returns an unclosed pipe
    file object and no way to manage the child.  ``subprocess.Popen`` keeps
    the same fire-and-forget behavior but returns a process handle.

    Args:
        cmd: Full shell command line to run (trusted, hard-coded strings
            in this script; ``shell=True`` is required because callers pass
            whole command lines, not argv lists).

    Returns:
        The ``subprocess.Popen`` handle for the spawned process.
    """
    return subprocess.Popen(cmd, shell=True)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--multimodal", action="store_true", default=False)
    args = parser.parse_args()

    # Controller and API server must be up before workers register.
    launch_process("python3 -m fastchat.serve.controller")
    launch_process("python3 -m fastchat.serve.openai_api_server")

    # (model_path, worker_module) pairs; one GPU per worker via
    # CUDA_VISIBLE_DEVICES below.
    if args.multimodal:
        models = [
            ("liuhaotian/llava-v1.5-7b", "sglang_worker"),
        ]
    else:
        models = [
            ("lmsys/vicuna-7b-v1.5", "model_worker"),
            ("lmsys/fastchat-t5-3b-v1.0", "model_worker"),
            ("THUDM/chatglm-6b", "model_worker"),
            ("mosaicml/mpt-7b-chat", "model_worker"),
            ("meta-llama/Llama-2-7b-chat-hf", "vllm_worker"),
        ]

    for i, (model_path, worker_name) in enumerate(models):
        # Ports 40000+i keep workers distinct; worker-address must match.
        cmd = (
            f"CUDA_VISIBLE_DEVICES={i} python3 -m fastchat.serve.{worker_name} "
            f"--model-path {model_path} --port {40000+i} "
            f"--worker-address http://localhost:{40000+i} "
        )
        if "llava" in model_path.lower():
            # Trailing space so a following flag can never fuse with this one.
            cmd += "--tokenizer-path llava-hf/llava-1.5-7b-hf "
        if worker_name == "vllm_worker":
            cmd += "--tokenizer hf-internal-testing/llama-tokenizer"

        launch_process(cmd)

    # Keep the parent alive so the detached workers are not reparented and
    # the script can be stopped with Ctrl-C.  Sleeping instead of
    # `while True: pass` avoids pinning a CPU core at 100%.
    while True:
        time.sleep(1)