########################
#install nginx
#sudo apt update
#sudo apt install nginx
#sudo vi /etc/nginx/sites-available/default  #edit
#sudo systemctl start nginx
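#
# A minimal reverse-proxy sketch for /etc/nginx/sites-available/default
# (assumption: nginx fronts one of the model servers below; the port and
# path here are illustrative, not taken from the actual config):
#
# server {
#     listen 80;
#     location /calm3/ {
#         proxy_pass http://127.0.0.1:8011/;
#         proxy_set_header Host $host;
#     }
# }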

###########################
#launch local servers
cd /data/2024/1018chatbotarena/llama.cpp/download
export CUDA_VISIBLE_DEVICES=0
python -m vllm.entrypoints.openai.api_server --model cyberagent/calm3-22b-chat \
--max-model-len 4096 --port 8011 \
--gpu-memory-utilization 0.4 --trust-remote-code \
--quantization bitsandbytes --load-format bitsandbytes \
--api-key $VLLM_API_KEY
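
# Smoke test once the server is up: vLLM exposes an OpenAI-compatible
# endpoint, so a plain curl works (the prompt is illustrative):
curl http://localhost:8011/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $VLLM_API_KEY" \
  -d '{"model": "cyberagent/calm3-22b-chat", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64}'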

#vllm tanuki8
export CUDA_VISIBLE_DEVICES=0
python -m vllm.entrypoints.openai.api_server --model weblab-GENIAC/Tanuki-8B-dpo-v1.0 \
--max-model-len 4096 --port 8012 \
--gpu-memory-utilization 0.2 --trust-remote-code \
--quantization bitsandbytes --load-format bitsandbytes \
--api-key $VLLM_API_KEY

#llama.cpp swallow 8b
export CUDA_VISIBLE_DEVICES=0
../llama-server -m tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0.gguf --n-gpu-layers 100 --port 8010
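
# llama-server also speaks the OpenAI-style chat API (no API key is set
# here), so the same kind of curl check applies:
curl http://localhost:8010/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64}'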

#llmjp13b
export CUDA_VISIBLE_DEVICES=0
../llama-server -m llm-jp-3-13b-instruct-Q8_0.gguf --n-gpu-layers 100 --port 8016

#swallow70
export CUDA_VISIBLE_DEVICES=1
python -m vllm.entrypoints.openai.api_server --model tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1 \
--max-model-len 4096 --port 8019 \
--gpu-memory-utilization 0.6 --trust-remote-code \
--quantization bitsandbytes --load-format bitsandbytes \
--api-key $VLLM_API_KEY

#gemma
#export CUDA_VISIBLE_DEVICES=1
#../llama-server -m gemma-2-2B-jpn-it-BF16.gguf --n_gpu_layers 100 --port 8020

#tanuki 8x8b
export CUDA_VISIBLE_DEVICES=1
python -m vllm.entrypoints.openai.api_server --model team-hatakeyama-phase2/Tanuki-8x8B-dpo-v1.0-AWQ \
--max-model-len 4096 --port 8020 \
--gpu-memory-utilization 0.35 --trust-remote-code \
--quantization awq --api-key $VLLM_API_KEY

###################
#server2
export CUDA_VISIBLE_DEVICES=0
python -m vllm.entrypoints.openai.api_server --model ibm-granite/granite-3.0-8b-instruct --max-model-len 4096 \
--port 8020 --gpu-memory-utilization 0.4 --trust-remote-code \
--quantization fp8 \
--api-key $VLLM_API_KEY



#########################
#launch ngrok
ngrok http http://localhost:8765
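
# The public URL can also be read back from ngrok's local inspection API
# (assumes the default inspector address 127.0.0.1:4040):
curl -s http://127.0.0.1:4040/api/tunnels | python3 -c \
  "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"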