Spaces:

kanhatakeyama
/

chatbotarena-ja

Running

a100 kh

granite

ffb63fa 24 days ago

2.17 kB

	########################
	#install nginx
	#sudo apt update
	#sudo apt install nginx
	#sudo vi /etc/nginx/sites-available/default #edit
	#sudo systemctl start nginx

	###########################
	#launch local server
	# NOTE(review): no command in this file is backgrounded, so these look
	# intended to be run one per terminal rather than as one script - confirm.
	# The llama-server commands in this file use paths relative to this
	# directory (../llama-server and bare *.gguf names), so change into it first.
	cd /data/2024/1018chatbotarena/llama.cpp/download

	#vllm calm3-22b
	# OpenAI-compatible vLLM API server for cyberagent/calm3-22b-chat on port
	# 8011, restricted to GPU 0 (other commands in this file also select GPU 0).
	# The API key is quoted so it is always passed as exactly one argument, and
	# ${VLLM_API_KEY:?...} makes the shell refuse to run the command, with a
	# clear message, when the variable is unset or empty.
	export CUDA_VISIBLE_DEVICES=0
	python -m vllm.entrypoints.openai.api_server --model cyberagent/calm3-22b-chat \
		--max-model-len 4096 --port 8011 \
		--gpu-memory-utilization 0.4 --trust-remote-code \
		--quantization bitsandbytes --load-format bitsandbytes \
		--api-key "${VLLM_API_KEY:?VLLM_API_KEY must be set}"

	#vllm tanuki8
	# OpenAI-compatible vLLM API server for weblab-GENIAC/Tanuki-8B-dpo-v1.0 on
	# port 8012, restricted to GPU 0.
	# The API key is quoted so it is always passed as exactly one argument, and
	# ${VLLM_API_KEY:?...} makes the shell refuse to run the command, with a
	# clear message, when the variable is unset or empty.
	export CUDA_VISIBLE_DEVICES=0
	python -m vllm.entrypoints.openai.api_server \
		--model weblab-GENIAC/Tanuki-8B-dpo-v1.0 \
		--max-model-len 4096 --port 8012 \
		--gpu-memory-utilization 0.2 --trust-remote-code \
		--quantization bitsandbytes --load-format bitsandbytes \
		--api-key "${VLLM_API_KEY:?VLLM_API_KEY must be set}"

	#llama.cpp swallow 8b
	# llama.cpp server for the Swallow 8B Q8_0 GGUF on port 8010, restricted to
	# GPU 0. Both the binary and the model file are relative paths, so the
	# current directory matters when this is run.
	export CUDA_VISIBLE_DEVICES=0
	../llama-server \
		-m tokyotech-llm-Llama-3.1-Swallow-8B-Instruct-v0.1-Q8_0.gguf \
		--n_gpu_layers 100 \
		--port 8010

	#llmjp13b
	# llama.cpp server for the llm-jp-3 13B instruct Q8_0 GGUF on port 8016,
	# restricted to GPU 0. Both the binary and the model file are relative
	# paths, so the current directory matters when this is run.
	export CUDA_VISIBLE_DEVICES=0
	../llama-server \
		-m llm-jp-3-13b-instruct-Q8_0.gguf \
		--n_gpu_layers 100 \
		--port 8016

	#swallow70
	# OpenAI-compatible vLLM API server for
	# tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1 on port 8019,
	# restricted to GPU 1.
	# The API key is quoted so it is always passed as exactly one argument, and
	# ${VLLM_API_KEY:?...} makes the shell refuse to run the command, with a
	# clear message, when the variable is unset or empty.
	export CUDA_VISIBLE_DEVICES=1
	python -m vllm.entrypoints.openai.api_server \
		--model tokyotech-llm/Llama-3.1-Swallow-70B-Instruct-v0.1 \
		--max-model-len 4096 --port 8019 \
		--gpu-memory-utilization 0.6 --trust-remote-code \
		--quantization bitsandbytes --load-format bitsandbytes \
		--api-key "${VLLM_API_KEY:?VLLM_API_KEY must be set}"

	#gemma (disabled; its port 8020 is also used by the tanuki 8x8b command in this file)
	#export CUDA_VISIBLE_DEVICES=1
	#../llama-server -m gemma-2-2B-jpn-it-BF16.gguf --n_gpu_layers 100 --port 8020

	#tanuki 8x8b
	# OpenAI-compatible vLLM API server for the AWQ-quantized
	# team-hatakeyama-phase2/Tanuki-8x8B-dpo-v1.0-AWQ on port 8020,
	# restricted to GPU 1.
	# The API key is quoted so it is always passed as exactly one argument, and
	# ${VLLM_API_KEY:?...} makes the shell refuse to run the command, with a
	# clear message, when the variable is unset or empty.
	export CUDA_VISIBLE_DEVICES=1
	python -m vllm.entrypoints.openai.api_server \
		--model team-hatakeyama-phase2/Tanuki-8x8B-dpo-v1.0-AWQ \
		--max-model-len 4096 --port 8020 \
		--gpu-memory-utilization 0.35 --trust-remote-code \
		--quantization awq \
		--api-key "${VLLM_API_KEY:?VLLM_API_KEY must be set}"

	###################
	#server2
	# NOTE(review): presumably a second machine, since port 8020 is also used
	# by the tanuki 8x8b command in this file - confirm.
	# OpenAI-compatible vLLM API server for ibm-granite/granite-3.0-8b-instruct
	# with fp8 quantization on port 8020, restricted to GPU 0.
	# The API key is quoted so it is always passed as exactly one argument, and
	# ${VLLM_API_KEY:?...} makes the shell refuse to run the command, with a
	# clear message, when the variable is unset or empty.
	export CUDA_VISIBLE_DEVICES=0
	python -m vllm.entrypoints.openai.api_server \
		--model ibm-granite/granite-3.0-8b-instruct --max-model-len 4096 \
		--port 8020 --gpu-memory-utilization 0.4 --trust-remote-code \
		--quantization fp8 \
		--api-key "${VLLM_API_KEY:?VLLM_API_KEY must be set}"



	#########################
	#launch ngrok
	# Open an ngrok HTTP tunnel that forwards to http://localhost:8765.
	# NOTE(review): none of the model servers in this file listen on 8765;
	# presumably this is the nginx / front-end port - confirm.
	ngrok http http://localhost:8765