limcheekin's picture
chore: added OPENBLAS_NUM_THREADS to specify the number of threads used by the OpenBLAS.
36e1e32
raw
history blame
954 Bytes
# Start from the python:3.11-slim base image.
FROM python:3.11-slim
# Install the compiler toolchain and libraries used to build llama-cpp-python
# against OpenBLAS, plus curl (and CA certificates) for the HTTPS model download.
# --no-install-recommends keeps optional packages out of the image, and the
# apt package lists are removed in the same layer so they do not add to its size.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        libopenblas-dev \
        ninja-build \
        pkg-config && \
    rm -rf /var/lib/apt/lists/*
# Upgrade the packaging tools, then build llama-cpp-python (with its "server"
# extra) from source, passing the OpenBLAS options to CMake through CMAKE_ARGS.
# --no-cache-dir keeps pip's cache out of the layer; the requirement is quoted
# so the shell cannot treat "[server]" as a glob pattern.
# NOTE(review): the package version is unpinned, so rebuilds may pick up a
# different release — consider pinning. Newer releases appear to use
# GGML_BLAS / GGML_BLAS_VENDOR instead of the LLAMA_BLAS names — confirm
# against the version actually installed.
RUN pip install --no-cache-dir -U pip setuptools wheel && \
    CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" FORCE_CMAKE=1 \
    pip install --no-cache-dir --verbose "llama-cpp-python[server]"
# Download the Mistral-7B-Instruct v0.1 (Q4_K_M) GGUF file to model/gguf-model.bin.
# -f makes curl exit non-zero on an HTTP error, so the build fails instead of
# silently saving an error page as the model; -L follows redirects.
RUN mkdir -p model && \
    curl -fL \
        -o model/gguf-model.bin \
        https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
# Add start_server.sh, main.py and index.html to the image's working
# directory with a single multi-source COPY.
COPY ./start_server.sh ./main.py ./index.html ./
# Give the start script its executable bit.
RUN chmod +x ./start_server.sh
# Runtime environment variables: HOST and PORT (presumably read by
# start_server.sh — verify there), and OPENBLAS_NUM_THREADS, which sets the
# number of threads OpenBLAS uses.
ENV HOST=0.0.0.0 \
    PORT=7860 \
    OPENBLAS_NUM_THREADS=1
# Declare the server port, taken from the PORT environment variable.
EXPOSE $PORT
# Default command: run the start script through /bin/sh.
CMD ["/bin/sh", "./start_server.sh"]