# Base image
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
ENV DEBIAN_FRONTEND=noninteractive
# Update and install necessary dependencies
RUN apt update && \
    apt install --no-install-recommends -y \
        build-essential \
        nvidia-cuda-toolkit \
        python3 \
        python3-pip \
        wget \
        curl \
        git \
        cmake \
        zlib1g-dev \
        libblas-dev && \
    apt clean && \
    rm -rf /var/lib/apt/lists/*
# Set CUDA environment variables (the official nvidia/cuda image already provides these, but being explicit doesn't hurt)
ENV PATH="/usr/local/cuda/bin:$PATH" \
    LD_LIBRARY_PATH="/usr/local/cuda/lib64:$LD_LIBRARY_PATH" \
    CUDA_HOME="/usr/local/cuda"
WORKDIR /app
# Download ggml and mmproj models from HuggingFace
RUN wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/ggml-model-q4_k.gguf && \
    wget https://huggingface.co/mys/ggml_bakllava-1/resolve/main/mmproj-model-f16.gguf
# Clone and build llava-server with CUDA support
RUN git clone https://github.com/ggerganov/llama.cpp.git && \
    cd llama.cpp && \
    git submodule init && \
    git submodule update && \
    make LLAMA_CUBLAS=1
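# The make invocation above builds the llama.cpp binaries with cuBLAS GPU support,
# including the `server` binary at /app/llama.cpp/server that the CMD below runs.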
# Create a non-root user for security reasons
RUN useradd -m -u 1000 user && \
    mkdir -p /home/user/app && \
    cp /app/ggml-model-q4_k.gguf /home/user/app && \
    cp /app/mmproj-model-f16.gguf /home/user/app
RUN chown user:user /home/user/app/ggml-model-q4_k.gguf && \
    chown user:user /home/user/app/mmproj-model-f16.gguf
USER user
ENV HOME=/home/user
WORKDIR $HOME/app
# Expose the port
EXPOSE 8080
# Start the llava-server with models
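# --host 0.0.0.0 makes the server reachable on the exposed port;
# -ngl 30 offloads 30 model layers to the GPU; -ts 100,0 places all offloaded layers on the first GPU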
CMD ["/app/llama.cpp/server", "--model", "ggml-model-q4_k.gguf", "--mmproj", "mmproj-model-f16.gguf", "--host", "0.0.0.0", "--threads", "4", "-ngl", "30", "-ts", "100,0"]
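# Example usage (image name is illustrative, not part of this Space):
#   docker build -t bakllava-server .
#   docker run --gpus all -p 8080:8080 bakllava-server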