# Image for the Python server in server.py, built on the NVIDIA CUDA 12.4
# runtime base (Ubuntu 22.04). Python dependencies come from requirements.txt.
FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04

# All subsequent relative paths (COPY destinations, CMD) resolve against /app.
WORKDIR /app

# Install Python and system dependencies.
# - update + install share one layer so the package index is never stale.
# - DEBIAN_FRONTEND is set inline (not via ENV) so packages that prompt during
#   configuration cannot hang the build, without leaking into the runtime env.
# - The apt lists are removed in the same layer to keep the image small.
# NOTE(review): on Ubuntu 22.04 the `python3` / `pip3` commands used below
# belong to the distro default interpreter, not to python3.11 -- confirm which
# interpreter the server is meant to run on.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        git \
        python3-pip \
        python3.11 \
    && rm -rf /var/lib/apt/lists/*

# Upgrade pip before installing the project requirements.
RUN pip3 install --no-cache-dir --upgrade pip

# Install vLLM and dependencies. The manifest is copied on its own, before the
# application code, so this layer is rebuilt only when requirements.txt changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code.
COPY server.py .

# Create directory for model cache.
RUN mkdir -p /workspace/huggingface_cache

# Runtime configuration.
# HF_HOME points at the cache directory created above.
# VLLM_HOST / VLLM_PORT are presumably read by server.py -- verify there; the
# port value matches the EXPOSE instruction below.
ENV HF_HOME=/workspace/huggingface_cache \
    VLLM_HOST=0.0.0.0 \
    VLLM_PORT=8000

# Document the port the server listens on (EXPOSE does not publish it).
EXPOSE 8000

# Run the server (exec form, so the Python process receives signals directly).
CMD ["python3", "server.py"]