# runpod/vllm/Dockerfile

FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04
WORKDIR /app
# Install Python and system dependencies
# (pip3 and the python3 invoked in CMD resolve to Ubuntu 22.04's default
# interpreter, so install the default python3 rather than a versioned
# interpreter that would go unused)
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    git \
    && rm -rf /var/lib/apt/lists/*
# Upgrade pip
RUN pip3 install --no-cache-dir --upgrade pip
# Install vLLM and dependencies
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
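# (requirements.txt is not shown in this file; it is assumed to pin at least
# vllm plus whatever HTTP framework server.py uses to serve requests)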
# Copy application code
COPY server.py .
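# (server.py is not shown here; it is assumed to start a vLLM engine and
# expose it over HTTP on the host/port configured below)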
# Create directory for model cache
RUN mkdir -p /workspace/huggingface_cache
# Point the Hugging Face cache at /workspace so downloaded model weights can
# persist on a mounted volume (RunPod mounts its persistent volume at /workspace)
ENV HF_HOME=/workspace/huggingface_cache
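# Host and port the server binds to (assumed to be read by server.py;
# these are not standard vLLM environment variables)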
ENV VLLM_HOST=0.0.0.0
ENV VLLM_PORT=8000
# Expose port
EXPOSE 8000
# Run the server
CMD ["python3", "server.py"]
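# Example local usage (the image tag and build context below are illustrative,
# not part of the repo; on RunPod the platform supplies the GPU and mounts the
# persistent /workspace volume itself):
#   docker build -t vllm-server -f runpod/vllm/Dockerfile runpod/vllm
#   docker run --gpus all -p 8000:8000 vllm-server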