# runpod/gpu-server-compose.yaml

# GPU Server Docker Compose Configuration
# Deploy on RunPod GPU server (10.8.0.2)
# Services accessible from VPS (10.8.0.1) via WireGuard VPN
version: '3.8'
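# All ${VAR:-default} references below are resolved by Docker Compose from the
# environment or a `.env` file next to this compose file. An illustrative
# `.env` (every value here is a placeholder, not a real secret):
#
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx       # Hugging Face token for gated models
#   MODELS_PATH=/workspace/models      # Shared model cache on the pod volume
#   TIMEZONE=Europe/Berlin
#   WANDB_API_KEY=                     # Optional, for training telemetry
#   JUPYTER_TOKEN=change-me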
services:
  # =============================================================================
  # vLLM - High-performance LLM Inference Server
  # =============================================================================
  vllm:
    image: vllm/vllm-openai:latest
    container_name: gpu_vllm
    restart: unless-stopped
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      CUDA_VISIBLE_DEVICES: "0"
      HF_TOKEN: ${HF_TOKEN:-}
    volumes:
      - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface
    command:
      - --model
      - meta-llama/Meta-Llama-3.1-8B-Instruct # Change model here
      - --host
      - 0.0.0.0
      - --port
      - "8000" # Quoted so Compose treats it as a string, not an integer
      - --tensor-parallel-size
      - "1"
      - --gpu-memory-utilization
      - "0.85" # Leave 15% of VRAM headroom for other tasks
      - --max-model-len
      - "8192"
      - --dtype
      - auto
      - --trust-remote-code
    ports:
      - "8000:8000"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s # Model loading takes time
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=vllm"
      - "stack=gpu-ai"
  # =============================================================================
  # ComfyUI - Advanced Stable Diffusion Interface
  # =============================================================================
  comfyui:
    image: ghcr.io/ai-dock/comfyui:latest
    container_name: gpu_comfyui
    restart: unless-stopped
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      TZ: ${TIMEZONE:-Europe/Berlin}
      # ComfyUI auto-installs custom nodes on first run
      COMFYUI_FLAGS: "--listen 0.0.0.0 --port 8188"
    volumes:
      - comfyui_data:/data
      - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models
      - comfyui_output:/opt/ComfyUI/output
      - comfyui_input:/opt/ComfyUI/input
    ports:
      - "8188:8188"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8188/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=comfyui"
      - "stack=gpu-ai"
  # =============================================================================
  # Axolotl - LLM Fine-tuning Framework
  # =============================================================================
  # Note: This service uses "profiles" - it only starts when explicitly requested.
  # Start with: docker compose --profile training up -d axolotl
  axolotl:
    image: winglian/axolotl:main-py3.11-cu121-2.2.2
    container_name: gpu_training
    runtime: nvidia
    volumes:
      - ./training/configs:/workspace/configs
      - ./training/data:/workspace/data
      - ./training/output:/workspace/output
      - ${MODELS_PATH:-/workspace/models}:/workspace/models
      - training_cache:/root/.cache
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      WANDB_API_KEY: ${WANDB_API_KEY:-}
      HF_TOKEN: ${HF_TOKEN:-}
    working_dir: /workspace
    # Default command keeps the container idle - override it for a specific training run
    command: sleep infinity
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    profiles:
      - training
    labels:
      - "service=axolotl"
      - "stack=gpu-ai"
  # =============================================================================
  # JupyterLab - Interactive Development Environment
  # =============================================================================
  jupyter:
    image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
    container_name: gpu_jupyter
    restart: unless-stopped
    runtime: nvidia
    volumes:
      - ./notebooks:/workspace/notebooks
      - ${MODELS_PATH:-/workspace/models}:/workspace/models
      - jupyter_cache:/root/.cache
    ports:
      - "8888:8888"
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      JUPYTER_ENABLE_LAB: "yes"
      JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025}
      HF_TOKEN: ${HF_TOKEN:-}
    command: |
      bash -c "
      pip install --quiet jupyterlab transformers datasets accelerate bitsandbytes peft trl sentencepiece protobuf &&
      jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser --NotebookApp.token='${JUPYTER_TOKEN:-pivoine-ai-2025}'
      "
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8888/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=jupyter"
      - "stack=gpu-ai"
  # =============================================================================
  # Netdata - System & GPU Monitoring
  # =============================================================================
  netdata:
    image: netdata/netdata:latest
    container_name: gpu_netdata
    restart: unless-stopped
    runtime: nvidia
    hostname: gpu-runpod
    cap_add:
      - SYS_PTRACE
      - SYS_ADMIN
    security_opt:
      - apparmor:unconfined
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      TZ: ${TIMEZONE:-Europe/Berlin}
    volumes:
      - /sys:/host/sys:ro
      - /proc:/host/proc:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /etc/os-release:/host/etc/os-release:ro
      - netdata_config:/etc/netdata
      - netdata_cache:/var/cache/netdata
      - netdata_lib:/var/lib/netdata
    ports:
      - "19999:19999"
    labels:
      - "service=netdata"
      - "stack=gpu-ai"
# =============================================================================
# Volumes
# =============================================================================
volumes:
  # ComfyUI data
  comfyui_data:
    driver: local
  comfyui_output:
    driver: local
  comfyui_input:
    driver: local
  # Training data
  training_cache:
    driver: local
  # Jupyter data
  jupyter_cache:
    driver: local
  # Netdata data
  netdata_config:
    driver: local
  netdata_cache:
    driver: local
  netdata_lib:
    driver: local
# =============================================================================
# Networks
# =============================================================================
networks:
  default:
    driver: bridge
    ipam:
      config:
        - subnet: 172.25.0.0/24
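# =============================================================================
# Usage
# =============================================================================
# A minimal bring-up and smoke test from the GPU host (a sketch - adjust the
# -f path to wherever this file actually lives):
#
#   docker compose -f runpod/gpu-server-compose.yaml up -d
#   docker compose -f runpod/gpu-server-compose.yaml ps
#   curl -f http://localhost:8000/health   # vLLM (allow ~2 min for model load)
#   curl -f http://localhost:8188/         # ComfyUI
#   curl -f http://localhost:19999/        # Netdata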