# GPU Server Docker Compose Configuration
# Deploy on RunPod GPU server (10.8.0.2)
# Services accessible from VPS (10.8.0.1) via WireGuard VPN

version: '3.8'

services:
  # =============================================================================
  # vLLM - High-performance LLM Inference Server
  # =============================================================================
  vllm:
    image: vllm/vllm-openai:latest
    container_name: gpu_vllm
    restart: unless-stopped
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      CUDA_VISIBLE_DEVICES: "0"
      HF_TOKEN: ${HF_TOKEN:-}
    volumes:
      - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface
    command:
      - --model
      - meta-llama/Meta-Llama-3.1-8B-Instruct  # Change model here
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
      - --tensor-parallel-size
      - "1"
      - --gpu-memory-utilization
      - "0.85"  # Leave 15% for other tasks
      - --max-model-len
      - "8192"
      - --dtype
      - auto
      - --trust-remote-code
    ports:
      - "8000:8000"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s  # Model loading takes time
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=vllm"
      - "stack=gpu-ai"

  # =============================================================================
  # ComfyUI - Advanced Stable Diffusion Interface
  # =============================================================================
  comfyui:
    image: ghcr.io/ai-dock/comfyui:latest
    container_name: gpu_comfyui
    restart: unless-stopped
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      TZ: ${TIMEZONE:-Europe/Berlin}
      # ComfyUI auto-installs custom nodes on first run
      COMFYUI_FLAGS: "--listen 0.0.0.0 --port 8188"
    volumes:
      - comfyui_data:/data
      - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models
      - comfyui_output:/opt/ComfyUI/output
      - comfyui_input:/opt/ComfyUI/input
    ports:
      - "8188:8188"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8188/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=comfyui"
      - "stack=gpu-ai"

  # =============================================================================
  # Axolotl - LLM Fine-tuning Framework
  # =============================================================================
  # Note: This service uses "profiles" - it only starts when explicitly requested.
  # Start with: docker compose --profile training up -d axolotl
  # (see the example run command after this service definition)
  axolotl:
    image: winglian/axolotl:main-py3.11-cu121-2.2.2
    container_name: gpu_training
    runtime: nvidia
    volumes:
      - ./training/configs:/workspace/configs
      - ./training/data:/workspace/data
      - ./training/output:/workspace/output
      - ${MODELS_PATH:-/workspace/models}:/workspace/models
      - training_cache:/root/.cache
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      WANDB_API_KEY: ${WANDB_API_KEY:-}
      HF_TOKEN: ${HF_TOKEN:-}
    working_dir: /workspace
    # Default command - override when running a specific training job
    command: sleep infinity
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    profiles:
      - training
    labels:
      - "service=axolotl"
      - "stack=gpu-ai"
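  # Example (a sketch, not part of the stack): once the training profile is up,
  # a fine-tuning run can be launched inside the container. "lora-llama3.yml" is
  # a hypothetical config you would place in ./training/configs:
  #
  #   docker compose --profile training up -d axolotl
  #   docker compose exec axolotl \
  #     accelerate launch -m axolotl.cli.train /workspace/configs/lora-llama3.yml
  #
  # Checkpoints are written to ./training/output via the bind mount above.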
  # =============================================================================
  # JupyterLab - Interactive Development Environment
  # =============================================================================
  jupyter:
    image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
    container_name: gpu_jupyter
    restart: unless-stopped
    runtime: nvidia
    volumes:
      - ./notebooks:/workspace/notebooks
      - ${MODELS_PATH:-/workspace/models}:/workspace/models
      - jupyter_cache:/root/.cache
    ports:
      - "8888:8888"
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      JUPYTER_ENABLE_LAB: "yes"
      JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025}
      HF_TOKEN: ${HF_TOKEN:-}
    command: >
      bash -c "pip install --quiet jupyterlab transformers datasets accelerate
      bitsandbytes peft trl sentencepiece protobuf &&
      jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser
      --NotebookApp.token='${JUPYTER_TOKEN:-pivoine-ai-2025}'"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8888/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=jupyter"
      - "stack=gpu-ai"

  # =============================================================================
  # Netdata - System & GPU Monitoring
  # =============================================================================
  netdata:
    image: netdata/netdata:latest
    container_name: gpu_netdata
    restart: unless-stopped
    runtime: nvidia
    hostname: gpu-runpod
    cap_add:
      - SYS_PTRACE
      - SYS_ADMIN
    security_opt:
      - apparmor:unconfined
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      TZ: ${TIMEZONE:-Europe/Berlin}
    volumes:
      - /sys:/host/sys:ro
      - /proc:/host/proc:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /etc/os-release:/host/etc/os-release:ro
      - netdata_config:/etc/netdata
      - netdata_cache:/var/cache/netdata
      - netdata_lib:/var/lib/netdata
    ports:
      - "19999:19999"
    labels:
      - "service=netdata"
      - "stack=gpu-ai"

# =============================================================================
# Volumes
# =============================================================================
volumes:
  # ComfyUI data
  comfyui_data:
    driver: local
  comfyui_output:
    driver: local
  comfyui_input:
    driver: local
  # Training data
  training_cache:
    driver: local
  # Jupyter data
  jupyter_cache:
    driver: local
  # Netdata data
  netdata_config:
    driver: local
  netdata_cache:
    driver: local
  netdata_lib:
    driver: local

# =============================================================================
# Networks
# =============================================================================
networks:
  default:
    driver: bridge
    ipam:
      config:
        - subnet: 172.25.0.0/24
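# =============================================================================
# Usage examples (sketches - endpoints follow from the port mappings above;
# from the VPS, reach them at 10.8.0.2 over the WireGuard tunnel)
# =============================================================================
#
# Bring up the always-on services, plus the training profile when needed:
#   docker compose up -d
#   docker compose --profile training up -d axolotl
#
# Smoke-test vLLM's OpenAI-compatible API once the health check passes:
#   curl http://10.8.0.2:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
#          "messages": [{"role": "user", "content": "Hello"}]}'
#
# Web UIs over the tunnel:
#   ComfyUI:  http://10.8.0.2:8188
#   Jupyter:  http://10.8.0.2:8888   (token: JUPYTER_TOKEN from .env)
#   Netdata:  http://10.8.0.2:19999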