Initial commit: RunPod multi-modal AI orchestration stack
- Multi-modal AI infrastructure for RunPod RTX 4090
- Automatic model orchestration (text, image, music)
- Text: vLLM + Qwen 2.5 7B Instruct
- Image: Flux.1 Schnell via OpenEDAI
- Music: MusicGen Medium via AudioCraft
- Cost-optimized sequential loading on single GPU
- Template preparation scripts for rapid deployment
- Comprehensive documentation (README, DEPLOYMENT, TEMPLATE)
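
A minimal bring-up sketch (not part of the commit; assumes the compose file below is copied to the GPU host and an .env file supplies HF_TOKEN and MODELS_PATH):

  # on the RunPod host (10.8.0.2)
  docker compose -f gpu-server-compose.yaml up -d
  docker compose -f gpu-server-compose.yaml ps
  # from the VPS (10.8.0.1), over the WireGuard link
  curl http://10.8.0.2:8000/health
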
gpu-server-compose.yaml (new file, 237 lines)
@@ -0,0 +1,237 @@
# GPU Server Docker Compose Configuration
# Deploy on RunPod GPU server (10.8.0.2)
# Services accessible from VPS (10.8.0.1) via WireGuard VPN

version: '3.8'

services:
  # =============================================================================
  # vLLM - High-performance LLM Inference Server
  # =============================================================================
  vllm:
    image: vllm/vllm-openai:latest
    container_name: gpu_vllm
    restart: unless-stopped
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      CUDA_VISIBLE_DEVICES: "0"
      HF_TOKEN: ${HF_TOKEN:-}
    volumes:
      - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface
    command:
      - --model
      - meta-llama/Meta-Llama-3.1-8B-Instruct  # Change model here
      - --host
      - 0.0.0.0
      - --port
      - "8000"
      - --tensor-parallel-size
      - "1"
      - --gpu-memory-utilization
      - "0.85"  # Leave 15% for other tasks
      - --max-model-len
      - "8192"
      - --dtype
      - auto
      - --trust-remote-code
    ports:
      - "8000:8000"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s  # Model loading takes time
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=vllm"
      - "stack=gpu-ai"
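    # Quick check (example, not part of the service definition): the container
    # serves an OpenAI-compatible API, so from the VPS something like
    #   curl http://10.8.0.2:8000/v1/models
    # should list the loaded model once the health check passes.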

  # =============================================================================
  # ComfyUI - Advanced Stable Diffusion Interface
  # =============================================================================
  comfyui:
    image: ghcr.io/ai-dock/comfyui:latest
    container_name: gpu_comfyui
    restart: unless-stopped
    runtime: nvidia
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      TZ: ${TIMEZONE:-Europe/Berlin}
      # ComfyUI auto-installs custom nodes on first run
      COMFYUI_FLAGS: "--listen 0.0.0.0 --port 8188"
    volumes:
      - comfyui_data:/data
      - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models
      - comfyui_output:/opt/ComfyUI/output
      - comfyui_input:/opt/ComfyUI/input
    ports:
      - "8188:8188"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8188/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=comfyui"
      - "stack=gpu-ai"
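    # Note: checkpoints are read from ${MODELS_PATH:-/workspace/models}/comfyui
    # on the host (see the volume mount above); the web UI is reachable from the
    # VPS at http://10.8.0.2:8188 once the container reports healthy.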

  # =============================================================================
  # Axolotl - LLM Fine-tuning Framework
  # =============================================================================
  # Note: This service uses "profiles" - only starts when explicitly requested
  # Start with: docker compose --profile training up -d axolotl
  axolotl:
    image: winglian/axolotl:main-py3.11-cu121-2.2.2
    container_name: gpu_training
    runtime: nvidia
    volumes:
      - ./training/configs:/workspace/configs
      - ./training/data:/workspace/data
      - ./training/output:/workspace/output
      - ${MODELS_PATH:-/workspace/models}:/workspace/models
      - training_cache:/root/.cache
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      WANDB_API_KEY: ${WANDB_API_KEY:-}
      HF_TOKEN: ${HF_TOKEN:-}
    working_dir: /workspace
    # Default command - override when running specific training
    command: sleep infinity
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    profiles:
      - training
    labels:
      - "service=axolotl"
      - "stack=gpu-ai"
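    # Example fine-tuning run (illustrative; "my-run.yaml" is a hypothetical
    # config placed in ./training/configs):
    #   docker compose --profile training up -d axolotl
    #   docker compose --profile training exec axolotl \
    #     accelerate launch -m axolotl.cli.train /workspace/configs/my-run.yaml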

  # =============================================================================
  # JupyterLab - Interactive Development Environment
  # =============================================================================
  jupyter:
    image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
    container_name: gpu_jupyter
    restart: unless-stopped
    runtime: nvidia
    volumes:
      - ./notebooks:/workspace/notebooks
      - ${MODELS_PATH:-/workspace/models}:/workspace/models
      - jupyter_cache:/root/.cache
    ports:
      - "8888:8888"
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      JUPYTER_ENABLE_LAB: "yes"
      JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025}
      HF_TOKEN: ${HF_TOKEN:-}
    command: |
      bash -c "
      pip install --quiet jupyterlab transformers datasets accelerate bitsandbytes peft trl sentencepiece protobuf &&
      jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser --NotebookApp.token='${JUPYTER_TOKEN:-pivoine-ai-2025}'
      "
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8888/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    labels:
      - "service=jupyter"
      - "stack=gpu-ai"
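    # Access note: JupyterLab listens on http://10.8.0.2:8888 over the VPN; log
    # in with the token from JUPYTER_TOKEN (falls back to the default above).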

  # =============================================================================
  # Netdata - System & GPU Monitoring
  # =============================================================================
  netdata:
    image: netdata/netdata:latest
    container_name: gpu_netdata
    restart: unless-stopped
    runtime: nvidia
    hostname: gpu-runpod
    cap_add:
      - SYS_PTRACE
      - SYS_ADMIN
    security_opt:
      - apparmor:unconfined
    environment:
      NVIDIA_VISIBLE_DEVICES: all
      TZ: ${TIMEZONE:-Europe/Berlin}
    volumes:
      - /sys:/host/sys:ro
      - /proc:/host/proc:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /etc/os-release:/host/etc/os-release:ro
      - netdata_config:/etc/netdata
      - netdata_cache:/var/cache/netdata
      - netdata_lib:/var/lib/netdata
    ports:
      - "19999:19999"
    labels:
      - "service=netdata"
      - "stack=gpu-ai"
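    # Dashboard: http://10.8.0.2:19999 over the VPN. GPU charts depend on the
    # NVIDIA runtime enabled above (nvidia-smi must be visible in the container).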

# =============================================================================
# Volumes
# =============================================================================
volumes:
  # ComfyUI data
  comfyui_data:
    driver: local
  comfyui_output:
    driver: local
  comfyui_input:
    driver: local

  # Training data
  training_cache:
    driver: local

  # Jupyter data
  jupyter_cache:
    driver: local

  # Netdata data
  netdata_config:
    driver: local
  netdata_cache:
    driver: local
  netdata_lib:
    driver: local

# =============================================================================
# Networks
# =============================================================================
networks:
  default:
    driver: bridge
    ipam:
      config:
        - subnet: 172.25.0.0/24