From 8de88d96ac4be5b82f00bf1dd731a061d162f28e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= <valknar@pivoine.art>
Date: Fri, 21 Nov 2025 12:57:06 +0100
Subject: [PATCH] docs(ai): add comprehensive GPU setup documentation and
 configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add setup guides (SETUP_GUIDE, TAILSCALE_SETUP, DOCKER_GPU_SETUP, etc.)
- Add deployment configurations (litellm-config-gpu.yaml, gpu-server-compose.yaml)
- Add GPU_DEPLOYMENT_LOG.md with current infrastructure details
- Add GPU_EXPANSION_PLAN.md with complete provider comparison
- Add deploy-gpu-stack.sh automation script

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 ai/DOCKER_GPU_SETUP.md     |  430 ++++++++++++
 ai/GPU_DEPLOYMENT_LOG.md   |  173 +++++
 ai/GPU_EXPANSION_PLAN.md   | 1306 ++++++++++++++++++++++++++++++++++++
 ai/README_GPU_SETUP.md     |  444 ++++++++++++
 ai/SETUP_GUIDE.md          |  261 +++++++
 ai/TAILSCALE_SETUP.md      |  417 ++++++++++++
 ai/WIREGUARD_SETUP.md      |  393 +++++++++++
 ai/deploy-gpu-stack.sh     |  229 +++++++
 ai/gpu-server-compose.yaml |  237 +++++++
 ai/litellm-config-gpu.yaml |  199 ++++++
 10 files changed, 4089 insertions(+)
 create mode 100644 ai/DOCKER_GPU_SETUP.md
 create mode 100644 ai/GPU_DEPLOYMENT_LOG.md
 create mode 100644 ai/GPU_EXPANSION_PLAN.md
 create mode 100644 ai/README_GPU_SETUP.md
 create mode 100644 ai/SETUP_GUIDE.md
 create mode 100644 ai/TAILSCALE_SETUP.md
 create mode 100644 ai/WIREGUARD_SETUP.md
 create mode 100755 ai/deploy-gpu-stack.sh
 create mode 100644 ai/gpu-server-compose.yaml
 create mode 100644 ai/litellm-config-gpu.yaml

diff --git a/ai/DOCKER_GPU_SETUP.md b/ai/DOCKER_GPU_SETUP.md
new file mode 100644
index 0000000..e60d103
--- /dev/null
+++ b/ai/DOCKER_GPU_SETUP.md
@@ -0,0 +1,430 @@
+# Docker & NVIDIA Container Toolkit Setup
+
+## Day 5: Docker Configuration on GPU Server
+
+This guide sets up Docker with GPU support on your RunPod server.
+
+---
+
+## Step 1: Install Docker
+
+### Quick Install (Recommended)
+
+```bash
+# SSH into GPU server
+ssh gpu-pivoine
+
+# Download and run Docker install script
+curl -fsSL https://get.docker.com -o get-docker.sh
+sh get-docker.sh
+
+# Verify installation
+docker --version
+docker compose version
+```
+
+Expected output:
+```
+Docker version 24.0.7, build afdd53b
+Docker Compose version v2.23.0
+```
+
+### Manual Install (Alternative)
+
+```bash
+# Add Docker's official GPG key
+apt-get update
+apt-get install -y ca-certificates curl gnupg
+install -m 0755 -d /etc/apt/keyrings
+curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
+chmod a+r /etc/apt/keyrings/docker.gpg
+
+# Add repository
+echo \
+  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
+  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+  tee /etc/apt/sources.list.d/docker.list > /dev/null
+
+# Install Docker
+apt-get update
+apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
+
+# Start Docker
+systemctl enable docker
+systemctl start docker
+```
+
+---
+
+## Step 2: Install NVIDIA Container Toolkit
+
+This enables Docker containers to use the GPU.
+
+```bash
+# Add NVIDIA repository
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
+  gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+
+curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+
+# Install toolkit
+apt-get update
+apt-get install -y nvidia-container-toolkit
+
+# Configure Docker to use NVIDIA runtime
+nvidia-ctk runtime configure --runtime=docker
+
+# Restart Docker
+systemctl restart docker
+```
+
+---
+
+## Step 3: Test GPU Access in Docker
+
+### Test 1: Basic CUDA Container
+
+```bash
+docker run --rm --runtime=nvidia --gpus all \
+  nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+```
+
+Expected output: Same as `nvidia-smi` output showing your RTX 4090.
+
+### Test 2: PyTorch Container
+
+```bash
+docker run --rm --runtime=nvidia --gpus all \
+  pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime \
+  python -c "import torch; print('CUDA:', torch.cuda.is_available(), 'Device:', torch.cuda.get_device_name(0))"
+```
+
+Expected output:
+```
+CUDA: True Device: NVIDIA GeForce RTX 4090
+```
+
+### Test 3: Multi-GPU Query (if you have multiple GPUs)
+
+```bash
+docker run --rm --runtime=nvidia --gpus all \
+  nvidia/cuda:12.1.0-base-ubuntu22.04 \
+  bash -c "echo 'GPU Count:' && nvidia-smi --list-gpus"
+```
+
+---
+
+## Step 4: Configure Docker Compose with GPU Support
+
+Docker Compose needs to know about the NVIDIA runtime.
+
+### Create daemon.json
+
+```bash
+cat > /etc/docker/daemon.json << 'EOF'
+{
+  "runtimes": {
+    "nvidia": {
+      "path": "nvidia-container-runtime",
+      "runtimeArgs": []
+    }
+  },
+  "default-runtime": "nvidia",
+  "log-driver": "json-file",
+  "log-opts": {
+    "max-size": "10m",
+    "max-file": "3"
+  }
+}
+EOF
+
+# Restart Docker
+systemctl restart docker
+```
+
+---
+
+## Step 5: Create GPU Project Structure
+
+```bash
+cd /workspace
+
+# Create directory structure
+mkdir -p gpu-stack/{vllm,comfyui,training,jupyter,monitoring}
+cd gpu-stack
+
+# Create .env file
+cat > .env << 'EOF'
+# GPU Stack Environment Variables
+
+# Timezone
+TIMEZONE=Europe/Berlin
+
+# VPN Network
+VPS_IP=10.8.0.1
+GPU_IP=10.8.0.2
+
+# Model Storage
+MODELS_PATH=/workspace/models
+
+# Hugging Face (optional, for private models)
+HF_TOKEN=
+
+# PostgreSQL (on VPS)
+DB_HOST=10.8.0.1
+DB_PORT=5432
+DB_USER=valknar
+DB_PASSWORD=CHANGE_ME
+DB_NAME=openwebui
+
+# Weights & Biases (optional, for training logging)
+WANDB_API_KEY=
+EOF
+
+chmod 600 .env
+```
+
+---
+
+## Step 6: Test Full Stack (Quick Smoke Test)
+
+Let's deploy a minimal vLLM container to verify everything works:
+
+```bash
+cd /workspace/gpu-stack
+
+# Create test compose file
+cat > test-compose.yaml << 'EOF'
+services:
+  test-vllm:
+    image: vllm/vllm-openai:latest
+    container_name: test_vllm
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+    command:
+      - --model
+      - facebook/opt-125m  # Tiny model for testing
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8000"
+    ports:
+      - "8000:8000"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+EOF
+
+# Start test
+docker compose -f test-compose.yaml up -d
+
+# Wait 30 seconds for model download
+sleep 30
+
+# Check logs
+docker compose -f test-compose.yaml logs
+
+# Test inference
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "facebook/opt-125m",
+    "prompt": "Hello, my name is",
+    "max_tokens": 10
+  }'
+```
+
+Expected output: a JSON response containing the generated text.
+
+**Clean up test:**
+```bash
+docker compose -f test-compose.yaml down
+```
+
+---
+
+## Step 7: Install Additional Tools
+
+```bash
+# Python tools
+apt install -y python3-pip python3-venv
+
+# Monitoring tools
+apt install -y htop nvtop iotop
+
+# Network tools
+apt install -y iperf3 tcpdump iftop
+
+# Development tools
+apt install -y build-essential
+
+# Git LFS (for large model files)
+apt install -y git-lfs
+git lfs install
+```
+
+---
+
+## Step 8: Configure Automatic Updates (Optional)
+
+```bash
+# Install unattended-upgrades
+apt install -y unattended-upgrades
+
+# Configure
+dpkg-reconfigure -plow unattended-upgrades
+
+# Enable automatic security updates
+cat > /etc/apt/apt.conf.d/50unattended-upgrades << 'EOF'
+Unattended-Upgrade::Allowed-Origins {
+    "${distro_id}:${distro_codename}-security";
+};
+Unattended-Upgrade::Automatic-Reboot "false";
+Unattended-Upgrade::Remove-Unused-Dependencies "true";
+EOF
+```
+
+---
+
+## Troubleshooting
+
+### Docker can't access GPU
+
+**Problem:** `docker: Error response from daemon: could not select device driver "" with capabilities: [[gpu]]`
+
+**Solution:**
+```bash
+# Verify NVIDIA runtime is configured
+docker info | grep -i runtime
+
+# Should show nvidia in runtimes list
+# If not, reinstall nvidia-container-toolkit
+
+# Check daemon.json
+cat /etc/docker/daemon.json
+
+# Restart Docker
+systemctl restart docker
+```
+
+### Permission denied on docker commands
+
+**Solution:**
+```bash
+# Add your user to docker group (if not root)
+usermod -aG docker $USER
+
+# Or always use sudo
+sudo docker ...
+```
+
+### Out of disk space
+
+**Check usage:**
+```bash
+df -h
+du -sh /var/lib/docker
+docker system df
+```
+
+**Clean up:**
+```bash
+# Remove unused images
+docker image prune -a
+
+# Remove unused volumes
+docker volume prune
+
+# Full cleanup
+docker system prune -a --volumes
+```
+
+---
+
+## Verification Checklist
+
+Before deploying the full stack:
+
+- [ ] Docker installed and running
+- [ ] `docker --version` shows 24.x or newer
+- [ ] `docker compose version` works
+- [ ] NVIDIA Container Toolkit installed
+- [ ] `docker run --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi` works
+- [ ] PyTorch container can see GPU
+- [ ] Test vLLM deployment successful
+- [ ] /workspace directory structure created
+- [ ] .env file configured with VPN IPs
+- [ ] Additional tools installed (nvtop, htop, etc.)
+
+---
+
+## Performance Monitoring Commands
+
+**GPU Monitoring:**
+```bash
+# Real-time GPU stats
+watch -n 1 nvidia-smi
+
+# Or with nvtop (prettier)
+nvtop
+
+# GPU memory usage
+nvidia-smi --query-gpu=memory.used,memory.total --format=csv
+```
+
+**Docker Stats:**
+```bash
+# Container resource usage
+docker stats
+
+# Specific container
+docker stats vllm --no-stream
+```
+
+**System Resources:**
+```bash
+# Overall system
+htop
+
+# I/O stats
+iotop
+
+# Network
+iftop
+```
+
+---
+
+## Next: Deploy Production Stack
+
+Now you're ready to deploy the full GPU stack with vLLM, ComfyUI, and training tools.
+
+**Proceed to:** Deploying the production stack with `gpu-server-compose.yaml`
+
+**Save your progress:**
+
+```bash
+cat >> /workspace/SERVER_INFO.md << 'EOF'
+
+## Docker Configuration
+- Docker Version: [docker --version]
+- NVIDIA Runtime: Enabled
+- GPU Access in Containers: ✓
+- Test vLLM Deployment: Successful
+- Directory: /workspace/gpu-stack
+
+## Tools Installed
+- nvtop: GPU monitoring
+- htop: System monitoring
+- Docker Compose: v2.x
+- Git LFS: Large file support
+EOF
+```
diff --git a/ai/GPU_DEPLOYMENT_LOG.md b/ai/GPU_DEPLOYMENT_LOG.md
new file mode 100644
index 0000000..428d4a4
--- /dev/null
+++ b/ai/GPU_DEPLOYMENT_LOG.md
@@ -0,0 +1,173 @@
+# GPU Server Deployment Log
+
+## Current Deployment (2025-11-21)
+
+### Infrastructure
+- **Provider**: RunPod (Spot Instance)
+- **GPU**: NVIDIA RTX 4090 24GB
+- **Disk**: 50GB local SSD (expanded from 20GB)
+- **Network Volume**: 922TB at `/workspace`
+- **Region**: Europe
+- **Cost**: ~$0.50/hour (~$360/month if running 24/7)
+
+### Network Configuration
+- **VPN**: Tailscale (replaces WireGuard due to RunPod UDP restrictions)
+- **GPU Server Tailscale IP**: 100.100.108.13
+- **VPS Tailscale IP**: (get with `tailscale ip -4` on VPS)
+
+### SSH Access
+```
+Host gpu-pivoine
+    HostName 213.173.102.232
+    Port 29695
+    User root
+    IdentityFile ~/.ssh/id_ed25519
+```
+
+**Note**: RunPod Spot instances can be terminated and restarted with new ports/IPs. Update SSH config accordingly.
+
+### Software Stack
+- **Python**: 3.11.10
+- **vLLM**: 0.6.4.post1 (installed with pip)
+- **PyTorch**: 2.5.1 with CUDA 12.4
+- **Tailscale**: Installed via official script
+
+### vLLM Deployment
+
+**Custom Server**: `ai/simple_vllm_server.py`
+- Uses `AsyncLLMEngine` directly to bypass multiprocessing issues
+- OpenAI-compatible API endpoints:
+  - `GET /v1/models` - List available models
+  - `POST /v1/completions` - Text completion
+  - `POST /v1/chat/completions` - Chat completion
+- Default model: Qwen/Qwen2.5-7B-Instruct
+- Cache directory: `/workspace/huggingface_cache`
+
+**Deployment Command**:
+```bash
+# Copy server script to GPU server
+scp ai/simple_vllm_server.py gpu-pivoine:/workspace/
+
+# Start server
+ssh gpu-pivoine "cd /workspace && nohup python3 simple_vllm_server.py > vllm.log 2>&1 &"
+
+# Check status
+ssh gpu-pivoine "curl http://localhost:8000/v1/models"
+```
+
+**Server Configuration** (environment variables):
+- `VLLM_HOST`: 0.0.0.0 (default)
+- `VLLM_PORT`: 8000 (default)
+
+### Model Configuration
+- **Model**: Qwen/Qwen2.5-7B-Instruct (no auth required)
+- **Context Length**: 4096 tokens
+- **GPU Memory**: 85% utilization
+- **Tensor Parallel**: 1 (single GPU)
+
+### Known Issues & Solutions
+
+#### Issue 1: vLLM Multiprocessing Errors
+**Problem**: Default vLLM v1 engine fails with ZMQ/CUDA multiprocessing errors on RunPod.
+**Solution**: Custom `AsyncLLMEngine` FastAPI server bypasses multiprocessing layer entirely.
+
+#### Issue 2: Disk Space (Solved)
+**Problem**: Original 20GB disk filled up with Hugging Face cache.
+**Solution**: Expanded to 50GB and use `/workspace` for model cache.
+
+#### Issue 3: Gated Models
+**Problem**: Llama models require Hugging Face authentication.
+**Solution**: Use Qwen 2.5 7B Instruct (no auth required) or set `HF_TOKEN` environment variable.
+
+#### Issue 4: Spot Instance Volatility
+**Problem**: RunPod Spot instances can be terminated anytime.
+**Solution**: Accept as trade-off for cost savings. Document SSH details for quick reconnection.
+
+### Monitoring
+
+**Check vLLM logs**:
+```bash
+ssh gpu-pivoine "tail -f /workspace/vllm.log"
+```
+
+**Check GPU usage**:
+```bash
+ssh gpu-pivoine "nvidia-smi"
+```
+
+**Check Tailscale status**:
+```bash
+ssh gpu-pivoine "tailscale status"
+```
+
+**Test API locally (on GPU server)**:
+```bash
+ssh gpu-pivoine "curl http://localhost:8000/v1/models"
+```
+
+**Test API via Tailscale (from VPS)**:
+```bash
+curl http://100.100.108.13:8000/v1/models
+```
+
+### LiteLLM Integration
+
+Update VPS LiteLLM config at `ai/litellm-config-gpu.yaml`:
+
+```yaml
+# Replace old WireGuard IP (10.8.0.2) with Tailscale IP
+- model_name: qwen-2.5-7b
+  litellm_params:
+    model: openai/qwen-2.5-7b
+    api_base: http://100.100.108.13:8000/v1  # Tailscale IP
+    api_key: dummy
+    rpm: 1000
+    tpm: 100000
+```
+
+Restart LiteLLM:
+```bash
+arty restart litellm
+```
+
+### Troubleshooting
+
+**Server not responding**:
+1. Check if process is running: `pgrep -f simple_vllm_server`
+2. Check logs: `tail -100 /workspace/vllm.log`
+3. Check GPU availability: `nvidia-smi`
+4. Restart server: `pkill -f simple_vllm_server; cd /workspace && nohup python3 simple_vllm_server.py > vllm.log 2>&1 &`
+
+**Tailscale not connected**:
+1. Check status: `tailscale status`
+2. Check daemon: `ps aux | grep tailscaled`
+3. Restart: `tailscale down && tailscale up`
+
+**Model download failing**:
+1. Check disk space: `df -h`
+2. Check cache directory: `ls -lah /workspace/huggingface_cache`
+3. Clear cache if needed: `rm -rf /workspace/huggingface_cache/*`
+
+### Next Steps
+1. ✅ Deploy vLLM with Qwen 2.5 7B
+2. ⏳ Test API endpoints locally and via Tailscale
+3. ⏳ Update VPS LiteLLM configuration
+4. ⏳ Test end-to-end: Open WebUI → LiteLLM → vLLM
+5. ⏹️ Monitor performance and costs
+6. ⏹️ Consider adding more models (Mistral, DeepSeek Coder)
+7. ⏹️ Set up auto-stop for idle periods to save costs
+
+### Cost Optimization Ideas
+1. **Auto-stop**: Configure RunPod to auto-stop after 30 minutes idle
+2. **Spot Instances**: Already using Spot for 50% cost reduction
+3. **Scheduled Operation**: Run only during business hours (8 hours/day = $120/month)
+4. **Smaller Models**: Use Mistral 7B or quantized models for lighter workloads
+5. **Pay-as-you-go**: Manually start/stop pod as needed
+
+### Performance Benchmarks
+*To be measured after deployment*
+
+Expected (based on RTX 4090):
+- Qwen 2.5 7B: 50-80 tokens/second
+- Context processing: ~2-3 seconds for 1000 tokens
+- First token latency: ~200-300ms
diff --git a/ai/GPU_EXPANSION_PLAN.md b/ai/GPU_EXPANSION_PLAN.md
new file mode 100644
index 0000000..d34ea01
--- /dev/null
+++ b/ai/GPU_EXPANSION_PLAN.md
@@ -0,0 +1,1306 @@
+# GPU-Enhanced AI Stack Expansion Plan
+
+## Executive Summary
+
+This document outlines a comprehensive plan to extend the current AI stack (LiteLLM, Open WebUI, Crawl4AI) with dedicated GPU hosting capabilities for:
+- **LLM Model Hosting**: Self-hosted models (Llama, Mistral, Qwen, etc.)
+- **Model Training**: Fine-tuning and training workflows
+- **Image Generation**: Stable Diffusion, FLUX via ComfyUI
+- **Video Generation**: AnimateDiff, CogVideo, etc.
+
+**Current Architecture**: CPU-based stack on pivoine.art VPS → Claude API via LiteLLM
+**Target Architecture**: Hybrid stack with GPU server(s) for self-hosted models + API-based models
+
+---
+
+## Phase 1: Current Stack Analysis
+
+### Existing Components
+
+1. **ai_postgres** (pgvector/pgvector:pg16)
+   - PostgreSQL with pgvector for RAG
+   - Stores: conversations, embeddings, LiteLLM logs
+
+2. **webui** (Open WebUI)
+   - User-facing ChatGPT-like interface
+   - URL: https://ai.pivoine.art
+   - Features: RAG, web search, document upload
+   - Connected to LiteLLM proxy
+
+3. **litellm** (LiteLLM proxy)
+   - Currently proxies Anthropic Claude API
+   - OpenAI-compatible endpoint at http://litellm:4000
+   - Supports multiple providers via config
+
+4. **crawl4ai**
+   - Internal web scraping for LLM content prep
+   - Port 11235 (internal only)
+
+5. **facefusion** (CPU-only)
+   - Face swapping/enhancement
+   - Currently CPU-based (slow)
+   - Protected by Authelia SSO
+
+### Current Limitations
+
+- ❌ No self-hosted LLMs (relies on expensive API calls)
+- ❌ No GPU acceleration for facefusion
+- ❌ No image generation capabilities
+- ❌ No model training/fine-tuning capabilities
+- ❌ No video generation
+- ❌ High operational costs for API usage
+
+---
+
+## Phase 2: GPU Provider Comparison
+
+### Provider Options
+
+#### 1. **RunPod** ⭐ RECOMMENDED
+**Pros:**
+- Pay-per-second GPU billing
+- Wide GPU selection (RTX 4090, A100, H100)
+- Docker-first platform
+- Global locations
+- Easy HTTP/SSH tunneling
+- Volume persistence
+
+**Pricing (Approximate):**
+- RTX 4090 (24GB): ~$0.50/hour ($360/month 24/7)
+- RTX 3090 (24GB): ~$0.35/hour ($250/month)
+- A6000 (48GB): ~$0.80/hour ($576/month)
+- A100 (40GB): ~$1.50/hour ($1,080/month)
+
+**Best for:** On-demand workloads, experimentation, cost-conscious hosting
+
+---
+
+#### 2. **Lambda Labs**
+**Pros:**
+- Flat monthly pricing
+- High-end GPUs (A100, H100)
+- Jupyter notebooks included
+- Fast network
+
+**Pricing:**
+- 1x A100 (40GB): $1.10/hour ($792/month)
+- 8x A100 (40GB): $8.00/hour (~$5,760/month)
+
+**Best for:** Research, high-utilization workloads
+
+---
+
+#### 3. **Vast.ai**
+**Pros:**
+- Marketplace model (cheapest)
+- Many GPU options
+- Spot pricing available
+
+**Cons:**
+- Variable reliability
+- Setup complexity
+- Community-hosted machines
+
+**Pricing:**
+- RTX 4090: ~$0.25-0.40/hour
+- A100: ~$0.80-1.20/hour
+
+**Best for:** Budget-conscious, experimental workloads
+
+---
+
+#### 4. **Google Cloud Platform (GCP)**
+**Pros:**
+- Enterprise reliability
+- Auto-scaling
+- Integration with Google services
+- Preemptible instances available
+
+**Pricing:**
+- T4 (16GB): ~$0.35/hour
+- V100 (16GB): ~$2.48/hour
+- A100 (40GB): ~$2.93/hour
+- TPU options available
+
+**Best for:** Enterprise workloads, auto-scaling needs
+
+---
+
+#### 5. **AWS**
+**Pros:**
+- Global infrastructure
+- Broad GPU selection
+- Spot instances for cost savings
+- Enterprise support
+
+**Pricing:**
+- g4dn.xlarge (T4 16GB): ~$0.526/hour
+- p3.2xlarge (V100 16GB): ~$3.06/hour
+- p4d.24xlarge (8x A100 40GB): ~$32.77/hour
+
+**Best for:** Enterprise, existing AWS infrastructure
+
+---
+
+#### 6. **Hugging Face Spaces / Inference Endpoints**
+**Pros:**
+- Managed model hosting
+- Auto-scaling
+- Simple deployment
+- Community models
+
+**Pricing:**
+- CPU: $0.03/hour
+- T4: $0.60/hour
+- A10G: $1.00/hour
+- A100: $4.00/hour
+
+**Best for:** Quick model deployment, serverless inference
+
+---
+
+### Recommendation: **RunPod** for Primary GPU Server
+
+**Rationale:**
+1. **Cost-effective**: Pay-per-second billing, ~$0.50/hour for RTX 4090
+2. **Docker-native**: Easy integration with existing compose stack
+3. **Flexibility**: Start/stop as needed, scale up for training
+4. **Community**: Large user base, good documentation
+5. **Network**: Built-in HTTP/SSH tunneling
+
+**Supplementary**: Use Hugging Face for specific model hosting if needed
+
+---
+
+## Phase 3: Architecture Design
+
+### Network Topology
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ pivoine.art VPS (CPU-based)                                 │
+├─────────────────────────────────────────────────────────────┤
+│                                                              │
+│  ┌──────────┐      ┌──────────┐      ┌──────────┐         │
+│  │ Open     │─────▶│ LiteLLM  │◀────▶│ ai_      │         │
+│  │ WebUI    │      │ Proxy    │      │ postgres │         │
+│  └──────────┘      └──────────┘      └──────────┘         │
+│       │                  │                                  │
+│       │                  │                                  │
+└───────┼──────────────────┼──────────────────────────────────┘
+        │                  │
+        │                  ▼
+        │         ┌─────────────────┐
+        │         │ Anthropic API   │
+        │         │ (Claude)        │
+        │         └─────────────────┘
+        │
+        ▼
+┌────────────────────────────────────────────────────────────┐
+│ GPU Server (RunPod)                                        │
+├────────────────────────────────────────────────────────────┤
+│                                                             │
+│  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐ │
+│  │ vLLM     │  │ ComfyUI  │  │ Model    │  │ JupyterLab│ │
+│  │ (LLMs)   │  │ (SD/FLUX)│  │ Training │  │           │ │
+│  └──────────┘  └──────────┘  └──────────┘  └──────────┘ │
+│       │              │             │              │       │
+│       └──────────────┴─────────────┴──────────────┘       │
+│                      │                                     │
+│              ┌───────────────┐                            │
+│              │ Model Storage │                            │
+│              │ (Persistent)  │                            │
+│              └───────────────┘                            │
+│                                                            │
+└────────────────────────────────────────────────────────────┘
+         │
+         ▼ (Tunneled via WireGuard or Tailscale)
+┌────────────────────────────────────────────────────────────┐
+│ Integration Options:                                       │
+├────────────────────────────────────────────────────────────┤
+│ 1. LiteLLM adds vLLM endpoint (http://gpu.internal:8000)  │
+│ 2. ComfyUI exposed via subdomain (comfy.ai.pivoine.art)   │
+│ 3. Model storage synced via rclone/restic                 │
+└────────────────────────────────────────────────────────────┘
+```
+
+### Connection Methods
+
+#### Option A: WireGuard VPN (RECOMMENDED)
+- Create WireGuard tunnel between VPS and GPU server
+- GPU services accessible via private IPs
+- Secure, low overhead, easy to manage
+- Already have wg-easy in your stack
+
+**Setup:**
+1. Deploy WireGuard on GPU server
+2. Add GPU server as VPN peer
+3. Configure LiteLLM to use VPN IPs
+
+#### Option B: SSH Tunnel
+- SSH reverse tunnel from GPU to VPS
+- Simple, no additional software
+- Higher latency
+
+#### Option C: Tailscale
+- Zero-config VPN mesh
+- Easy setup, good UX
+- Proprietary (but free tier available)
+
+---
+
+## Phase 4: Service Implementation Plans
+
+### 4.1 LLM Hosting with vLLM
+
+**vLLM** is a widely used engine for high-performance LLM inference.
+
+#### Features:
+- PagedAttention for efficient KV cache
+- Continuous batching
+- OpenAI-compatible API
+- Tensor parallelism for multi-GPU
+- Quantization support (AWQ, GPTQ)
+
+#### Docker Compose Configuration:
+
+```yaml
+services:
+  vllm:
+    image: vllm/vllm-openai:latest
+    container_name: gpu_vllm
+    restart: unless-stopped
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      CUDA_VISIBLE_DEVICES: 0
+    volumes:
+      - vllm_models:/root/.cache/huggingface
+    command:
+      - --model
+      - meta-llama/Meta-Llama-3.1-8B-Instruct  # or any model
+      - --host
+      - 0.0.0.0
+      - --port
+      - '8000'
+      - --tensor-parallel-size
+      - '1'
+      - --gpu-memory-utilization
+      - '0.9'
+      - --max-model-len
+      - '8192'
+    ports:
+      - "8000:8000"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+```
+
+#### Recommended Models for RTX 4090 (24GB):
+
+**Text Generation:**
+- Llama 3.1 8B Instruct (8GB VRAM, fast)
+- Qwen2.5 14B Instruct (14GB VRAM, multilingual)
+- Mistral 7B Instruct v0.3 (7GB VRAM)
+- Nous Hermes 2 Mixtral 8x7B (with quantization, 16GB)
+
+**Code:**
+- DeepSeek Coder 6.7B (7GB VRAM)
+- CodeLlama 13B (13GB VRAM)
+- Qwen2.5-Coder 14B (14GB VRAM)
+
+#### Integration with LiteLLM:
+
+Add to `ai/litellm-config.yaml`:
+
+```yaml
+model_list:
+  # Existing Anthropic
+  - model_name: claude-sonnet-4-5
+    litellm_params:
+      model: anthropic/claude-sonnet-4-5-20250929
+      api_key: os.environ/ANTHROPIC_API_KEY
+
+  # New vLLM models
+  - model_name: llama-3.1-8b
+    litellm_params:
+      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
+      api_base: http://gpu.internal:8000/v1
+      api_key: dummy
+
+  - model_name: qwen-2.5-14b
+    litellm_params:
+      model: openai/Qwen/Qwen2.5-14B-Instruct
+      api_base: http://gpu.internal:8000/v1
+      api_key: dummy
+```
+
+---
+
+### 4.2 ComfyUI for Image/Video Generation
+
+**ComfyUI** is a node-based UI for Stable Diffusion with advanced workflows.
+
+#### Features:
+- Node-based workflow editor
+- Support for SD 1.5, SDXL, SD3, FLUX
+- ControlNet, LoRA, embeddings
+- Video generation (AnimateDiff, SVD)
+- API for automation
+
+#### Docker Compose Configuration:
+
+```yaml
+services:
+  comfyui:
+    image: ghcr.io/ai-dock/comfyui:latest
+    container_name: gpu_comfyui
+    restart: unless-stopped
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      # Listen on all interfaces on port 8188 (matches the published port below)
+      COMFYUI_FLAGS: --listen 0.0.0.0 --port 8188
+    volumes:
+      - comfyui_data:/data
+      - comfyui_models:/opt/ComfyUI/models
+      - comfyui_output:/opt/ComfyUI/output
+    ports:
+      - "8188:8188"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+```
+
+#### Model Downloads (via ComfyUI Manager):
+
+**Stable Diffusion Models:**
+- FLUX.1-dev (12GB, newest, best quality)
+- FLUX.1-schnell (12GB, fast)
+- SDXL Base 1.0 (6.9GB)
+- SD 1.5 (4GB, fast, wide LoRA support)
+
+**ControlNet Models:**
+- controlnet-canny-sdxl
+- controlnet-depth-sdxl
+- controlnet-openpose-sdxl
+
+**LoRA Models** (download from Civitai):
+- Style LoRAs (anime, realistic, etc.)
+- Character LoRAs
+- Concept LoRAs
+
+#### Traefik Integration:
+
+Add subdomain routing for ComfyUI:
+
+```yaml
+labels:
+  - 'traefik.enable=true'
+  - 'traefik.http.routers.comfyui-web-secure.rule=Host(`comfy.ai.pivoine.art`)'
+  - 'traefik.http.routers.comfyui-web-secure.tls.certresolver=resolver'
+  - 'traefik.http.routers.comfyui-web-secure.entrypoints=web-secure'
+  - 'traefik.http.routers.comfyui-web-secure.middlewares=net-authelia,security-headers@file'
+  - 'traefik.http.services.comfyui.loadbalancer.server.port=8188'
+```
+
+#### Open WebUI Integration:
+
+ComfyUI has a REST API that can be called from Open WebUI using function calling.
+
+Example workflow API call:
+```python
+import requests
+
+def generate_image(prompt: str, negative_prompt: str = ""):
+    workflow = {
+        # ComfyUI workflow JSON
+    }
+    response = requests.post(
+        "http://comfyui:8188/prompt",
+        json={"prompt": workflow}
+    )
+    return response.json()
+```
+
+---
+
+### 4.3 Model Training Infrastructure
+
+For fine-tuning LLMs and training custom models.
+
+#### Option A: Axolotl (Recommended)
+
+**Axolotl** is a user-friendly fine-tuning framework supporting:
+- LoRA, QLoRA
+- Full fine-tuning
+- RLHF/DPO
+- Multi-GPU training
+
+```yaml
+services:
+  axolotl:
+    image: winglian/axolotl:main-py3.11-cu121-2.2.2
+    container_name: gpu_training
+    runtime: nvidia
+    volumes:
+      - ./training/configs:/workspace/configs
+      - ./training/data:/workspace/data
+      - ./training/output:/workspace/output
+      - training_cache:/root/.cache
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      WANDB_API_KEY: ${WANDB_API_KEY:-}  # Optional: Weights & Biases logging
+    command: |
+      bash -c "
+      accelerate launch -m axolotl.cli.train /workspace/configs/config.yaml
+      "
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+```
+
+#### Training Workflow:
+1. Prepare dataset (JSONL format)
+2. Create Axolotl config (LoRA, batch size, epochs)
+3. Start training container
+4. Monitor via Weights & Biases or TensorBoard
+5. Export LoRA adapters
+6. Merge with base model or use in vLLM
+
+#### Example Config:
+```yaml
+# training/configs/config.yaml (mounted as /workspace/configs/config.yaml)
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: /workspace/data/train.jsonl
+    type: completion
+    field: text
+
+output_dir: /workspace/output/llama3-lora
+
+adapter: lora
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+learning_rate: 0.0002
+
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+warmup_steps: 100
+```
+
+#### Option B: JupyterLab for Custom Training
+
+For research and custom training scripts:
+
+```yaml
+services:
+  jupyter:
+    image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
+    container_name: gpu_jupyter
+    runtime: nvidia
+    volumes:
+      - ./notebooks:/workspace
+      - jupyter_cache:/root/.cache
+    ports:
+      - "8888:8888"
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      JUPYTER_ENABLE_LAB: "yes"
+    command: |
+      bash -c "
+      pip install jupyterlab transformers datasets accelerate bitsandbytes peft &&
+      jupyter lab --ip=0.0.0.0 --allow-root --no-browser --NotebookApp.token=''
+      "
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+```
+
+---
+
+### 4.4 Model Storage Strategy
+
+#### Storage Requirements:
+
+**Per Model Type:**
+- LLM 7B: ~14GB (FP16)
+- LLM 13B: ~26GB
+- SDXL: ~7GB
+- FLUX: ~12GB
+- ControlNet: ~2.5GB each
+- LoRA: ~100-500MB each
+
+**Total Estimated:**
+- 3-4 LLMs: ~80GB
+- SD models + LoRAs: ~50GB
+- Training checkpoints: ~100GB
+- **Total: 250-300GB minimum**
+
+#### RunPod Storage Options:
+
+1. **Network Volume** (Recommended)
+   - Persistent across pod restarts
+   - Shared between multiple pods
+   - ~$0.10/GB/month
+   - 500GB = $50/month
+
+2. **Container Disk**
+   - Included with pod
+   - Lost when pod stops
+   - Good for temporary storage
+
+3. **External Storage (rclone)**
+   - Sync to/from VPS or cloud storage
+   - Backup models to Backblaze B2 or Wasabi
+   - Good for disaster recovery
+
+#### Model Management:
+
+Use **Hugging Face Hub** as model cache:
+
+```bash
+# Download models on first run
+huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct \
+  --local-dir /models/llama-3.1-8b
+
+# Or let vLLM/ComfyUI auto-download
+```
+
+**Model Sync Script:**
+```bash
+#!/bin/bash
+# sync-models.sh - Sync models from VPS to GPU server
+
+rclone sync \
+  /mnt/hidrive/AI/models \
+  gpu:/workspace/models \
+  --progress \
+  --transfers 4
+```
+
+---
+
+## Phase 5: Implementation Roadmap
+
+### Week 1: Infrastructure Setup
+
+**Day 1-2: RunPod Account & GPU Server**
+- [ ] Create RunPod account
+- [ ] Deploy RTX 4090 pod with Ubuntu 22.04 + PyTorch template
+- [ ] Configure persistent network volume (500GB)
+- [ ] Set up SSH access
+
+**Day 3-4: Network Configuration**
+- [ ] Deploy WireGuard on GPU server
+- [ ] Add GPU server as peer to existing VPN (vpn/compose.yaml)
+- [ ] Test connectivity between VPS and GPU server
+- [ ] Configure firewall rules
+
+**Day 5: Docker Setup on GPU Server**
+- [ ] Install Docker + NVIDIA Container Toolkit
+- [ ] Create docker-compose.yaml for GPU services
+- [ ] Test GPU access in containers
+
+---
+
+### Week 2: LLM Hosting
+
+**Day 1-2: vLLM Deployment**
+- [ ] Deploy vLLM container
+- [ ] Download Llama 3.1 8B Instruct
+- [ ] Test inference locally
+- [ ] Benchmark performance (tokens/sec)
+
+**Day 3-4: LiteLLM Integration**
+- [ ] Update litellm-config.yaml with vLLM endpoint
+- [ ] Test via Open WebUI
+- [ ] Configure model routing (cheap models → vLLM, complex → Claude)
+- [ ] Set up usage monitoring
+
+**Day 5: Model Expansion**
+- [ ] Download Qwen 2.5 14B
+- [ ] Download Mistral 7B Instruct
+- [ ] Test model switching in Open WebUI
+- [ ] Document performance characteristics
+
+---
+
+### Week 3: Image Generation
+
+**Day 1-2: ComfyUI Setup**
+- [ ] Deploy ComfyUI container
+- [ ] Download FLUX.1-schnell
+- [ ] Download SDXL
+- [ ] Install ComfyUI Manager
+
+**Day 3-4: Model Downloads**
+- [ ] Download ControlNet models
+- [ ] Download VAE models
+- [ ] Download popular LoRAs from Civitai
+- [ ] Organize model directory
+
+**Day 5: Integration & Workflows**
+- [ ] Create basic text-to-image workflow
+- [ ] Create ControlNet workflow
+- [ ] Test API access
+- [ ] Add Traefik subdomain (comfy.ai.pivoine.art)
+
+---
+
+### Week 4: Training Infrastructure
+
+**Day 1-2: Axolotl Setup**
+- [ ] Deploy Axolotl container
+- [ ] Create sample dataset
+- [ ] Test LoRA fine-tuning with tiny model
+- [ ] Verify GPU utilization
+
+**Day 3-4: JupyterLab Setup**
+- [ ] Deploy JupyterLab container
+- [ ] Install ML libraries
+- [ ] Create example notebooks
+- [ ] Test custom training scripts
+
+**Day 5: Documentation & Testing**
+- [ ] Write training guides
+- [ ] Test end-to-end workflows
+- [ ] Benchmark training speeds
+- [ ] Document best practices
+
+---
+
+### Ongoing: Optimization & Expansion
+
+**Month 2:**
+- Monitor costs and optimize GPU utilization
+- Implement model caching strategies
+- Add more models based on usage patterns
+- Set up automated model updates
+- Implement usage quotas per user
+
+**Month 3+:**
+- Consider multi-GPU setup for larger models
+- Implement model quantization (AWQ/GPTQ)
+- Add video generation (AnimateDiff, CogVideo)
+- Explore voice synthesis (XTTS, Bark)
+- Custom model training for specific use cases
+
+---
+
+## Phase 6: Cost Analysis
+
+### Scenario A: Single RTX 4090 (24/7)
+
+**GPU Server (RunPod):**
+- RTX 4090 pod: $0.50/hour × 720 hours = $360/month
+- 500GB network volume: $50/month
+- **Subtotal: $410/month**
+
+**VPS (Existing):**
+- No change in cost
+
+**Total: ~$410/month**
+
+**Savings:**
+- Claude API costs reduced by ~80% (self-hosted for routine tasks)
+- Break-even if currently spending >$500/month on API calls
+
+---
+
+### Scenario B: Pay-as-you-go (8 hours/day)
+
+**GPU Server (RunPod):**
+- RTX 4090: $0.50/hour × 8 hours × 30 days = $120/month
+- Storage: $50/month
+- **Subtotal: $170/month**
+
+**Best for:**
+- Development/experimentation
+- Burst workloads
+- Image generation on-demand
+
+---
+
+### Scenario C: Dual GPU (Training + Inference)
+
+**GPU Server 1 (Inference):**
+- RTX 4090 24/7: $360/month
+
+**GPU Server 2 (Training - On-demand):**
+- A100 40GB: $1.50/hour × 40 hours/month = $60/month
+- Used only for fine-tuning sessions
+
+**Storage:**
+- 1TB network volume: $100/month
+
+**Total: ~$520/month**
+
+---
+
+### Cost Optimization Tips
+
+1. **Auto-stop idle pods**: RunPod can auto-stop after X minutes idle
+2. **Use spot instances**: ~50% cheaper but can be interrupted
+3. **Quantized models**: 4-bit models use 4x less VRAM → cheaper GPUs
+4. **Batch processing**: Queue image gen jobs to maximize GPU usage
+5. **Model sharing**: One vLLM instance can serve multiple fine-tuned variants of a single base model via LoRA adapters
+6. **Monitoring**: Track per-model costs to optimize routing
+
+---
+
+## Phase 7: Monitoring & Operations
+
+### Metrics to Track
+
+**GPU Utilization:**
+- nvidia-smi metrics (utilization %, memory usage)
+- Temperature and power draw
+- Per-process GPU usage
+
+**Model Performance:**
+- Tokens per second (LLM inference)
+- Images per second (SD/FLUX)
+- Training time per epoch
+
+**Costs:**
+- GPU hours consumed
+- Storage usage
+- API vs self-hosted breakdown
+
+### Monitoring Stack
+
+**Option A: Netdata (Already deployed)**
+
+Add GPU monitoring to existing Netdata:
+
+```yaml
+# On GPU server
+services:
+  netdata:
+    image: netdata/netdata:latest
+    container_name: gpu_netdata
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+    volumes:
+      - /sys:/host/sys:ro
+      - /proc:/host/proc:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    command: |
+      bash -c "
+      # Enable nvidia_smi plugin
+      /usr/libexec/netdata/plugins.d/charts.d.plugin nvidia_smi
+      "
+```
+
+**Option B: Prometheus + Grafana**
+
+For detailed metrics:
+
+```yaml
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+
+  dcgm-exporter:
+    image: nvidia/dcgm-exporter:latest
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+
+  grafana:
+    image: grafana/grafana:latest
+    ports:
+      - "3000:3000"
+    volumes:
+      - grafana_data:/var/lib/grafana
+```
+
+Import Grafana dashboard #12219 for GPU metrics.
+
+---
+
+## Phase 8: Backup & Disaster Recovery
+
+### What to Backup
+
+1. **Models** (250-300GB)
+   - Base models can be re-downloaded
+   - Custom fine-tuned models: CRITICAL
+   - LoRAs: CRITICAL
+
+2. **Training Data** (~10-50GB)
+   - Datasets
+   - Preprocessing scripts
+
+3. **Configurations** (<1GB)
+   - Docker compose files
+   - Training configs
+   - Workflow JSONs
+
+### Backup Strategy
+
+**Tier 1: Critical (Daily)**
+- Fine-tuned models
+- Training checkpoints
+- Custom datasets
+
+**Backup to:**
+- Restic → HiDrive (already configured)
+- Backblaze B2 (~$6/TB/month)
+
+```bash
+# Add to core/compose.yaml backrest config
+- gpu_models:/volumes/gpu_models:ro
+- gpu_checkpoints:/volumes/gpu_checkpoints:ro
+```
+
+**Tier 2: Nice-to-have (Weekly)**
+- Base models (can re-download)
+- ComfyUI outputs
+
+**Tier 3: Ephemeral (No backup)**
+- Inference cache
+- Temporary generations
+
+---
+
+## Phase 9: Security Considerations
+
+### GPU Server Security
+
+1. **Firewall:**
+   - Only allow WireGuard port (51820)
+   - All services accessed via VPN
+   - No public exposure
+
+2. **SSH:**
+   - Key-based auth only
+   - Disable password auth
+   - Change default port
+
+3. **Docker:**
+   - Rootless Docker (optional but recommended)
+   - Limited container capabilities
+   - No privileged containers except for nvidia-runtime
+
+4. **Secrets:**
+   - Store API keys in .env
+   - Use Docker secrets for sensitive data
+   - Rotate keys periodically
+
+### Access Control
+
+- **ComfyUI**: Protected by Authelia SSO (already configured)
+- **vLLM**: Internal only, accessed via LiteLLM proxy
+- **JupyterLab**: Password-protected or Authelia
+- **Training**: No public access, VPN only
+
+---
+
+## Phase 10: Advanced Features (Future)
+
+### Multi-GPU Scaling
+
+**Tensor Parallelism** (vLLM):
+- Split large models across multiple GPUs
+- Example: 70B model on 2x A100 80GB (FP16 weights alone need ~140GB)
+
+```yaml
+command:
+  - --model
+  - meta-llama/Meta-Llama-3.1-70B-Instruct
+  - --tensor-parallel-size
+  - '2'  # Use 2 GPUs
+```
+
+**Pipeline Parallelism** (training):
+- Split model layers across GPUs
+- Useful for very large models
+
+### Model Serving Optimization
+
+**vLLM Features:**
+- Speculative decoding (faster generation)
+- Prefix caching (faster for repeated prompts)
+- Multi-LoRA serving (multiple adapters, one base model)
+
+**Example multi-LoRA:**
+```yaml
+command:
+  - --model
+  - meta-llama/Meta-Llama-3.1-8B-Instruct
+  - --enable-lora
+  - --max-loras
+  - '4'
+  - --lora-modules
+  - customer-support=/models/loras/support-lora
+  - creative-writing=/models/loras/writing-lora
+```
+
+### Video Generation
+
+**AnimateDiff in ComfyUI:**
+- Generate short videos from text prompts
+- Animate static images
+- ~8GB VRAM for 512x512 16-frame videos
+
+**CogVideo:**
+- High-quality video generation
+- Requires A100 or H100
+- 5-second clips at 720p
+
+### Voice Synthesis
+
+**XTTS v2:**
+- High-quality voice cloning
+- Multi-language support
+- ~6GB VRAM
+
+**Bark:**
+- Text-to-speech with emotions
+- Sound effects
+- ~10GB VRAM
+
+---
+
+## Appendix A: Quick Start Commands
+
+### Initial GPU Server Setup
+
+```bash
+# SSH into RunPod instance
+ssh root@gpu.runpod.io -p 12345
+
+# Install Docker
+curl -fsSL https://get.docker.com -o get-docker.sh
+sh get-docker.sh
+
+# Install NVIDIA Container Toolkit
+distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
+curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
+  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+  tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+apt-get update
+apt-get install -y nvidia-container-toolkit
+systemctl restart docker
+
+# Test GPU access
+docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+```
+
+### Deploy vLLM (Quick Test)
+
+```bash
+# Create directory
+mkdir -p /workspace/vllm
+cd /workspace/vllm
+
+# Run vLLM
+docker run -d \
+  --name vllm \
+  --runtime=nvidia \
+  --gpus all \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  -p 8000:8000 \
+  vllm/vllm-openai:latest \
+  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
+  --dtype auto \
+  --max-model-len 8192
+
+# Test inference
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "prompt": "Once upon a time",
+    "max_tokens": 50
+  }'
+```
+
+### Deploy ComfyUI (Quick Test)
+
+```bash
+docker run -d \
+  --name comfyui \
+  --runtime=nvidia \
+  --gpus all \
+  -v /workspace/comfyui:/data \
+  -p 8188:8188 \
+  ghcr.io/ai-dock/comfyui:latest
+
+# Access at http://gpu-ip:8188
+```
+
+---
+
+## Appendix B: Sample Docker Compose (Full GPU Stack)
+
+```yaml
+# gpu-server/compose.yaml
+version: '3.8'
+
+services:
+  # vLLM for LLM inference
+  vllm:
+    image: vllm/vllm-openai:latest
+    container_name: gpu_vllm
+    restart: unless-stopped
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      CUDA_VISIBLE_DEVICES: 0
+    volumes:
+      - vllm_models:/root/.cache/huggingface
+    command:
+      - --model
+      - meta-llama/Meta-Llama-3.1-8B-Instruct
+      - --host
+      - 0.0.0.0
+      - --port
+      - '8000'
+      - --gpu-memory-utilization
+      - '0.9'
+    ports:
+      - "8000:8000"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+  # ComfyUI for image generation
+  comfyui:
+    image: ghcr.io/ai-dock/comfyui:latest
+    container_name: gpu_comfyui
+    restart: unless-stopped
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+    volumes:
+      - comfyui_data:/data
+      - comfyui_models:/opt/ComfyUI/models
+      - comfyui_output:/opt/ComfyUI/output
+    ports:
+      - "8188:8188"
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+  # Axolotl for model training
+  axolotl:
+    image: winglian/axolotl:main-py3.11-cu121-2.2.2
+    container_name: gpu_training
+    runtime: nvidia
+    volumes:
+      - ./training/configs:/workspace/configs
+      - ./training/data:/workspace/data
+      - ./training/output:/workspace/output
+      - training_cache:/root/.cache
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    # Only start when training
+    profiles:
+      - training
+
+  # JupyterLab for research
+  jupyter:
+    image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
+    container_name: gpu_jupyter
+    restart: unless-stopped
+    runtime: nvidia
+    volumes:
+      - ./notebooks:/workspace
+      - jupyter_cache:/root/.cache
+    ports:
+      - "8888:8888"
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      JUPYTER_ENABLE_LAB: "yes"
+    command: |
+      bash -c "
+      pip install jupyterlab transformers datasets accelerate bitsandbytes peft &&
+      jupyter lab --ip=0.0.0.0 --allow-root --no-browser
+      "
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+  # Netdata monitoring
+  netdata:
+    image: netdata/netdata:latest
+    container_name: gpu_netdata
+    restart: unless-stopped
+    runtime: nvidia
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+    volumes:
+      - /sys:/host/sys:ro
+      - /proc:/host/proc:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    ports:
+      - "19999:19999"
+    cap_add:
+      - SYS_PTRACE
+      - SYS_ADMIN
+    security_opt:
+      - apparmor:unconfined
+
+volumes:
+  vllm_models:
+  comfyui_data:
+  comfyui_models:
+  comfyui_output:
+  training_cache:
+  jupyter_cache:
+```
+
+---
+
+## Appendix C: Cost Calculator
+
+**Monthly GPU Costs:**
+
+| GPU Model | VRAM | $/hour | 24/7 Month | 8hr/day | Use Case |
+|-----------|------|--------|------------|---------|----------|
+| RTX 3090 | 24GB | $0.35 | $252 | $84 | Development, small models |
+| RTX 4090 | 24GB | $0.50 | $360 | $120 | Production inference, SD |
+| A6000 | 48GB | $0.80 | $576 | $192 | Large models, training |
+| A100 40GB | 40GB | $1.50 | $1,080 | $360 | Enterprise, training |
+| A100 80GB | 80GB | $2.50 | $1,800 | $600 | Massive models, research |
+
+**Storage Costs:**
+- Network volume: $0.10/GB/month
+- 500GB = $50/month
+- 1TB = $100/month
+
+**Total Estimated Monthly:**
+- RTX 4090 + 500GB storage = $410/month (24/7)
+- RTX 4090 + 500GB storage = $170/month (8hr/day)
+
+**Break-even Analysis:**
+- If spending >$500/month on API calls → GPU server saves money
+- If spending <$200/month → stick with APIs
+
+---
+
+## Appendix D: Model Recommendations by Use Case
+
+### General Chat (24/7 Inference)
+**Best:** Qwen 2.5 14B Instruct
+- Excellent multilingual support
+- Fast inference
+- Good reasoning
+
+**Alternative:** Mistral 7B Instruct v0.3
+- Fastest inference
+- Lower VRAM
+
+### Code Generation
+**Best:** Qwen 2.5 Coder 14B
+- SOTA coding performance
+- Multi-language support
+
+**Alternative:** DeepSeek Coder 6.7B
+- Faster, lighter
+
+### Creative Writing
+**Best:** Nous Hermes 2 Mixtral 8x7B (quantized)
+- Creative, engaging
+- Follows instructions well
+
+### Image Generation (Realistic)
+**Best:** FLUX.1-dev
+- Highest quality
+- Best prompt following
+
+**Alternative:** SDXL + RealVisXL LoRA
+- Faster generation
+- Good quality
+
+### Image Generation (Anime)
+**Best:** SDXL + AnimagineXL LoRA
+- Anime-specific training
+- Vibrant colors
+
+### Video Generation
+**Best:** AnimateDiff + SDXL
+- 16-frame clips
+- Good quality
+
+**Needs:** A100 40GB or better
+
+---
+
+## Next Steps
+
+1. **Review this plan** and provide feedback
+2. **Set budget** for GPU infrastructure
+3. **Choose provider** (recommend RunPod)
+4. **Define priority services** (LLM hosting first? Image gen first?)
+5. **Schedule implementation** (4-week timeline above)
+
+Would you like me to:
+- Create the detailed Docker Compose configurations?
+- Set up a cost estimation spreadsheet?
+- Research specific models for your use cases?
+- Begin implementation with Phase 1?
+
+Let me know how you'd like to proceed! 🚀
diff --git a/ai/README_GPU_SETUP.md b/ai/README_GPU_SETUP.md
new file mode 100644
index 0000000..34974f0
--- /dev/null
+++ b/ai/README_GPU_SETUP.md
@@ -0,0 +1,444 @@
+# GPU-Enhanced AI Stack - Implementation Guide
+
+Welcome to your GPU expansion setup! This directory contains everything you need to deploy a production-ready GPU server for LLM hosting, image generation, and model training.
+
+## 📚 Documentation Files
+
+### Planning & Architecture
+- **`GPU_EXPANSION_PLAN.md`** - Complete 70-page plan with provider comparison, architecture, and roadmap
+- **`README_GPU_SETUP.md`** - This file
+
+### Step-by-Step Setup Guides
+1. **`SETUP_GUIDE.md`** - Day 1-2: RunPod account & GPU server deployment
+2. **`TAILSCALE_SETUP.md`** - Day 3-4: VPN connection between VPS and GPU server (alternative: `WIREGUARD_SETUP.md`)
+3. **`DOCKER_GPU_SETUP.md`** - Day 5: Docker + NVIDIA Container Toolkit configuration
+
+### Configuration Files
+- **`gpu-server-compose.yaml`** - Production Docker Compose for GPU server
+- **`litellm-config-gpu.yaml`** - Updated LiteLLM config with self-hosted models
+- **`deploy-gpu-stack.sh`** - Automated deployment script
+
+---
+
+## 🚀 Quick Start (Week 1 Checklist)
+
+### Day 1-2: RunPod & GPU Server ✓
+- [ ] Create RunPod account at https://www.runpod.io/
+- [ ] Add billing method ($50 initial credit recommended)
+- [ ] Deploy RTX 4090 pod with PyTorch template
+- [ ] Configure 500GB network volume
+- [ ] Verify SSH access
+- [ ] Test GPU with `nvidia-smi`
+- [ ] **Guide:** `SETUP_GUIDE.md`
+
+### Day 3-4: Network Configuration ✓
+- [ ] Install Tailscale on VPS
+- [ ] Install Tailscale on GPU server
+- [ ] Authenticate both devices
+- [ ] Test VPN connectivity
+- [ ] Configure firewall rules
+- [ ] Verify VPS can reach GPU server
+- [ ] **Guide:** `TAILSCALE_SETUP.md`
+
+### Day 5: Docker & GPU Setup ✓
+- [ ] Install Docker on GPU server
+- [ ] Install NVIDIA Container Toolkit
+- [ ] Test GPU access in containers
+- [ ] Create /workspace/gpu-stack directory
+- [ ] Copy configuration files
+- [ ] **Guide:** `DOCKER_GPU_SETUP.md`
+
+### Day 6-7: Deploy Services ✓
+- [ ] Copy `gpu-server-compose.yaml` to GPU server
+- [ ] Edit `.env` with your settings
+- [ ] Run `./deploy-gpu-stack.sh`
+- [ ] Wait for vLLM to load model (~5 minutes)
+- [ ] Test vLLM: `curl http://localhost:8000/v1/models`
+- [ ] Access ComfyUI: `http://[tailscale-ip]:8188`
+- [ ] **Script:** `deploy-gpu-stack.sh`
+
+---
+
+## 📦 Services Included
+
+### vLLM (http://[tailscale-ip]:8000)
+**Purpose:** High-performance LLM inference
+**Default Model:** Llama 3.1 8B Instruct
+**Performance:** 50-80 tokens/second on RTX 4090
+**Use for:** General chat, Q&A, code generation, summarization
+
+**Switch models:**
+Edit `gpu-server-compose.yaml`, change `--model` parameter, restart:
+```bash
+docker compose restart vllm
+```
+
+### ComfyUI (http://[tailscale-ip]:8188)
+**Purpose:** Advanced Stable Diffusion interface
+**Features:** FLUX, SDXL, ControlNet, LoRA
+**Use for:** Image generation, img2img, inpainting
+
+**Download models:**
+Access web UI → ComfyUI Manager → Install Models
+
+### JupyterLab (http://[tailscale-ip]:8888)
+**Purpose:** Interactive development environment
+**Token:** `pivoine-ai-2025` (change in `.env`)
+**Use for:** Research, experimentation, custom training scripts
+
+### Axolotl (Training - on-demand)
+**Purpose:** LLM fine-tuning framework
+**Start:** `docker compose --profile training up -d axolotl`
+**Use for:** LoRA training, full fine-tuning, RLHF
+
+### Netdata (http://[tailscale-ip]:19999)
+**Purpose:** System & GPU monitoring
+**Features:** Real-time metrics, GPU utilization, memory usage
+**Use for:** Performance monitoring, troubleshooting
+
+---
+
+## 🔧 Configuration
+
+### Environment Variables (.env)
+
+```bash
+# VPN Network (Tailscale)
+VPS_IP=100.x.x.x         # Your VPS Tailscale IP (get with: tailscale ip -4)
+GPU_IP=100.x.x.x         # GPU server Tailscale IP (get with: tailscale ip -4)
+
+# Model Storage
+MODELS_PATH=/workspace/models
+
+# Hugging Face Token (for gated models like Llama)
+HF_TOKEN=hf_xxxxxxxxxxxxx
+
+# Weights & Biases (for training logging)
+WANDB_API_KEY=
+
+# JupyterLab Access
+JUPYTER_TOKEN=pivoine-ai-2025
+
+# PostgreSQL (on VPS)
+DB_HOST=100.x.x.x        # Your VPS Tailscale IP
+DB_PORT=5432
+DB_USER=valknar
+DB_PASSWORD=change-me    # Placeholder: set a strong password; never commit real credentials
+DB_NAME=openwebui
+```
+
+### Updating LiteLLM on VPS
+
+After GPU server is running, update your VPS LiteLLM config:
+
+```bash
+# On VPS
+cd ~/Projects/docker-compose/ai
+
+# Backup current config
+cp litellm-config.yaml litellm-config.yaml.backup
+
+# Copy new config with GPU models
+cp litellm-config-gpu.yaml litellm-config.yaml
+
+# Restart LiteLLM
+arty restart litellm
+```
+
+Now Open WebUI will have access to both Claude (API) and Llama (self-hosted)!
+
+---
+
+## 💰 Cost Management
+
+### Current Costs (24/7 Operation)
+- **GPU Server:** RTX 4090 @ $0.50/hour = $360/month
+- **Storage:** 500GB network volume = $50/month
+- **Total:** **$410/month**
+
+### Cost-Saving Options
+
+**1. Pay-as-you-go (8 hours/day)**
+- GPU: $0.50 × 8 × 30 = $120/month
+- Storage: $50/month
+- **Total: $170/month**
+
+**2. Auto-stop idle pods**
+RunPod can auto-stop after X minutes idle:
+- Dashboard → Pod Settings → Auto-stop after 30 minutes
+
+**3. Use smaller models**
+- Mistral 7B instead of Llama 8B: Faster, cheaper GPU
+- Quantized models: 4-bit = 1/4 the VRAM
+
+**4. Batch image generation**
+- Generate multiple images at once
+- Use scheduled jobs (cron) during off-peak hours
+
+### Cost Tracking
+
+**Check GPU usage:**
+```bash
+# On RunPod dashboard
+Billing → Usage History
+
+# See hourly costs, total spent
+```
+
+**Check API vs GPU savings:**
+```bash
+# On VPS, check LiteLLM logs
+docker logs ai_litellm | grep "model="
+
+# Count requests to llama-3.1-8b vs claude-*
+```
+
+**Expected savings:**
+- 80% of requests → self-hosted = no per-request API cost (covered by the fixed GPU rental)
+- 20% of requests → Claude = API cost
+- Break-even if currently spending >$500/month on APIs
+
+---
+
+## 🔍 Monitoring & Troubleshooting
+
+### Check Service Status
+
+```bash
+# On GPU server
+cd /workspace/gpu-stack
+
+# View all services
+docker compose ps
+
+# Check specific service logs
+docker compose logs -f vllm
+docker compose logs -f comfyui
+docker compose logs -f jupyter
+
+# Check GPU usage
+nvidia-smi
+# or prettier:
+nvtop
+```
+
+### Common Issues
+
+**vLLM not loading model:**
+```bash
+# Check logs
+docker compose logs vllm
+
+# Common causes:
+# - Model download in progress (wait 5-10 minutes)
+# - Out of VRAM (try smaller model)
+# - Missing HF_TOKEN (for gated models like Llama)
+```
+
+**ComfyUI slow/crashing:**
+```bash
+# Check GPU memory
+nvidia-smi
+
+# If VRAM full:
+# - Close vLLM temporarily
+# - Use smaller models
+# - Reduce batch size in ComfyUI
+```
+
+**Can't access from VPS:**
+```bash
+# Test VPN
+ping [tailscale-ip]
+
+# If fails:
+# - Check Tailscale status: tailscale status
+# - Restart Tailscale: tailscale down && tailscale up
+# - Check firewall: ufw status
+```
+
+**Docker can't see GPU:**
+```bash
+# Test GPU access
+docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+
+# If fails:
+# - Check NVIDIA driver: nvidia-smi
+# - Check nvidia-docker: nvidia-ctk --version
+# - Restart Docker: systemctl restart docker
+```
+
+---
+
+## 📊 Performance Benchmarks
+
+### Expected Performance (RTX 4090)
+
+**LLM Inference (vLLM):**
+- Llama 3.1 8B: 50-80 tokens/second
+- Qwen 2.5 14B: 30-50 tokens/second
+- Batch size 32: ~1500 tokens/second
+
+**Image Generation (ComfyUI):**
+- SDXL (1024×1024): ~4-6 seconds
+- FLUX (1024×1024): ~8-12 seconds
+- SD 1.5 (512×512): ~1-2 seconds
+
+**Training (Axolotl):**
+- LoRA fine-tuning (8B model): ~3-5 hours for 3 epochs
+- Full fine-tuning: Not recommended on 24GB VRAM
+
+---
+
+## 🔐 Security Best Practices
+
+### Network Security
+✅ All services behind Tailscale VPN (end-to-end encrypted)
+✅ No public exposure (except RunPod's SSH)
+✅ Firewall configured (no additional ports needed)
+
+### Access Control
+✅ JupyterLab password-protected
+✅ ComfyUI accessible via VPN only
+✅ vLLM internal API (no auth needed)
+
+### SSH Security
+```bash
+# On GPU server, harden SSH
+nano /etc/ssh/sshd_config
+
+# Set:
+PermitRootLogin prohibit-password
+PasswordAuthentication no
+PubkeyAuthentication yes
+
+systemctl restart sshd
+```
+
+### Regular Updates
+```bash
+# Weekly updates
+apt update && apt upgrade -y
+
+# Update Docker images
+docker compose pull
+docker compose up -d
+```
+
+---
+
+## 📈 Scaling Up
+
+### When to Add More GPUs
+
+**Current limitations (1× RTX 4090):**
+- Can run ONE of these at a time:
+  - 8B LLM at full speed
+  - 14B LLM at moderate speed
+  - SDXL image generation
+  - Training job
+
+**Add 2nd GPU if:**
+- You want LLM + image gen simultaneously
+- Training + inference at same time
+- Multiple users with high demand
+
+**Multi-GPU options:**
+- 2× RTX 4090: Run vLLM + ComfyUI separately ($720/month)
+- 1× A100 40GB: Larger models (70B with quantization) ($1,080/month)
+- Mix: RTX 4090 (inference) + A100 40GB (training) (~$1,440/month if both run 24/7; less with an on-demand A100)
+
+### Deploying Larger Models
+
+**70B models (need 2× A100 or 4× RTX 4090):**
+```yaml
+# In gpu-server-compose.yaml
+vllm:
+  command:
+    - --model
+    - meta-llama/Meta-Llama-3.1-70B-Instruct
+    - --tensor-parallel-size
+    - "2"  # Split across 2 GPUs
+  deploy:
+    resources:
+      reservations:
+        devices:
+          - driver: nvidia
+            count: 2  # Use 2 GPUs
+            capabilities: [gpu]
+```
+
+---
+
+## 🎯 Next Steps (Week 2+)
+
+### Week 2: LLM Production Deployment
+- [ ] Test Llama 3.1 8B performance
+- [ ] Download additional models (Qwen, Mistral)
+- [ ] Configure model routing in LiteLLM
+- [ ] Set up usage monitoring
+- [ ] Benchmark tokens/second for each model
+
+### Week 3: Image Generation
+- [ ] Download FLUX and SDXL models
+- [ ] Install ComfyUI Manager
+- [ ] Download ControlNet models
+- [ ] Create sample workflows
+- [ ] Test API integration with Open WebUI
+
+### Week 4: Training Infrastructure
+- [ ] Prepare a sample dataset
+- [ ] Test LoRA fine-tuning with Axolotl
+- [ ] Set up Weights & Biases logging
+- [ ] Create training documentation
+- [ ] Benchmark training speed
+
+---
+
+## 🆘 Getting Help
+
+### Resources
+- **RunPod Docs:** https://docs.runpod.io/
+- **vLLM Docs:** https://docs.vllm.ai/
+- **ComfyUI Wiki:** https://github.com/comfyanonymous/ComfyUI/wiki
+- **Axolotl Docs:** https://github.com/OpenAccess-AI-Collective/axolotl
+
+### Community
+- **RunPod Discord:** https://discord.gg/runpod
+- **vLLM Discord:** https://discord.gg/vllm
+- **r/LocalLLaMA:** https://reddit.com/r/LocalLLaMA
+
+### Support
+If you encounter issues:
+1. Check logs: `docker compose logs -f [service]`
+2. Check GPU: `nvidia-smi`
+3. Check VPN: `tailscale status`
+4. Restart service: `docker compose restart [service]`
+5. Full restart: `docker compose down && docker compose up -d`
+
+---
+
+## ✅ Success Criteria
+
+You're ready to proceed when:
+- [ ] GPU server responds to `ping [tailscale-ip]` from VPS
+- [ ] vLLM returns models: `curl http://[tailscale-ip]:8000/v1/models`
+- [ ] ComfyUI web interface loads: `http://[tailscale-ip]:8188`
+- [ ] JupyterLab accessible with token
+- [ ] Netdata shows GPU metrics
+- [ ] Open WebUI shows both Claude and Llama models
+
+**Total setup time:** 4-6 hours (if following guides sequentially)
+
+---
+
+## 🎉 You're All Set!
+
+Your GPU-enhanced AI stack is ready. You now have:
+- ✅ Self-hosted LLM inference (saves $$$)
+- ✅ Advanced image generation (FLUX, SDXL)
+- ✅ Model training capabilities (LoRA, fine-tuning)
+- ✅ Secure VPN connection
+- ✅ Full monitoring and logging
+
+Enjoy building with your new AI infrastructure! 🚀
diff --git a/ai/SETUP_GUIDE.md b/ai/SETUP_GUIDE.md
new file mode 100644
index 0000000..1d14145
--- /dev/null
+++ b/ai/SETUP_GUIDE.md
@@ -0,0 +1,261 @@
+# GPU Server Setup Guide - Week 1
+
+## Day 1-2: RunPod Account & GPU Server
+
+### Step 1: Create RunPod Account
+
+1. **Go to RunPod**: https://www.runpod.io/
+2. **Sign up** with email or GitHub
+3. **Add billing method**:
+   - Credit card required
+   - No charges until you deploy a pod
+   - Recommended: Add $50 initial credit
+
+4. **Verify email** and complete account setup
+
+### Step 2: Deploy Your First GPU Pod
+
+#### 2.1 Navigate to Pods
+
+1. Click **"Deploy"** in top menu
+2. Select **"GPU Pods"**
+
+#### 2.2 Choose GPU Type
+
+**Recommended: RTX 4090**
+- 24GB VRAM
+- ~$0.50/hour
+- Perfect for LLMs up to 14B params
+- Great for SDXL/FLUX
+
+**Filter options:**
+- GPU Type: RTX 4090
+- GPU Count: 1
+- Sort by: Price (lowest first)
+- Region: Europe (lower latency to Germany)
+
+#### 2.3 Select Template
+
+Choose: **"RunPod PyTorch"** template
+- Includes: CUDA, PyTorch, Python
+- Pre-configured for GPU workloads
+- Docker pre-installed
+
+**Alternative**: "Ubuntu 22.04 with CUDA 12.1" (more control)
+
+#### 2.4 Configure Pod
+
+**Container Settings:**
+- **Container Disk**: 50GB (temporary, auto-included)
+- **Expose Ports**:
+  - Add: 22 (SSH)
+  - Add: 8000 (vLLM)
+  - Add: 8188 (ComfyUI)
+  - Add: 8888 (JupyterLab)
+
+**Volume Settings:**
+- Click **"+ Network Volume"**
+- **Name**: `gpu-models-storage`
+- **Size**: 500GB
+- **Region**: Same as pod
+- **Cost**: ~$50/month
+
+**Environment Variables:**
+- Add later (not needed for initial setup)
+
+#### 2.5 Deploy Pod
+
+1. Review configuration
+2. Click **"Deploy On-Demand"** (not Spot for reliability)
+3. Wait 2-3 minutes for deployment
+
+**Expected cost:**
+- GPU: $0.50/hour = $360/month (24/7)
+- Storage: $50/month
+- **Total: $410/month**
+
+### Step 3: Access Your GPU Server
+
+#### 3.1 Get Connection Info
+
+Once deployed, you'll see:
+- **Pod ID**: e.g., `abc123def456`
+- **SSH Command**: `ssh root@<pod-id>.runpod.io -p 12345`
+- **Public IP**: May not be directly accessible (use SSH)
+
+#### 3.2 SSH Access
+
+RunPod automatically generates SSH keys for you:
+
+```bash
+# Copy the SSH command from RunPod dashboard
+ssh root@abc123def456.runpod.io -p 12345
+
+# First time: Accept fingerprint
+# You should now be in the GPU server!
+```
+
+**Verify GPU:**
+```bash
+nvidia-smi
+```
+
+Expected output:
+```
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 535.xx       Driver Version: 535.xx       CUDA Version: 12.1    |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|===============================+======================+======================|
+|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
+| 30%   45C    P0    50W / 450W |      0MiB / 24564MiB |      0%      Default |
++-------------------------------+----------------------+----------------------+
+```
+
+### Step 4: Initial Server Configuration
+
+#### 4.1 Update System
+
+```bash
+# Update package lists
+apt update
+
+# Upgrade existing packages
+apt upgrade -y
+
+# Install essential tools
+apt install -y \
+  vim \
+  htop \
+  tmux \
+  curl \
+  wget \
+  git \
+  net-tools \
+  iptables-persistent
+```
+
+#### 4.2 Set Timezone
+
+```bash
+timedatectl set-timezone Europe/Berlin
+date  # Verify
+```
+
+#### 4.3 Create Working Directory
+
+```bash
+# Create workspace
+mkdir -p /workspace/{models,configs,data,scripts}
+
+# Check network volume mount
+ls -la /workspace
+# Should show your 500GB volume
+```
+
+#### 4.4 Configure SSH (Optional but Recommended)
+
+**Generate your own SSH key on your local machine:**
+
+```bash
+# On your local machine (not GPU server)
+ssh-keygen -t ed25519 -C "gpu-server-pivoine" -f ~/.ssh/gpu_pivoine
+
+# Copy public key to GPU server
+ssh-copy-id -i ~/.ssh/gpu_pivoine.pub -p 12345 root@abc123def456.runpod.io
+```
+
+**Add to your local ~/.ssh/config:**
+
+```bash
+Host gpu-pivoine
+    HostName abc123def456.runpod.io
+    Port 12345
+    User root
+    IdentityFile ~/.ssh/gpu_pivoine
+```
+
+Now you can connect with: `ssh gpu-pivoine`
+
+### Step 5: Verify GPU Access
+
+Run this test:
+
+```bash
+# Test CUDA
+python3 -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('GPU count:', torch.cuda.device_count())"
+```
+
+Expected output:
+```
+CUDA available: True
+GPU count: 1
+```
+
+### Troubleshooting
+
+**Problem: Can't connect via SSH**
+- Check pod is running (not stopped)
+- Verify port number in SSH command
+- Try web terminal in RunPod dashboard
+
+**Problem: GPU not detected**
+- Run `nvidia-smi`
+- Check RunPod selected correct GPU type
+- Restart pod if needed
+
+**Problem: Network volume not mounted**
+- Check RunPod dashboard → Volume tab
+- Verify volume is attached to pod
+- Try: `df -h` to see mounts
+
+### Next Steps
+
+Once SSH access works and GPU is verified:
+✅ Proceed to **Day 3-4: Network Configuration (Tailscale VPN)**
+
+### Save Important Info
+
+Create a file to track your setup:
+
+```bash
+# On GPU server
+cat > /workspace/SERVER_INFO.md << 'EOF'
+# GPU Server Information
+
+## Connection
+- SSH: ssh root@abc123def456.runpod.io -p 12345
+- Pod ID: abc123def456
+- Region: [YOUR_REGION]
+
+## Hardware
+- GPU: RTX 4090 24GB
+- CPU: [Check with: lscpu]
+- RAM: [Check with: free -h]
+- Storage: 500GB network volume at /workspace
+
+## Costs
+- GPU: $0.50/hour
+- Storage: $50/month
+- Total: ~$410/month (24/7)
+
+## Deployed: [DATE]
+EOF
+```
+
+---
+
+## Checkpoint ✓
+
+Before moving to Day 3, verify:
+- [ ] RunPod account created and billing added
+- [ ] RTX 4090 pod deployed successfully
+- [ ] 500GB network volume attached
+- [ ] SSH access working
+- [ ] `nvidia-smi` shows GPU
+- [ ] `torch.cuda.is_available()` returns True
+- [ ] Timezone set to Europe/Berlin
+- [ ] Essential tools installed
+
+**Ready for Tailscale setup? Let's go!**
diff --git a/ai/TAILSCALE_SETUP.md b/ai/TAILSCALE_SETUP.md
new file mode 100644
index 0000000..9950469
--- /dev/null
+++ b/ai/TAILSCALE_SETUP.md
@@ -0,0 +1,417 @@
+# Tailscale VPN Setup - Better Alternative to WireGuard
+
+## Why Tailscale?
+
+RunPod doesn't support UDP ports, which blocks WireGuard. Tailscale solves this by:
+- ✅ Works over HTTPS (TCP) - no UDP needed
+- ✅ Zero configuration - automatic setup
+- ✅ Free for personal use
+- ✅ Built on WireGuard (same security)
+- ✅ Automatic NAT traversal
+- ✅ Peer-to-peer when possible (low latency)
+
+---
+
+## Step 1: Create Tailscale Account
+
+1. Go to: https://tailscale.com/
+2. Click **"Get Started"**
+3. Sign up with **GitHub** or **Google** (easiest)
+4. You'll be redirected to the Tailscale admin console
+
+**No credit card required!** Free tier is perfect for our use case.
+
+---
+
+## Step 2: Install Tailscale on VPS
+
+**SSH into your VPS:**
+
+```bash
+ssh root@vps
+```
+
+**Install Tailscale:**
+
+```bash
+# Download and run install script
+curl -fsSL https://tailscale.com/install.sh | sh
+
+# Start Tailscale
+tailscale up
+
+# You'll see a URL like:
+# https://login.tailscale.com/a/xxxxxxxxxx
+```
+
+**Authenticate:**
+1. Copy the URL and open in browser
+2. Click **"Connect"** to authorize the device
+3. Name it: `pivoine-vps`
+
+**Check status:**
+```bash
+tailscale status
+```
+
+You should see your VPS listed with an IP like `100.x.x.x`
+
+**Save your VPS Tailscale IP:**
+```bash
+tailscale ip -4
+# Example output: 100.101.102.103
+```
+
+**Write this down - you'll need it!**
+
+---
+
+## Step 3: Install Tailscale on GPU Server
+
+**SSH into your RunPod GPU server:**
+
+```bash
+ssh root@abc123def456-12345678.runpod.io -p 12345
+```
+
+**Install Tailscale:**
+
+```bash
+# Download and run install script
+curl -fsSL https://tailscale.com/install.sh | sh
+
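+# Start Tailscale (tag:gpu must already be declared under tagOwners in your tailnet policy file; otherwise omit --advertise-tags)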
+tailscale up --advertise-tags=tag:gpu
+
+# You'll see another URL
+```
+
+**Authenticate:**
+1. Copy the URL and open in browser
+2. Click **"Connect"**
+3. Name it: `gpu-runpod`
+
+**Check status:**
+```bash
+tailscale status
+```
+
+You should now see BOTH devices:
+- `pivoine-vps` - 100.x.x.x
+- `gpu-runpod` - 100.x.x.x
+
+**Save your GPU server Tailscale IP:**
+```bash
+tailscale ip -4
+# Example output: 100.104.105.106
+```
+
+---
+
+## Step 4: Test Connectivity
+
+**From VPS, ping GPU server:**
+
+```bash
+# SSH into VPS
+ssh root@vps
+
+# Ping GPU server (use its Tailscale IP)
+ping 100.104.105.106 -c 4
+```
+
+Expected output:
+```
+PING 100.104.105.106 (100.104.105.106) 56(84) bytes of data.
+64 bytes from 100.104.105.106: icmp_seq=1 ttl=64 time=15.3 ms
+64 bytes from 100.104.105.106: icmp_seq=2 ttl=64 time=14.8 ms
+...
+```
+
+**From GPU server, ping VPS:**
+
+```bash
+# SSH into GPU server
+ssh root@abc123def456-12345678.runpod.io -p 12345
+
+# Ping VPS (use its Tailscale IP)
+ping 100.101.102.103 -c 4
+```
+
+**Both should work!** ✅
+
+---
+
+## Step 5: Update Configuration Files
+
+Now update the IP addresses in your configs to use Tailscale IPs.
+
+### On GPU Server (.env file)
+
+**Edit your .env file:**
+
+```bash
+# On GPU server
+cd /workspace/gpu-stack
+
+nano .env
+```
+
+**Update these lines:**
+```bash
+# VPN Network (use your actual Tailscale IPs)
+VPS_IP=100.101.102.103      # Your VPS Tailscale IP
+GPU_IP=100.104.105.106      # Your GPU Tailscale IP
+
+# PostgreSQL (on VPS)
+DB_HOST=100.101.102.103     # Your VPS Tailscale IP
+DB_PORT=5432
+```
+
+Save and exit (Ctrl+X, Y, Enter)
+
+### On VPS (LiteLLM config)
+
+**Edit your LiteLLM config:**
+
+```bash
+# On VPS
+ssh root@vps
+cd ~/Projects/docker-compose/ai
+
+nano litellm-config-gpu.yaml
+```
+
+**Update the GPU server IP:**
+
+```yaml
+# Find this section and update IP:
+  - model_name: llama-3.1-8b
+    litellm_params:
+      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
+      api_base: http://100.104.105.106:8000/v1  # Use GPU Tailscale IP
+      api_key: dummy
+```
+
+Save and exit.
+
+---
+
+## Step 6: Verify PostgreSQL Access
+
+**From GPU server, test database connection:**
+
+```bash
+# Install PostgreSQL client
+apt install -y postgresql-client
+
+# Test connection (use your VPS Tailscale IP)
+psql -h 100.101.102.103 -U valknar -d openwebui -c "SELECT 1;"
+```
+
+**If this fails, allow Tailscale network on VPS PostgreSQL:**
+
+```bash
+# On VPS
+ssh root@vps
+
+# Check if postgres allows Tailscale network
+docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100
+
+# If not present, add it:
+docker exec -it core_postgres bash
+
+# Inside container:
+echo "host    all             all             100.64.0.0/10           scram-sha-256" >> /var/lib/postgresql/data/pg_hba.conf
+
+# Restart postgres
+exit
+docker restart core_postgres
+```
+
+Try connecting again - should work now!
+
+---
+
+## Tailscale Management
+
+### View Connected Devices
+
+**Web dashboard:**
+https://login.tailscale.com/admin/machines
+
+You'll see all your devices with their Tailscale IPs.
+
+**Command line:**
+```bash
+tailscale status
+```
+
+### Disconnect/Reconnect
+
+```bash
+# Stop Tailscale
+tailscale down
+
+# Start Tailscale
+tailscale up
+```
+
+### Remove Device
+
+From web dashboard:
+1. Click on device
+2. Click "..." menu
+3. Select "Disable" or "Delete"
+
+---
+
+## Advantages Over WireGuard
+
+✅ **Works anywhere** - No UDP ports needed
+✅ **Auto-reconnect** - Survives network changes
+✅ **Multiple devices** - Easy to add laptop, phone, etc.
+✅ **NAT traversal** - Direct peer-to-peer when possible
+✅ **Access Control** - Manage from web dashboard
+✅ **Monitoring** - See connection status in real-time
+
+---
+
+## Security Notes
+
+🔒 **Tailscale is secure:**
+- End-to-end encrypted (WireGuard)
+- Zero-trust architecture
+- No Tailscale servers can see your traffic
+- Only authenticated devices can connect
+
+🔒 **Access control:**
+- Only devices you authorize can join
+- Revoke access anytime from dashboard
+- Set ACLs for fine-grained control
+
+---
+
+## Network Reference (Updated)
+
+**Old (WireGuard):**
+- VPS: `10.8.0.1`
+- GPU: `10.8.0.2`
+
+**New (Tailscale):**
+- VPS: `100.101.102.103` (example - use your actual IP)
+- GPU: `100.104.105.106` (example - use your actual IP)
+
+**All services now accessible via Tailscale:**
+
+**From VPS to GPU:**
+- vLLM: `http://100.104.105.106:8000`
+- ComfyUI: `http://100.104.105.106:8188`
+- JupyterLab: `http://100.104.105.106:8888`
+- Netdata: `http://100.104.105.106:19999`
+
+**From GPU to VPS:**
+- PostgreSQL: `100.101.102.103:5432`
+- Redis: `100.101.102.103:6379`
+- LiteLLM: `http://100.101.102.103:4000`
+
+---
+
+## Troubleshooting
+
+### Can't ping between devices
+
+**Check Tailscale status:**
+```bash
+tailscale status
+```
+
+Both devices should show "active" or "online".
+
+**Check connectivity:**
+```bash
+tailscale ping 100.104.105.106
+```
+
+**Restart Tailscale:**
+```bash
+tailscale down && tailscale up
+```
+
+### PostgreSQL connection refused
+
+**Check if postgres is listening on all interfaces:**
+```bash
+# On VPS
+docker exec core_postgres cat /var/lib/postgresql/data/postgresql.conf | grep listen_addresses
+```
+
+Should show: `listen_addresses = '*'`
+
+**Check pg_hba.conf allows Tailscale network:**
+```bash
+docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100
+```
+
+Should have line:
+```
+host    all             all             100.64.0.0/10           scram-sha-256
+```
+
+### Device not showing in network
+
+**Re-authenticate:**
+```bash
+tailscale logout
+tailscale up
+# Click the new URL to re-authenticate
+```
+
+---
+
+## Verification Checklist
+
+Before proceeding:
+- [ ] Tailscale account created
+- [ ] Tailscale installed on VPS
+- [ ] Tailscale installed on GPU server
+- [ ] Both devices visible in `tailscale status`
+- [ ] VPS can ping GPU server (via Tailscale IP)
+- [ ] GPU server can ping VPS (via Tailscale IP)
+- [ ] PostgreSQL accessible from GPU server
+- [ ] .env file updated with Tailscale IPs
+- [ ] LiteLLM config updated with GPU Tailscale IP
+
+---
+
+## Next Steps
+
+✅ **Network configured!** Proceed to Docker & GPU setup:
+
+```bash
+cat /home/valknar/Projects/docker-compose/ai/DOCKER_GPU_SETUP.md
+```
+
+**Your Tailscale IPs (save these!):**
+- VPS: `__________________` (from `tailscale ip -4` on VPS)
+- GPU: `__________________` (from `tailscale ip -4` on GPU server)
+
+---
+
+## Bonus: Add Your Local Machine
+
+Want to access GPU server from your laptop?
+
+```bash
+# On your local machine
+curl -fsSL https://tailscale.com/install.sh | sh
+tailscale up
+
+# Now you can SSH directly via Tailscale:
+ssh root@100.104.105.106
+
+# Or access ComfyUI in browser:
+# http://100.104.105.106:8188
+```
+
+No more port forwarding needed! 🎉
diff --git a/ai/WIREGUARD_SETUP.md b/ai/WIREGUARD_SETUP.md
new file mode 100644
index 0000000..0f274fa
--- /dev/null
+++ b/ai/WIREGUARD_SETUP.md
@@ -0,0 +1,393 @@
+# WireGuard VPN Setup - Connecting GPU Server to VPS
+
+## Day 3-4: Network Configuration
+
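+This guide connects your RunPod GPU server to your VPS via WireGuard VPN, enabling secure, low-latency communication. WireGuard requires UDP; if your RunPod pod cannot expose UDP ports, use the Tailscale guide (`TAILSCALE_SETUP.md`) instead.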
+
+### Architecture
+
+```
+┌─────────────────────────────┐         ┌──────────────────────────────┐
+│ VPS (pivoine.art)           │         │ GPU Server (RunPod)          │
+│ 10.8.0.1 (WireGuard)        │◄───────►│ 10.8.0.2 (WireGuard)         │
+├─────────────────────────────┤         ├──────────────────────────────┤
+│ - LiteLLM Proxy             │         │ - vLLM (10.8.0.2:8000)       │
+│ - Open WebUI                │         │ - ComfyUI (10.8.0.2:8188)    │
+│ - PostgreSQL                │         │ - Training                    │
+└─────────────────────────────┘         └──────────────────────────────┘
+```
+
+### Prerequisites
+
+- ✅ VPS with root access
+- ✅ GPU server with root access
+- ✅ Both servers have public IPs
+
+---
+
+## Method 1: Using Existing wg-easy (Recommended)
+
+You already have `wg-easy` running on your VPS. Let's use it!
+
+### Step 1: Access wg-easy Dashboard
+
+**On your local machine:**
+
+1. Open browser: https://vpn.pivoine.art (or whatever your wg-easy URL is)
+2. Login with admin password
+
+**Don't have wg-easy set up? Skip to Method 2.**
+
+### Step 2: Create GPU Server Client
+
+1. In wg-easy dashboard, click **"+ New Client"**
+2. **Name**: `gpu-server-runpod`
+3. Click **"Create"**
+4. **Download** configuration file (or copy QR code data)
+
+You'll get a file like: `gpu-server-runpod.conf`
+
+### Step 3: Install WireGuard on GPU Server
+
+**SSH into GPU server:**
+
+```bash
+ssh gpu-pivoine  # or your SSH command
+
+# Install WireGuard
+apt update
+apt install -y wireguard wireguard-tools
+```
+
+### Step 4: Configure WireGuard on GPU Server
+
+**Upload the config file:**
+
+```bash
+# On your local machine, copy the config to GPU server
+scp gpu-server-runpod.conf gpu-pivoine:/etc/wireguard/wg0.conf
+
+# Or manually create it on GPU server:
+nano /etc/wireguard/wg0.conf
+# Paste the configuration from wg-easy
+```
+
+**Example config (yours will be different):**
+```ini
+[Interface]
+PrivateKey = <PRIVATE_KEY_FROM_WG_EASY>
+Address = 10.8.0.2/24
+DNS = 10.8.0.1
+
+[Peer]
+PublicKey = <VPS_PUBLIC_KEY_FROM_WG_EASY>
+PresharedKey = <PRESHARED_KEY>
+AllowedIPs = 10.8.0.0/24
+Endpoint = <VPS_PUBLIC_IP>:51820
+PersistentKeepalive = 25
+```
+
+### Step 5: Start WireGuard
+
+```bash
+# Enable IP forwarding
+echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf
+sysctl -p
+
+# Set permissions
+chmod 600 /etc/wireguard/wg0.conf
+
+# Start WireGuard
+systemctl enable wg-quick@wg0
+systemctl start wg-quick@wg0
+
+# Check status
+systemctl status wg-quick@wg0
+wg show
+```
+
+Expected output:
+```
+interface: wg0
+  public key: <GPU_SERVER_PUBLIC_KEY>
+  private key: (hidden)
+  listening port: 51820
+
+peer: <VPS_PUBLIC_KEY>
+  endpoint: <VPS_IP>:51820
+  allowed ips: 10.8.0.0/24
+  latest handshake: 1 second ago
+  transfer: 1.2 KiB received, 892 B sent
+  persistent keepalive: every 25 seconds
+```
+
+### Step 6: Test Connectivity
+
+**From GPU server, ping VPS:**
+
+```bash
+ping 10.8.0.1 -c 4
+```
+
+Expected output:
+```
+PING 10.8.0.1 (10.8.0.1) 56(84) bytes of data.
+64 bytes from 10.8.0.1: icmp_seq=1 ttl=64 time=25.3 ms
+64 bytes from 10.8.0.1: icmp_seq=2 ttl=64 time=24.8 ms
+...
+```
+
+**From VPS, ping GPU server:**
+
+```bash
+ssh root@vps
+ping 10.8.0.2 -c 4
+```
+
+**Test PostgreSQL access from GPU server:**
+
+```bash
+# On GPU server
+apt install -y postgresql-client
+
+# Try connecting to VPS postgres
+psql -h 10.8.0.1 -U valknar -d openwebui -c "SELECT 1;"
+# Should work if postgres allows 10.8.0.0/24
+```
+
+---
+
+## Method 2: Manual WireGuard Setup (If no wg-easy)
+
+### Step 1: Install WireGuard on Both Servers
+
+**On VPS:**
+```bash
+ssh root@vps
+apt update
+apt install -y wireguard wireguard-tools
+```
+
+**On GPU Server:**
+```bash
+ssh gpu-pivoine
+apt update
+apt install -y wireguard wireguard-tools
+```
+
+### Step 2: Generate Keys
+
+**On VPS:**
+```bash
+cd /etc/wireguard
+umask 077
+wg genkey | tee vps-private.key | wg pubkey > vps-public.key
+```
+
+**On GPU Server:**
+```bash
+cd /etc/wireguard
+umask 077
+wg genkey | tee gpu-private.key | wg pubkey > gpu-public.key
+```
+
+### Step 3: Create Config on VPS
+
+**On VPS (`/etc/wireguard/wg0.conf`):**
+
+```bash
+cat > /etc/wireguard/wg0.conf << 'EOF'
+[Interface]
+PrivateKey = <VPS_PRIVATE_KEY>
+Address = 10.8.0.1/24
+ListenPort = 51820
+SaveConfig = false
+
+# GPU Server Peer
+[Peer]
+PublicKey = <GPU_PUBLIC_KEY>
+AllowedIPs = 10.8.0.2/32
+PersistentKeepalive = 25
+EOF
+```
+
+Replace `<VPS_PRIVATE_KEY>` with contents of `vps-private.key`
+Replace `<GPU_PUBLIC_KEY>` with contents from GPU server's `gpu-public.key`
+
+### Step 4: Create Config on GPU Server
+
+**On GPU Server (`/etc/wireguard/wg0.conf`):**
+
+```bash
+cat > /etc/wireguard/wg0.conf << 'EOF'
+[Interface]
+PrivateKey = <GPU_PRIVATE_KEY>
+Address = 10.8.0.2/24
+
+[Peer]
+PublicKey = <VPS_PUBLIC_KEY>
+AllowedIPs = 10.8.0.0/24
+Endpoint = <VPS_PUBLIC_IP>:51820
+PersistentKeepalive = 25
+EOF
+```
+
+Replace:
+- `<GPU_PRIVATE_KEY>` with contents of `gpu-private.key`
+- `<VPS_PUBLIC_KEY>` with contents from VPS's `vps-public.key`
+- `<VPS_PUBLIC_IP>` with your VPS's public IP address
+
+### Step 5: Start WireGuard on Both
+
+**On VPS:**
+```bash
+# Enable IP forwarding
+echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf
+sysctl -p
+
+# Start WireGuard
+chmod 600 /etc/wireguard/wg0.conf
+systemctl enable wg-quick@wg0
+systemctl start wg-quick@wg0
+```
+
+**On GPU Server:**
+```bash
+# Enable IP forwarding
+echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf
+sysctl -p
+
+# Start WireGuard
+chmod 600 /etc/wireguard/wg0.conf
+systemctl enable wg-quick@wg0
+systemctl start wg-quick@wg0
+```
+
+### Step 6: Configure Firewall
+
+**On VPS:**
+```bash
+# Allow WireGuard port
+ufw allow 51820/udp
+ufw reload
+
+# Or with iptables
+iptables -A INPUT -p udp --dport 51820 -j ACCEPT
+iptables-save > /etc/iptables/rules.v4
+```
+
+**On GPU Server (RunPod):**
+```bash
+# Allow WireGuard
+ufw allow 51820/udp
+ufw reload
+```
+
+### Step 7: Test Connection
+
+Same as Method 1 Step 6.
+
+---
+
+## Troubleshooting
+
+### No handshake
+
+**Check:**
+```bash
+wg show
+```
+
+If "latest handshake" shows "never":
+1. Verify public keys are correct (easy to swap them!)
+2. Check firewall allows UDP 51820
+3. Verify endpoint IP is correct
+4. Check `systemctl status wg-quick@wg0` for errors
+
+### Can ping but can't access services
+
+**On VPS, check PostgreSQL allows 10.8.0.0/24:**
+
+```bash
+# Edit postgresql.conf (this path is inside the core_postgres container, e.g. via: docker exec -it core_postgres bash)
+nano /var/lib/postgresql/data/postgresql.conf
+# Add or modify:
+listen_addresses = '*'
+
+# Edit pg_hba.conf
+nano /var/lib/postgresql/data/pg_hba.conf
+# Add:
+host    all             all             10.8.0.0/24             scram-sha-256
+
+# Restart
+docker restart core_postgres
+```
+
+### WireGuard won't start
+
+```bash
+# Check logs
+journalctl -u wg-quick@wg0 -n 50
+
+# Common issues:
+# - Wrong permissions: chmod 600 /etc/wireguard/wg0.conf
+# - Invalid keys: regenerate with wg genkey
+# - Port already in use: lsof -i :51820
+```
+
+---
+
+## Verification Checklist
+
+Before proceeding to Day 5:
+
+- [ ] WireGuard installed on both VPS and GPU server
+- [ ] VPN tunnel established (wg show shows handshake)
+- [ ] GPU server can ping VPS (10.8.0.1)
+- [ ] VPS can ping GPU server (10.8.0.2)
+- [ ] Firewall allows WireGuard (UDP 51820)
+- [ ] PostgreSQL accessible from GPU server
+- [ ] WireGuard starts on boot (systemctl enable)
+
+---
+
+## Network Reference
+
+**VPN IPs:**
+- VPS: `10.8.0.1`
+- GPU Server: `10.8.0.2`
+
+**Service Access from GPU Server:**
+- PostgreSQL: `postgresql://valknar:password@10.8.0.1:5432/dbname`
+- Redis: `10.8.0.1:6379`
+- LiteLLM: `http://10.8.0.1:4000`
+- Mailpit: `10.8.0.1:1025`
+
+**Service Access from VPS:**
+- vLLM: `http://10.8.0.2:8000`
+- ComfyUI: `http://10.8.0.2:8188`
+- JupyterLab: `http://10.8.0.2:8888`
+
+---
+
+## Next: Docker & GPU Setup
+
+Once VPN is working, proceed to **Day 5: Docker & NVIDIA Container Toolkit Setup**.
+
+**Save connection info:**
+
+```bash
+# On GPU server
+cat >> /workspace/SERVER_INFO.md << 'EOF'
+
+## VPN Configuration
+- VPN IP: 10.8.0.2
+- VPS VPN IP: 10.8.0.1
+- WireGuard Status: Active
+- Latest Handshake: [Check with: wg show]
+
+## Network Access
+- Can reach VPS services: ✓
+- VPS can reach GPU services: ✓
+EOF
+```
diff --git a/ai/deploy-gpu-stack.sh b/ai/deploy-gpu-stack.sh
new file mode 100755
index 0000000..f770946
--- /dev/null
+++ b/ai/deploy-gpu-stack.sh
@@ -0,0 +1,229 @@
+#!/bin/bash
+# deploy-gpu-stack.sh - checks prerequisites, then starts the GPU stack with Docker Compose.
+# Run this as root on the GPU server after SSH access is established.
+
+set -e  # Stop at the first command that fails
+
+# ANSI color escapes for the print_* helpers below; they are stored as literal
+# backslash sequences and only interpreted when printed.
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Startup banner followed by one empty line
+banner_rule="=================================="
+printf '%s\n' "$banner_rule" "GPU Stack Deployment Script" "$banner_rule" ""
+
+# Functions
+print_success() {
+    echo -e "${GREEN}✓ $1${NC}"
+}
+
+print_error() {
+    echo -e "${RED}✗ $1${NC}"
+}
+
+print_info() {
+    echo -e "${YELLOW}→ $1${NC}"
+}
+
+# Check if running as root
+if [[ $EUID -ne 0 ]]; then
+   print_error "This script must be run as root (use sudo)"
+   exit 1
+fi
+
+# Step 1: Check prerequisites
+print_info "Checking prerequisites..."
+
+if ! command -v docker &> /dev/null; then
+    print_error "Docker is not installed. Please run DOCKER_GPU_SETUP.md first."
+    exit 1
+fi
+print_success "Docker installed"
+
+if ! command -v nvidia-smi &> /dev/null; then
+    print_error "nvidia-smi not found. Is this a GPU server?"
+    exit 1
+fi
+print_success "NVIDIA GPU detected"
+
+if ! docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then
+    print_error "Docker cannot access GPU. Please configure NVIDIA Container Toolkit."
+    exit 1
+fi
+print_success "Docker GPU access working"
+
+# Step 2: Create the stack tree under /workspace/gpu-stack and make it the working directory
+print_info "Creating directory structure..."
+stack_root=/workspace/gpu-stack
+mkdir -p "$stack_root"/{vllm,comfyui,notebooks,monitoring} \
+    "$stack_root"/training/{configs,data,output}
+cd "$stack_root"
+print_success "Directory structure created"
+
+# Step 3: Create .env file
+if [ ! -f .env ]; then
+    print_info "Creating .env file..."
+
+    cat > .env << 'EOF'
+# GPU Stack Environment Variables
+
+# Timezone
+TIMEZONE=Europe/Berlin
+
+# VPN Network
+VPS_IP=10.8.0.1
+GPU_IP=10.8.0.2
+
+# Model Storage (network volume)
+MODELS_PATH=/workspace/models
+
+# Hugging Face Token (optional, for gated models like Llama)
+# Get from: https://huggingface.co/settings/tokens
+HF_TOKEN=
+
+# Weights & Biases (optional, for training logging)
+# Get from: https://wandb.ai/authorize
+WANDB_API_KEY=
+
+# JupyterLab Access Token
+JUPYTER_TOKEN=pivoine-ai-2025
+
+# PostgreSQL (on VPS)
+DB_HOST=10.8.0.1
+DB_PORT=5432
+DB_USER=valknar
+DB_PASSWORD=ragnarok98
+DB_NAME=openwebui
+EOF
+
+    chmod 600 .env
+    print_success ".env file created (please edit with your tokens)"
+else
+    print_success ".env file already exists"
+fi
+
+# Step 4: Download docker-compose.yaml
+print_info "Downloading docker-compose.yaml..."
+
+# In production, this would be copied from the repo
+# For now, assume it's already in the current directory
+if [ ! -f docker-compose.yaml ]; then
+    print_error "docker-compose.yaml not found. Please copy gpu-server-compose.yaml to docker-compose.yaml"
+    exit 1
+fi
+
+print_success "docker-compose.yaml found"
+
+# Step 5: Pre-download models (optional but recommended)
+print_info "Do you want to pre-download models? (y/n)"
+# EOF on stdin (non-interactive run) counts as "no" instead of tripping set -e.
+read -r response || response=n
+if [[ "$response" =~ ^[Yy]$ ]]; then
+    print_info "Downloading Llama 3.1 8B Instruct (this will take a while)..."
+    mkdir -p /workspace/models
+    pip install -q huggingface-hub
+    # NOTE(review): gpu-server-compose.yaml mounts the models dir as vLLM's Hugging Face
+    # cache; confirm vLLM reuses this --local-dir copy instead of downloading again.
+    if huggingface-cli download \
+        meta-llama/Meta-Llama-3.1-8B-Instruct \
+        --local-dir /workspace/models/Meta-Llama-3.1-8B-Instruct \
+        --local-dir-use-symlinks False; then
+        print_success "Model downloaded to /workspace/models"
+    else  # best-effort step: report the failure and carry on with deployment
+        print_error "Model download failed (may need HF_TOKEN)"
+    fi
+fi
+
+# Step 6: Start services
+print_info "Starting GPU stack services..."
+
+docker compose up -d vllm comfyui jupyter netdata
+
+print_success "Services starting (this may take a few minutes)..."
+
+# Step 7: Wait for services
+print_info "Waiting for services to be ready..."
+
+sleep 10
+
+# Check service health
+print_info "Checking service status..."
+
+if docker ps | grep -q gpu_vllm; then
+    print_success "vLLM container running"
+else
+    print_error "vLLM container not running"
+fi
+
+if docker ps | grep -q gpu_comfyui; then
+    print_success "ComfyUI container running"
+else
+    print_error "ComfyUI container not running"
+fi
+
+if docker ps | grep -q gpu_jupyter; then
+    print_success "JupyterLab container running"
+else
+    print_error "JupyterLab container not running"
+fi
+
+if docker ps | grep -q gpu_netdata; then
+    print_success "Netdata container running"
+else
+    print_error "Netdata container not running"
+fi
+
+# Step 8: Display access information
+echo ""
+echo "=================================="
+echo "Deployment Complete!"
+echo "=================================="
+echo ""
+echo "Services accessible via VPN (from VPS):"
+echo "  - vLLM API: http://10.8.0.2:8000"
+echo "  - ComfyUI: http://10.8.0.2:8188"
+echo "  - JupyterLab: http://10.8.0.2:8888 (token: pivoine-ai-2025)"
+echo "  - Netdata: http://10.8.0.2:19999"
+echo ""
+echo "Local access (from GPU server):"
+echo "  - vLLM API: http://localhost:8000"
+echo "  - ComfyUI: http://localhost:8188"
+echo "  - JupyterLab: http://localhost:8888"
+echo "  - Netdata: http://localhost:19999"
+echo ""
+echo "Useful commands:"
+echo "  - View logs: docker compose logs -f"
+echo "  - Check status: docker compose ps"
+echo "  - Stop all: docker compose down"
+echo "  - Restart service: docker compose restart vllm"
+echo "  - Start training: docker compose --profile training up -d axolotl"
+echo ""
+echo "Next steps:"
+echo "  1. Wait for vLLM to load model (check logs: docker compose logs -f vllm)"
+echo "  2. Test vLLM: curl http://localhost:8000/v1/models"
+echo "  3. Configure LiteLLM on VPS to use http://10.8.0.2:8000"
+echo "  4. Download ComfyUI models via web interface"
+echo ""
+
+# Step 9: Create helpful aliases
+print_info "Creating helpful aliases..."
+
+cat >> ~/.bashrc << 'EOF'
+
+# GPU Stack Aliases
+alias gpu-logs='cd /workspace/gpu-stack && docker compose logs -f'
+alias gpu-ps='cd /workspace/gpu-stack && docker compose ps'
+alias gpu-restart='cd /workspace/gpu-stack && docker compose restart'
+alias gpu-down='cd /workspace/gpu-stack && docker compose down'
+alias gpu-up='cd /workspace/gpu-stack && docker compose up -d'
+alias gpu-stats='watch -n 1 nvidia-smi'
+alias gpu-top='nvtop'
+EOF
+
+print_success "Aliases added to ~/.bashrc (reload with: source ~/.bashrc)"
+
+echo ""
+print_success "All done! 🚀"
diff --git a/ai/gpu-server-compose.yaml b/ai/gpu-server-compose.yaml
new file mode 100644
index 0000000..9cb2f70
--- /dev/null
+++ b/ai/gpu-server-compose.yaml
@@ -0,0 +1,237 @@
+# GPU Server Docker Compose Configuration
+# Deploy on RunPod GPU server (10.8.0.2)
+# Services accessible from VPS (10.8.0.1) via WireGuard VPN
+
+version: '3.8'
+
+services:
+  # =============================================================================
+  # vLLM - High-performance LLM Inference Server
+  # =============================================================================
+  vllm:
+    image: vllm/vllm-openai:latest
+    container_name: gpu_vllm
+    restart: unless-stopped
+    runtime: nvidia  # GPU access through the NVIDIA container runtime
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      CUDA_VISIBLE_DEVICES: "0"  # pin inference to GPU index 0
+      HF_TOKEN: ${HF_TOKEN:-}  # expands to an empty string when HF_TOKEN is unset
+    volumes:
+      - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface  # host-side Hugging Face cache
+    command:
+      - --model
+      - meta-llama/Meta-Llama-3.1-8B-Instruct  # Change model here
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8000"  # quoted: Compose requires every command list item to be a string
+      - --tensor-parallel-size
+      - "1"
+      - --gpu-memory-utilization
+      - "0.85"  # Leave 15% for other tasks
+      - --max-model-len
+      - "8192"
+      - --dtype
+      - auto
+      - --trust-remote-code  # NOTE(review): allows model repos to run their own code; confirm it is needed
+    ports:
+      - "8000:8000"  # published on all host interfaces
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]  # assumes curl exists in the image
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 120s  # Model loading takes time
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    labels:
+      - "service=vllm"
+      - "stack=gpu-ai"
+
+  # =============================================================================
+  # ComfyUI - Advanced Stable Diffusion Interface
+  # =============================================================================
+  comfyui:
+    container_name: gpu_comfyui
+    image: ghcr.io/ai-dock/comfyui:latest
+    runtime: nvidia
+    restart: unless-stopped
+    ports:
+      - '8188:8188'
+    environment:
+      # ComfyUI launch flags: listen on every interface, port 8188
+      COMFYUI_FLAGS: '--listen 0.0.0.0 --port 8188'
+      TZ: ${TIMEZONE:-Europe/Berlin}
+      NVIDIA_VISIBLE_DEVICES: all
+    volumes:
+      - comfyui_data:/data
+      - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models
+      - comfyui_output:/opt/ComfyUI/output
+      - comfyui_input:/opt/ComfyUI/input
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+              count: 1
+              driver: nvidia
+    healthcheck:
+      test: ['CMD', 'curl', '-f', 'http://localhost:8188/']
+      start_period: 60s
+      interval: 30s
+      timeout: 10s
+      retries: 3
+    labels:
+      service: comfyui
+      stack: gpu-ai
+
+  # =============================================================================
+  # Axolotl - LLM Fine-tuning Framework
+  # =============================================================================
+  # Note: This service uses "profiles" - only starts when explicitly requested
+  # Start with: docker compose --profile training up -d axolotl
+  axolotl:
+    profiles:
+      - training
+    container_name: gpu_training
+    image: winglian/axolotl:main-py3.11-cu121-2.2.2
+    runtime: nvidia
+    working_dir: /workspace
+    # Idle placeholder command; override it when launching a specific training run
+    command: ['sleep', 'infinity']
+    environment:
+      HF_TOKEN: ${HF_TOKEN:-}
+      WANDB_API_KEY: ${WANDB_API_KEY:-}
+      NVIDIA_VISIBLE_DEVICES: all
+    volumes:
+      - ./training/configs:/workspace/configs
+      - ./training/data:/workspace/data
+      - ./training/output:/workspace/output
+      - ${MODELS_PATH:-/workspace/models}:/workspace/models
+      - training_cache:/root/.cache
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+              count: 1
+              driver: nvidia
+    labels:
+      service: axolotl
+      stack: gpu-ai
+
+  # =============================================================================
+  # JupyterLab - Interactive Development Environment
+  # =============================================================================
+  jupyter:
+    image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel
+    container_name: gpu_jupyter
+    restart: unless-stopped
+    runtime: nvidia  # GPU access through the NVIDIA container runtime
+    volumes:
+      - ./notebooks:/workspace/notebooks
+      - ${MODELS_PATH:-/workspace/models}:/workspace/models
+      - jupyter_cache:/root/.cache
+    ports:
+      - "8888:8888"  # published on all host interfaces
+    environment:
+      NVIDIA_VISIBLE_DEVICES: all
+      JUPYTER_ENABLE_LAB: "yes"  # NOTE(review): confirm this variable has any effect on the pytorch image
+      JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025}  # default is committed to the repo; override it
+      HF_TOKEN: ${HF_TOKEN:-}
+    command: |
+      bash -c "
+      pip install --quiet jupyterlab transformers datasets accelerate bitsandbytes peft trl sentencepiece protobuf &&
+      jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser --IdentityProvider.token='${JUPYTER_TOKEN:-pivoine-ai-2025}'
+      "
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8888/"]  # assumes curl exists in the image
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s  # the pip install above runs on every container start
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    labels:
+      - "service=jupyter"
+      - "stack=gpu-ai"
+
+  # =============================================================================
+  # Netdata - System & GPU Monitoring
+  # =============================================================================
+  netdata:
+    container_name: gpu_netdata
+    hostname: gpu-runpod
+    image: netdata/netdata:latest
+    runtime: nvidia
+    restart: unless-stopped
+    ports:
+      - '19999:19999'
+    environment:
+      TZ: ${TIMEZONE:-Europe/Berlin}
+      NVIDIA_VISIBLE_DEVICES: all
+    security_opt:
+      - apparmor:unconfined
+    cap_add:
+      - SYS_ADMIN
+      - SYS_PTRACE
+    volumes:
+      - netdata_config:/etc/netdata
+      - netdata_lib:/var/lib/netdata
+      - netdata_cache:/var/cache/netdata
+      - /etc/os-release:/host/etc/os-release:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro  # Docker socket, mounted read-only
+      - /proc:/host/proc:ro  # host /proc, read-only
+      - /sys:/host/sys:ro  # host /sys, read-only
+    labels:
+      service: netdata
+      stack: gpu-ai
+
+# =============================================================================
+# Volumes
+# =============================================================================
+volumes:
+  # Netdata: mounted at /etc/netdata, /var/lib/netdata and /var/cache/netdata
+  netdata_config:
+    driver: local
+  netdata_lib:
+    driver: local
+  netdata_cache:
+    driver: local
+
+  # JupyterLab: mounted at /root/.cache
+  jupyter_cache:
+    driver: local
+
+  # Axolotl training: mounted at /root/.cache
+  training_cache:
+    driver: local
+
+  # ComfyUI: input, output and /data directories
+  comfyui_input:
+    driver: local
+  comfyui_output:
+    driver: local
+  comfyui_data:
+    driver: local
+
+# =============================================================================
+# Networks
+# =============================================================================
+networks:
+  default:  # the network every service above joins implicitly
+    ipam:
+      config:
+        - subnet: '172.25.0.0/24'  # fixed address range for the stack
+    driver: bridge
diff --git a/ai/litellm-config-gpu.yaml b/ai/litellm-config-gpu.yaml
new file mode 100644
index 0000000..5313d64
--- /dev/null
+++ b/ai/litellm-config-gpu.yaml
@@ -0,0 +1,199 @@
+# LiteLLM Configuration with GPU Server Integration
+# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server)
+
+model_list:
+  # -----------------------------------------------------------------------------
+  # Hosted Anthropic Claude models (API-based, intended for complex reasoning)
+  # -----------------------------------------------------------------------------
+
+  - model_name: claude-sonnet-4
+    litellm_params:
+      api_key: os.environ/ANTHROPIC_API_KEY
+      model: anthropic/claude-sonnet-4-20250514
+
+  - model_name: claude-sonnet-4.5
+    litellm_params:
+      api_key: os.environ/ANTHROPIC_API_KEY
+      model: anthropic/claude-sonnet-4-5-20250929
+
+  - model_name: claude-3-5-sonnet
+    litellm_params:
+      api_key: os.environ/ANTHROPIC_API_KEY
+      model: anthropic/claude-3-5-sonnet-20241022
+
+  - model_name: claude-3-opus
+    litellm_params:
+      api_key: os.environ/ANTHROPIC_API_KEY
+      model: anthropic/claude-3-opus-20240229
+
+  - model_name: claude-3-haiku
+    litellm_params:
+      api_key: os.environ/ANTHROPIC_API_KEY
+      model: anthropic/claude-3-haiku-20240307
+
+  # -----------------------------------------------------------------------------
+  # Self-hosted models served by vLLM on the GPU server (over the WireGuard VPN)
+  # -----------------------------------------------------------------------------
+
+  # Llama 3.1 8B Instruct: quick general-purpose model for routine work
+  - model_name: llama-3.1-8b
+    litellm_params:
+      api_base: 'http://10.8.0.2:8000/v1'
+      api_key: dummy  # placeholder; vLLM doesn't require auth
+      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
+      tpm: 100000  # tokens-per-minute limit
+      rpm: 1000  # requests-per-minute limit
+
+  # Alternative models (uncomment and configure on GPU server as needed)
+
+  # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning
+  # - model_name: qwen-2.5-14b
+  #   litellm_params:
+  #     model: openai/Qwen/Qwen2.5-14B-Instruct
+  #     api_base: http://10.8.0.2:8000/v1
+  #     api_key: dummy
+  #     rpm: 800
+  #     tpm: 80000
+
+  # Mistral 7B Instruct - Very fast, lightweight
+  # - model_name: mistral-7b
+  #   litellm_params:
+  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
+  #     api_base: http://10.8.0.2:8000/v1
+  #     api_key: dummy
+  #     rpm: 1200
+  #     tpm: 120000
+
+  # DeepSeek Coder 6.7B - Code generation specialist
+  # - model_name: deepseek-coder-6.7b
+  #   litellm_params:
+  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
+  #     api_base: http://10.8.0.2:8000/v1
+  #     api_key: dummy
+  #     rpm: 1000
+  #     tpm: 100000
+
+# =============================================================================
+# Router Settings - Intelligent Model Selection
+# =============================================================================
+
+# Model aliases for easy switching in Open WebUI. NOTE(review): LiteLLM documents aliases as router_settings.model_group_alias; confirm `model_name_map` is read.
+model_name_map:
+  # Default model (self-hosted, fast)
+  gpt-3.5-turbo: llama-3.1-8b
+
+  # Requests for the GPT-4 names are pointed at Claude Sonnet 4.5 for complex tasks
+  gpt-4: claude-sonnet-4.5
+  gpt-4-turbo: claude-sonnet-4.5
+
+# LiteLLM Settings
+litellm_settings:
+  drop_params: true  # drop request parameters the target provider does not support
+  set_verbose: false  # Disable verbose logging for better performance
+
+  # Enable caching with Redis for better performance
+  cache: true
+  cache_params:
+    type: redis
+    host: redis  # assumes a host named `redis` is reachable from the proxy; TODO confirm
+    port: 6379
+    ttl: 3600  # Cache for 1 hour
+
+  # Failure allowance before cooldown (same value as router_settings); not related to parameter stripping
+  allowed_fails: 0
+
+  # Modify params before sending to provider
+  modify_params: true
+
+  # No success or failure callbacks are registered: both lists are empty
+  success_callback: []  # Disable all success callbacks to reduce DB writes
+  failure_callback: []  # Disable all failure callbacks
+
+# Router Settings
+router_settings:
+  allowed_fails: 0  # failures tolerated before a deployment enters cooldown
+  # simple-shuffle only balances deployments that share a model_name; it never
+  # switches models. Self-hosted -> Claude failover is the `fallbacks` entry below.
+  routing_strategy: simple-shuffle
+  cooldown_time: 30  # seconds a failed deployment stays in cooldown
+  fallbacks:
+    - "llama-3.1-8b": ["claude-3-haiku"]
+
+# Drop unsupported parameters. NOTE(review): repeats litellm_settings.drop_params; confirm LiteLLM reads this top-level key.
+default_litellm_params:
+  drop_params: true
+
+# General Settings
+general_settings:
+  disable_responses_id_security: true
+
+  # Spend logs stay ON (the disable flag is false), accepting the extra database writes
+  disable_spend_logs: false  # Keep enabled to track API vs GPU costs
+
+  # Disable tag tracking
+  disable_tag_tracking: true
+
+  # Daily spend logs also stay ON (the disable flag is false)
+  disable_daily_spend_logs: false  # Keep enabled for cost analysis
+
+  # Master key for authentication, read from the LITELLM_MASTER_KEY env var
+  master_key: os.environ/LITELLM_MASTER_KEY
+
+  # Database for logging, read from DATABASE_URL (optional but recommended for cost tracking)
+  database_url: os.environ/DATABASE_URL
+
+  # Enable OpenAPI docs
+  docs_url: /docs
+
+# =============================================================================
+# Usage Guidelines (for Open WebUI users)
+# =============================================================================
+#
+# Model Selection Guide:
+#
+# Use llama-3.1-8b for:
+# - General chat and Q&A
+# - Simple code generation
+# - Data extraction
+# - Summarization
+# - Translation
+# - Most routine tasks
+# Cost: ~$0/month (self-hosted)
+# Speed: ~50-80 tokens/second
+#
+# Use qwen-2.5-14b (commented out in model_list; enable it there first) for:
+# - Complex reasoning
+# - Multi-step problems
+# - Advanced code generation
+# - Multilingual tasks
+# Cost: ~$0/month (self-hosted)
+# Speed: ~30-50 tokens/second
+#
+# Use claude-sonnet-4.5 for:
+# - Very complex reasoning
+# - Long documents (200K context)
+# - Production-critical code
+# - When quality matters most
+# Cost: ~$3/million input tokens, ~$15/million output tokens
+# Speed: ~30-40 tokens/second
+#
+# Use claude-3-haiku for:
+# - API fallback (if self-hosted down)
+# - Very fast responses needed
+# Cost: ~$0.25/million input tokens, ~$1.25/million output tokens
+# Speed: ~60-80 tokens/second
+#
+# =============================================================================
+
+# Health Check Configuration
+health_check:
+  # Intended to poll vLLM's health endpoint. NOTE(review): LiteLLM documents general_settings.background_health_checks / health_check_interval; confirm this key is read.
+  enabled: true
+  interval: 30  # seconds
+  timeout: 5  # seconds
+
+# Fallback Configuration: the intent is to use Claude automatically when the GPU server is down.
+# NOTE(review): LiteLLM documents `fallbacks` under router_settings/litellm_settings; confirm this top-level key is read.
+fallback:
+  - ["llama-3.1-8b", "claude-3-haiku"]
+  - ["qwen-2.5-14b", "claude-sonnet-4.5"]  # qwen-2.5-14b is commented out in model_list