From d5e37dbd3f7ec2460b34b8e6c93414043685c21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Fri, 21 Nov 2025 14:41:10 +0100 Subject: [PATCH] cleanup: remove GPU/RunPod files from docker-compose repository Removed GPU orchestration files migrated to dedicated runpod repository: - Model orchestrator, vLLM, Flux, MusicGen services - GPU Docker Compose files and configs - GPU deployment scripts and documentation Kept VPS AI services and facefusion: - compose.yaml (VPS AI + facefusion) - litellm-config.yaml (VPS LiteLLM) - postgres/ (VPS PostgreSQL init) - Dockerfile, entrypoint.sh, disable-nsfw-filter.patch (facefusion) - README.md (updated with runpod reference) GPU infrastructure now maintained at: ssh://git@dev.pivoine.art:2222/valknar/runpod.git --- ai/.env.example | 9 - ai/DOCKER_GPU_SETUP.md | 430 -------- ai/GPU_DEPLOYMENT_LOG.md | 421 -------- ai/GPU_EXPANSION_PLAN.md | 1306 ------------------------ ai/README_GPU_SETUP.md | 444 -------- ai/SETUP_GUIDE.md | 261 ----- ai/TAILSCALE_SETUP.md | 417 -------- ai/WIREGUARD_SETUP.md | 393 ------- ai/deploy-gpu-stack.sh | 229 ----- ai/docker-compose.gpu.yaml | 104 -- ai/flux/config/config.json | 13 - ai/gpu-server-compose.yaml | 237 ----- ai/litellm-config-gpu.yaml | 199 ---- ai/model-orchestrator/Dockerfile | 22 - ai/model-orchestrator/models.yaml | 89 -- ai/model-orchestrator/orchestrator.py | 359 ------- ai/model-orchestrator/requirements.txt | 6 - ai/musicgen/Dockerfile | 38 - ai/musicgen/requirements.txt | 6 - ai/musicgen/server.py | 194 ---- ai/simple_vllm_server.py | 302 ------ ai/vllm/Dockerfile | 34 - ai/vllm/requirements.txt | 4 - ai/vllm/server.py | 302 ------ 24 files changed, 5819 deletions(-) delete mode 100644 ai/.env.example delete mode 100644 ai/DOCKER_GPU_SETUP.md delete mode 100644 ai/GPU_DEPLOYMENT_LOG.md delete mode 100644 ai/GPU_EXPANSION_PLAN.md delete mode 100644 ai/README_GPU_SETUP.md delete mode 100644 ai/SETUP_GUIDE.md delete mode 100644 ai/TAILSCALE_SETUP.md delete mode 100644 ai/WIREGUARD_SETUP.md delete mode 100755 ai/deploy-gpu-stack.sh delete mode 100644 ai/docker-compose.gpu.yaml delete mode 100644 ai/flux/config/config.json delete mode 100644 ai/gpu-server-compose.yaml delete mode 100644 ai/litellm-config-gpu.yaml delete mode 100644 ai/model-orchestrator/Dockerfile delete mode 100644 ai/model-orchestrator/models.yaml delete mode 100644 ai/model-orchestrator/orchestrator.py delete mode 100644 ai/model-orchestrator/requirements.txt delete mode 100644 ai/musicgen/Dockerfile delete mode 100644 ai/musicgen/requirements.txt delete mode 100644 ai/musicgen/server.py delete mode 100644 ai/simple_vllm_server.py delete mode 100644 ai/vllm/Dockerfile delete mode 100644 ai/vllm/requirements.txt delete mode 100644 ai/vllm/server.py diff --git a/ai/.env.example b/ai/.env.example deleted file mode 100644 index 0c5c769..0000000 --- a/ai/.env.example +++ /dev/null @@ -1,9 +0,0 @@ -# Environment Variables for Multi-Modal AI Orchestration -# Copy this file to .env and fill in your values - -# Hugging Face Token (for downloading models) -# Get from: https://huggingface.co/settings/tokens -HF_TOKEN=hf_your_token_here - -# Tailscale IP of GPU Server (for VPS to connect) -GPU_TAILSCALE_IP=100.100.108.13 diff --git a/ai/DOCKER_GPU_SETUP.md b/ai/DOCKER_GPU_SETUP.md deleted file mode 100644 index e60d103..0000000 --- a/ai/DOCKER_GPU_SETUP.md +++ /dev/null @@ -1,430 +0,0 @@ -# Docker & NVIDIA Container Toolkit Setup - -## Day 5: Docker Configuration on GPU Server - -This guide sets up Docker with GPU support on 
your RunPod server. - ---- - -## Step 1: Install Docker - -### Quick Install (Recommended) - -```bash -# SSH into GPU server -ssh gpu-pivoine - -# Download and run Docker install script -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# Verify installation -docker --version -docker compose version -``` - -Expected output: -``` -Docker version 24.0.7, build afdd53b -Docker Compose version v2.23.0 -``` - -### Manual Install (Alternative) - -```bash -# Add Docker's official GPG key -apt-get update -apt-get install -y ca-certificates curl gnupg -install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg -chmod a+r /etc/apt/keyrings/docker.gpg - -# Add repository -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null - -# Install Docker -apt-get update -apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Start Docker -systemctl enable docker -systemctl start docker -``` - ---- - -## Step 2: Install NVIDIA Container Toolkit - -This enables Docker containers to use the GPU. - -```bash -# Add NVIDIA repository -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ - gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - -curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - -# Install toolkit -apt-get update -apt-get install -y nvidia-container-toolkit - -# Configure Docker to use NVIDIA runtime -nvidia-ctk runtime configure --runtime=docker - -# Restart Docker -systemctl restart docker -``` - ---- - -## Step 3: Test GPU Access in Docker - -### Test 1: Basic CUDA Container - -```bash -docker run --rm --runtime=nvidia --gpus all \ - nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -``` - -Expected output: Same as `nvidia-smi` output showing your RTX 4090. - -### Test 2: PyTorch Container - -```bash -docker run --rm --runtime=nvidia --gpus all \ - pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime \ - python -c "import torch; print('CUDA:', torch.cuda.is_available(), 'Device:', torch.cuda.get_device_name(0))" -``` - -Expected output: -``` -CUDA: True Device: NVIDIA GeForce RTX 4090 -``` - -### Test 3: Multi-GPU Query (if you have multiple GPUs) - -```bash -docker run --rm --runtime=nvidia --gpus all \ - nvidia/cuda:12.1.0-base-ubuntu22.04 \ - bash -c "echo 'GPU Count:' && nvidia-smi --list-gpus" -``` - ---- - -## Step 4: Configure Docker Compose with GPU Support - -Docker Compose needs to know about NVIDIA runtime. 
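Before creating the daemon config, it's worth checking what Docker already reports. A minimal sanity check (assuming a recent Docker where `docker info` exposes these fields) looks like this:

```bash
# Runtimes Docker currently knows about; "nvidia" should be listed after
# running `nvidia-ctk runtime configure --runtime=docker` in Step 2
docker info --format '{{range $name, $r := .Runtimes}}{{$name}} {{end}}'

# Runtime used when a container doesn't ask for one explicitly
# (expected: runc now, nvidia after the daemon.json change below)
docker info --format '{{.DefaultRuntime}}'
```

If `nvidia` is missing from the runtime list, repeat Step 2 before continuing.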
- -### Create daemon.json - -```bash -cat > /etc/docker/daemon.json << 'EOF' -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - }, - "default-runtime": "nvidia", - "log-driver": "json-file", - "log-opts": { - "max-size": "10m", - "max-file": "3" - } -} -EOF - -# Restart Docker -systemctl restart docker -``` - ---- - -## Step 5: Create GPU Project Structure - -```bash -cd /workspace - -# Create directory structure -mkdir -p gpu-stack/{vllm,comfyui,training,jupyter,monitoring} -cd gpu-stack - -# Create .env file -cat > .env << 'EOF' -# GPU Stack Environment Variables - -# Timezone -TIMEZONE=Europe/Berlin - -# VPN Network -VPS_IP=10.8.0.1 -GPU_IP=10.8.0.2 - -# Model Storage -MODELS_PATH=/workspace/models - -# Hugging Face (optional, for private models) -HF_TOKEN= - -# PostgreSQL (on VPS) -DB_HOST=10.8.0.1 -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui - -# Weights & Biases (optional, for training logging) -WANDB_API_KEY= -EOF - -chmod 600 .env -``` - ---- - -## Step 6: Test Full Stack (Quick Smoke Test) - -Let's deploy a minimal vLLM container to verify everything works: - -```bash -cd /workspace/gpu-stack - -# Create test compose file -cat > test-compose.yaml << 'EOF' -services: - test-vllm: - image: vllm/vllm-openai:latest - container_name: test_vllm - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - command: - - --model - - facebook/opt-125m # Tiny model for testing - - --host - - 0.0.0.0 - - --port - - 8000 - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -EOF - -# Start test -docker compose -f test-compose.yaml up -d - -# Wait 30 seconds for model download -sleep 30 - -# Check logs -docker compose -f test-compose.yaml logs - -# Test inference -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "facebook/opt-125m", - "prompt": "Hello, my name is", - "max_tokens": 10 - }' -``` - -Expected output (JSON response with generated text). 
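For a quicker check than eyeballing the raw JSON, the same request can be piped through `jq` (install it with `apt install -y jq` if needed); this assumes the response follows the usual OpenAI completions shape with a `choices` array:

```bash
# Print only the generated continuation of the prompt
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "facebook/opt-125m",
    "prompt": "Hello, my name is",
    "max_tokens": 10
  }' | jq -r '.choices[0].text'
```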
- -**Clean up test:** -```bash -docker compose -f test-compose.yaml down -``` - ---- - -## Step 7: Install Additional Tools - -```bash -# Python tools -apt install -y python3-pip python3-venv - -# Monitoring tools -apt install -y htop nvtop iotop - -# Network tools -apt install -y iperf3 tcpdump - -# Development tools -apt install -y build-essential - -# Git LFS (for large model files) -apt install -y git-lfs -git lfs install -``` - ---- - -## Step 8: Configure Automatic Updates (Optional) - -```bash -# Install unattended-upgrades -apt install -y unattended-upgrades - -# Configure -dpkg-reconfigure -plow unattended-upgrades - -# Enable automatic security updates -cat > /etc/apt/apt.conf.d/50unattended-upgrades << 'EOF' -Unattended-Upgrade::Allowed-Origins { - "${distro_id}:${distro_codename}-security"; -}; -Unattended-Upgrade::Automatic-Reboot "false"; -Unattended-Upgrade::Remove-Unused-Dependencies "true"; -EOF -``` - ---- - -## Troubleshooting - -### Docker can't access GPU - -**Problem:** `docker: Error response from daemon: could not select device driver "" with capabilities: [[gpu]]` - -**Solution:** -```bash -# Verify NVIDIA runtime is configured -docker info | grep -i runtime - -# Should show nvidia in runtimes list -# If not, reinstall nvidia-container-toolkit - -# Check daemon.json -cat /etc/docker/daemon.json - -# Restart Docker -systemctl restart docker -``` - -### Permission denied on docker commands - -**Solution:** -```bash -# Add your user to docker group (if not root) -usermod -aG docker $USER - -# Or always use sudo -sudo docker ... -``` - -### Out of disk space - -**Check usage:** -```bash -df -h -du -sh /var/lib/docker -docker system df -``` - -**Clean up:** -```bash -# Remove unused images -docker image prune -a - -# Remove unused volumes -docker volume prune - -# Full cleanup -docker system prune -a --volumes -``` - ---- - -## Verification Checklist - -Before deploying the full stack: - -- [ ] Docker installed and running -- [ ] `docker --version` shows 24.x or newer -- [ ] `docker compose version` works -- [ ] NVIDIA Container Toolkit installed -- [ ] `docker run --gpus all nvidia/cuda:12.1.0-base nvidia-smi` works -- [ ] PyTorch container can see GPU -- [ ] Test vLLM deployment successful -- [ ] /workspace directory structure created -- [ ] .env file configured with VPN IPs -- [ ] Additional tools installed (nvtop, htop, etc.) - ---- - -## Performance Monitoring Commands - -**GPU Monitoring:** -```bash -# Real-time GPU stats -watch -n 1 nvidia-smi - -# Or with nvtop (prettier) -nvtop - -# GPU memory usage -nvidia-smi --query-gpu=memory.used,memory.total --format=csv -``` - -**Docker Stats:** -```bash -# Container resource usage -docker stats - -# Specific container -docker stats vllm --no-stream -``` - -**System Resources:** -```bash -# Overall system -htop - -# I/O stats -iotop - -# Network -iftop -``` - ---- - -## Next: Deploy Production Stack - -Now you're ready to deploy the full GPU stack with vLLM, ComfyUI, and training tools. 
- -**Proceed to:** Deploying the production docker-compose.yaml - -**Save your progress:** - -```bash -cat >> /workspace/SERVER_INFO.md << 'EOF' - -## Docker Configuration -- Docker Version: [docker --version] -- NVIDIA Runtime: Enabled -- GPU Access in Containers: ✓ -- Test vLLM Deployment: Successful -- Directory: /workspace/gpu-stack - -## Tools Installed -- nvtop: GPU monitoring -- htop: System monitoring -- Docker Compose: v2.x -- Git LFS: Large file support -EOF -``` diff --git a/ai/GPU_DEPLOYMENT_LOG.md b/ai/GPU_DEPLOYMENT_LOG.md deleted file mode 100644 index 206097b..0000000 --- a/ai/GPU_DEPLOYMENT_LOG.md +++ /dev/null @@ -1,421 +0,0 @@ -# GPU Server Deployment Log - -## Current Deployment (2025-11-21) - -### Infrastructure -- **Provider**: RunPod (Spot Instance) -- **GPU**: NVIDIA RTX 4090 24GB -- **Disk**: 50GB local SSD (expanded from 20GB) -- **Network Volume**: 922TB at `/workspace` -- **Region**: Europe -- **Cost**: ~$0.50/hour (~$360/month if running 24/7) - -### Network Configuration -- **VPN**: Tailscale (replaces WireGuard due to RunPod UDP restrictions) -- **GPU Server Tailscale IP**: 100.100.108.13 -- **VPS Tailscale IP**: (get with `tailscale ip -4` on VPS) - -### SSH Access -``` -Host gpu-pivoine - HostName 213.173.102.232 - Port 29695 - User root - IdentityFile ~/.ssh/id_ed25519 -``` - -**Note**: RunPod Spot instances can be terminated and restarted with new ports/IPs. Update SSH config accordingly. - -### Software Stack -- **Python**: 3.11.10 -- **vLLM**: 0.6.4.post1 (installed with pip) -- **PyTorch**: 2.5.1 with CUDA 12.4 -- **Tailscale**: Installed via official script - -### vLLM Deployment - -**Custom Server**: `ai/simple_vllm_server.py` -- Uses `AsyncLLMEngine` directly to bypass multiprocessing issues -- OpenAI-compatible API endpoints: - - `GET /v1/models` - List available models - - `POST /v1/completions` - Text completion - - `POST /v1/chat/completions` - Chat completion -- Default model: Qwen/Qwen2.5-7B-Instruct -- Cache directory: `/workspace/huggingface_cache` - -**Deployment Command**: -```bash -# Copy server script to GPU server -scp ai/simple_vllm_server.py gpu-pivoine:/workspace/ - -# Start server -ssh gpu-pivoine "cd /workspace && nohup python3 simple_vllm_server.py > vllm.log 2>&1 &" - -# Check status -ssh gpu-pivoine "curl http://localhost:8000/v1/models" -``` - -**Server Configuration** (environment variables): -- `VLLM_HOST`: 0.0.0.0 (default) -- `VLLM_PORT`: 8000 (default) - -### Model Configuration -- **Model**: Qwen/Qwen2.5-7B-Instruct (no auth required) -- **Context Length**: 4096 tokens -- **GPU Memory**: 85% utilization -- **Tensor Parallel**: 1 (single GPU) - -### Known Issues & Solutions - -#### Issue 1: vLLM Multiprocessing Errors -**Problem**: Default vLLM v1 engine fails with ZMQ/CUDA multiprocessing errors on RunPod. -**Solution**: Custom `AsyncLLMEngine` FastAPI server bypasses multiprocessing layer entirely. - -#### Issue 2: Disk Space (Solved) -**Problem**: Original 20GB disk filled up with Hugging Face cache. -**Solution**: Expanded to 50GB and use `/workspace` for model cache. - -#### Issue 3: Gated Models -**Problem**: Llama models require Hugging Face authentication. -**Solution**: Use Qwen 2.5 7B Instruct (no auth required) or set `HF_TOKEN` environment variable. - -#### Issue 4: Spot Instance Volatility -**Problem**: RunPod Spot instances can be terminated anytime. -**Solution**: Accept as trade-off for cost savings. Document SSH details for quick reconnection. 
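A small helper script can make that reconnection faster by rewriting the `gpu-pivoine` entry in `~/.ssh/config` when the pod comes back with a new address. This is only a sketch: it assumes GNU `sed`, an SSH config block shaped like the one above, and that the new IP and port are copied manually from the RunPod dashboard.

```bash
#!/usr/bin/env bash
# update-gpu-ssh.sh NEW_IP NEW_PORT — repoint the gpu-pivoine alias after a pod restart
set -euo pipefail

NEW_IP="$1"    # public IP shown in the RunPod dashboard
NEW_PORT="$2"  # exposed SSH port shown in the RunPod dashboard

# Only touch lines between "Host gpu-pivoine" and the next "Host" entry
sed -i \
  -e "/^Host gpu-pivoine$/,/^Host /s/^\( *HostName \).*/\1${NEW_IP}/" \
  -e "/^Host gpu-pivoine$/,/^Host /s/^\( *Port \).*/\1${NEW_PORT}/" \
  ~/.ssh/config

# Confirm the alias works and the GPU is visible again
ssh gpu-pivoine "hostname && nvidia-smi --query-gpu=name --format=csv,noheader"
```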
- -### Monitoring - -**Check vLLM logs**: -```bash -ssh gpu-pivoine "tail -f /workspace/vllm.log" -``` - -**Check GPU usage**: -```bash -ssh gpu-pivoine "nvidia-smi" -``` - -**Check Tailscale status**: -```bash -ssh gpu-pivoine "tailscale status" -``` - -**Test API locally (on GPU server)**: -```bash -ssh gpu-pivoine "curl http://localhost:8000/v1/models" -``` - -**Test API via Tailscale (from VPS)**: -```bash -curl http://100.100.108.13:8000/v1/models -``` - -### LiteLLM Integration - -Update VPS LiteLLM config at `ai/litellm-config-gpu.yaml`: - -```yaml -# Replace old WireGuard IP (10.8.0.2) with Tailscale IP -- model_name: qwen-2.5-7b - litellm_params: - model: openai/qwen-2.5-7b - api_base: http://100.100.108.13:8000/v1 # Tailscale IP - api_key: dummy - rpm: 1000 - tpm: 100000 -``` - -Restart LiteLLM: -```bash -arty restart litellm -``` - -### Troubleshooting - -**Server not responding**: -1. Check if process is running: `pgrep -f simple_vllm_server` -2. Check logs: `tail -100 /workspace/vllm.log` -3. Check GPU availability: `nvidia-smi` -4. Restart server: `pkill -f simple_vllm_server && python3 /workspace/simple_vllm_server.py &` - -**Tailscale not connected**: -1. Check status: `tailscale status` -2. Check daemon: `ps aux | grep tailscaled` -3. Restart: `tailscale down && tailscale up` - -**Model download failing**: -1. Check disk space: `df -h` -2. Check cache directory: `ls -lah /workspace/huggingface_cache` -3. Clear cache if needed: `rm -rf /workspace/huggingface_cache/*` - -### Deployment Status ✅ COMPLETE - -**Deployment Date**: 2025-11-21 - -1. ✅ Deploy vLLM with Qwen 2.5 7B - COMPLETE -2. ✅ Test API endpoints locally and via Tailscale - COMPLETE -3. ✅ Update VPS LiteLLM configuration - COMPLETE -4. ✅ Test end-to-end: Open WebUI → LiteLLM → vLLM - COMPLETE -5. ⏳ Monitor performance and costs - ONGOING - -**Model Available**: `qwen-2.5-7b` visible in Open WebUI at https://ai.pivoine.art - -### Next Steps (2025-11-21 Original) -6. ✅ Consider adding more models → COMPLETE (added Flux.1 Schnell + MusicGen Medium) -7. ⏹️ Set up auto-stop for idle periods to save costs - ---- - -## Multi-Modal Architecture (2025-11-21 Update) - -### Overview - -Expanded GPU deployment to support **text, image, and music generation** with intelligent model orchestration. All models run sequentially on a single RTX 4090 GPU with automatic switching based on request type. - -### Architecture Components - -#### 1. **Orchestrator Service** (Port 9000 - Always Running) -- **Location**: `ai/model-orchestrator/` -- **Purpose**: Central service managing model lifecycle -- **Features**: - - Detects request type (text/image/audio) - - Automatically unloads current model - - Loads requested model - - Proxies requests to active model - - Tracks GPU memory usage -- **Technology**: FastAPI + Docker SDK Python -- **Endpoints**: - - `POST /v1/chat/completions` → Routes to text models - - `POST /v1/images/generations` → Routes to image models - - `POST /v1/audio/generations` → Routes to music models - - `GET /health` → Shows active model and status - - `GET /models` → Lists all available models - - `POST /switch` → Manually switch models - -#### 2. **Text Generation** (vLLM + Qwen 2.5 7B) -- **Service**: `vllm-qwen` (Port 8001) -- **Location**: `ai/vllm/` -- **Model**: Qwen/Qwen2.5-7B-Instruct -- **VRAM**: 14GB (85% GPU utilization) -- **Speed**: ~50 tokens/second -- **Startup**: 120 seconds -- **Status**: ✅ Working (same as original deployment) - -#### 3. 
**Image Generation** (Flux.1 Schnell) -- **Service**: `flux` (Port 8002) -- **Location**: `ai/flux/` -- **Model**: black-forest-labs/FLUX.1-schnell -- **VRAM**: 14GB with CPU offloading -- **Speed**: 4-5 seconds per image -- **Startup**: 60 seconds -- **Features**: OpenAI DALL-E compatible API -- **Image**: `ghcr.io/matatonic/openedai-images-flux:latest` - -#### 4. **Music Generation** (MusicGen Medium) -- **Service**: `musicgen` (Port 8003) -- **Location**: `ai/musicgen/` -- **Model**: facebook/musicgen-medium -- **VRAM**: 11GB -- **Speed**: 60-90 seconds for 30 seconds of audio -- **Startup**: 45 seconds -- **Features**: Text-to-music generation with sampling controls -- **Technology**: Meta's AudioCraft + custom FastAPI wrapper - -### Model Registry (`models.yaml`) - -Simple configuration file for managing all models: - -```yaml -models: - qwen-2.5-7b: - type: text - framework: vllm - docker_service: vllm-qwen - port: 8001 - vram_gb: 14 - startup_time_seconds: 120 - endpoint: /v1/chat/completions - - flux-schnell: - type: image - framework: openedai-images - docker_service: flux - port: 8002 - vram_gb: 14 - startup_time_seconds: 60 - endpoint: /v1/images/generations - - musicgen-medium: - type: audio - framework: audiocraft - docker_service: musicgen - port: 8003 - vram_gb: 11 - startup_time_seconds: 45 - endpoint: /v1/audio/generations -``` - -**Adding new models**: Just add a new entry to this file and define the Docker service. - -### Deployment Changes - -#### Docker Compose Structure -- **File**: `docker-compose.gpu.yaml` -- **Services**: 4 total (1 orchestrator + 3 models) -- **Profiles**: `text`, `image`, `audio` (orchestrator manages activation) -- **Restart Policy**: `no` for models (orchestrator controls lifecycle) -- **Volumes**: All model caches on `/workspace` (922TB network volume) - -#### LiteLLM Integration -Updated `litellm-config.yaml` to route all self-hosted models through orchestrator: - -```yaml -# Text -- model_name: qwen-2.5-7b - api_base: http://100.100.108.13:9000/v1 # Orchestrator - -# Image -- model_name: flux-schnell - api_base: http://100.100.108.13:9000/v1 # Orchestrator - -# Music -- model_name: musicgen-medium - api_base: http://100.100.108.13:9000/v1 # Orchestrator -``` - -All models now available via Open WebUI at https://ai.pivoine.art - -### Usage Examples - -**Text Generation**: -```bash -curl http://100.100.108.13:9000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "qwen-2.5-7b", "messages": [{"role": "user", "content": "Hello"}]}' -``` - -**Image Generation**: -```bash -curl http://100.100.108.13:9000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell", "prompt": "a cute cat", "size": "1024x1024"}' -``` - -**Music Generation**: -```bash -curl http://100.100.108.13:9000/v1/audio/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "musicgen-medium", "prompt": "upbeat electronic", "duration": 30}' -``` - -### Deployment Commands - -```bash -# Copy all files to RunPod -scp -r ai/* gpu-pivoine:/workspace/ai/ - -# SSH to GPU server -ssh gpu-pivoine -cd /workspace/ai/ - -# Start orchestrator (manages everything) -docker compose -f docker-compose.gpu.yaml up -d orchestrator - -# Check status -curl http://100.100.108.13:9000/health - -# View logs -docker logs -f ai_orchestrator - -# Manually switch models (optional) -curl -X POST http://100.100.108.13:9000/switch \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell"}' -``` - -### Performance 
Characteristics - -| Model | VRAM | Startup Time | Generation Time | Notes | -|-------|------|--------------|-----------------|-------| -| Qwen 2.5 7B | 14GB | 120s | ~50 tok/sec | Fast text generation | -| Flux.1 Schnell | 14GB | 60s | 4-5s/image | High-quality images | -| MusicGen Medium | 11GB | 45s | 60-90s for 30s audio | Text-to-music | - -**Model Switching Overhead**: 30-120 seconds (unload + load) - -### Cost Analysis - -**Current (Single GPU Sequential)**: -- Cost: ~$0.50/hour -- Monthly: ~$360 (24/7) or ~$120 (8hr/day) -- Trade-off: 30-120s switching time - -**Alternative (Multi-GPU Concurrent)**: -- Cost: ~$0.75/hour (+50%) -- Monthly: ~$540 (24/7) or ~$180 (8hr/day) -- Benefit: No switching time, all models always available - -**Decision**: Stick with single GPU for cost optimization. Switching time is acceptable for most use cases. - -### Known Limitations - -1. **Sequential Only**: Only one model active at a time -2. **Switching Latency**: 30-120 seconds to change models -3. **MusicGen License**: Pre-trained weights are CC-BY-NC (non-commercial) -4. **Spot Instance Volatility**: Pod can be terminated anytime - -### Monitoring - -**Check active model**: -```bash -curl http://100.100.108.13:9000/health | jq '{model: .current_model, vram: .model_info.vram_gb}' -``` - -**View orchestrator logs**: -```bash -docker logs -f ai_orchestrator -``` - -**GPU usage**: -```bash -ssh gpu-pivoine "nvidia-smi" -``` - -### Deployment Status ✅ COMPLETE (Multi-Modal) - -**Deployment Date**: 2025-11-21 - -1. ✅ Create model orchestrator service - COMPLETE -2. ✅ Deploy vLLM text generation (Qwen 2.5 7B) - COMPLETE -3. ✅ Deploy Flux.1 Schnell image generation - COMPLETE -4. ✅ Deploy MusicGen Medium music generation - COMPLETE -5. ✅ Update LiteLLM configuration - COMPLETE -6. ✅ Test all three model types via orchestrator - READY FOR TESTING -7. ⏳ Monitor performance and costs - ONGOING - -**Models Available**: `qwen-2.5-7b`, `flux-schnell`, `musicgen-medium` via Open WebUI - -### Future Model Additions - -**Easy to add** (just edit `models.yaml`): -- Llama 3.1 8B Instruct (text, gated model) -- Whisper Large v3 (speech-to-text) -- XTTS v2 (text-to-speech) -- Stable Diffusion XL (alternative image generation) - -See `README.md` for detailed instructions on adding new models. - -### Cost Optimization Ideas -1. **Auto-stop**: Configure RunPod to auto-stop after 30 minutes idle -2. **Spot Instances**: Already using Spot for 50% cost reduction -3. **Scheduled Operation**: Run only during business hours (8 hours/day = $120/month) -4. **Smaller Models**: Use Mistral 7B or quantized models for lighter workloads -5. **Pay-as-you-go**: Manually start/stop pod as needed - -### Performance Benchmarks -*To be measured after deployment* - -Expected (based on RTX 4090): -- Qwen 2.5 7B: 50-80 tokens/second -- Context processing: ~2-3 seconds for 1000 tokens -- First token latency: ~200-300ms diff --git a/ai/GPU_EXPANSION_PLAN.md b/ai/GPU_EXPANSION_PLAN.md deleted file mode 100644 index d34ea01..0000000 --- a/ai/GPU_EXPANSION_PLAN.md +++ /dev/null @@ -1,1306 +0,0 @@ -# GPU-Enhanced AI Stack Expansion Plan - -## Executive Summary - -This document outlines a comprehensive plan to extend the current AI stack (LiteLLM, Open WebUI, Crawl4AI) with dedicated GPU hosting capabilities for: -- **LLM Model Hosting**: Self-hosted models (Llama, Mistral, Qwen, etc.) 
-- **Model Training**: Fine-tuning and training workflows -- **Image Generation**: Stable Diffusion, FLUX via ComfyUI -- **Video Generation**: AnimateDiff, CogVideo, etc. - -**Current Architecture**: CPU-based stack on pivoine.art VPS → Claude API via LiteLLM -**Target Architecture**: Hybrid stack with GPU server(s) for self-hosted models + API-based models - ---- - -## Phase 1: Current Stack Analysis - -### Existing Components - -1. **ai_postgres** (pgvector/pgvector:pg16) - - PostgreSQL with pgvector for RAG - - Stores: conversations, embeddings, LiteLLM logs - -2. **webui** (Open WebUI) - - User-facing ChatGPT-like interface - - URL: https://ai.pivoine.art - - Features: RAG, web search, document upload - - Connected to LiteLLM proxy - -3. **litellm** (LiteLLM proxy) - - Currently proxies Anthropic Claude API - - OpenAI-compatible endpoint at http://litellm:4000 - - Supports multiple providers via config - -4. **crawl4ai** - - Internal web scraping for LLM content prep - - Port 11235 (internal only) - -5. **facefusion** (CPU-only) - - Face swapping/enhancement - - Currently CPU-based (slow) - - Protected by Authelia SSO - -### Current Limitations - -- ❌ No self-hosted LLMs (relies on expensive API calls) -- ❌ No GPU acceleration for facefusion -- ❌ No image generation capabilities -- ❌ No model training/fine-tuning capabilities -- ❌ No video generation -- ❌ High operational costs for API usage - ---- - -## Phase 2: GPU Provider Comparison - -### Provider Options - -#### 1. **RunPod** ⭐ RECOMMENDED -**Pros:** -- Pay-per-second GPU billing -- Wide GPU selection (RTX 4090, A100, H100) -- Docker-first platform -- Global locations -- Easy HTTP/SSH tunneling -- Volume persistence - -**Pricing (Approximate):** -- RTX 4090 (24GB): ~$0.50/hour ($360/month 24/7) -- RTX 3090 (24GB): ~$0.35/hour ($250/month) -- A6000 (48GB): ~$0.80/hour ($576/month) -- A100 (40GB): ~$1.50/hour ($1,080/month) - -**Best for:** On-demand workloads, experimentation, cost-conscious hosting - ---- - -#### 2. **Lambda Labs** -**Pros:** -- Flat monthly pricing -- High-end GPUs (A100, H100) -- Jupyter notebooks included -- Fast network - -**Pricing:** -- 1x A100 (40GB): $1.10/hour ($792/month) -- 8x A100 (40GB): $8.00/hour (~$5,760/month) - -**Best for:** Research, high-utilization workloads - ---- - -#### 3. **Vast.ai** -**Pros:** -- Marketplace model (cheapest) -- Many GPU options -- Spot pricing available - -**Cons:** -- Variable reliability -- Setup complexity -- Community-hosted machines - -**Pricing:** -- RTX 4090: ~$0.25-0.40/hour -- A100: ~$0.80-1.20/hour - -**Best for:** Budget-conscious, experimental workloads - ---- - -#### 4. **Google Cloud Platform (GCP)** -**Pros:** -- Enterprise reliability -- Auto-scaling -- Integration with Google services -- Preemptible instances available - -**Pricing:** -- T4 (16GB): ~$0.35/hour -- V100 (16GB): ~$2.48/hour -- A100 (40GB): ~$2.93/hour -- TPU options available - -**Best for:** Enterprise workloads, auto-scaling needs - ---- - -#### 5. **AWS** -**Pros:** -- Global infrastructure -- Broad GPU selection -- Spot instances for cost savings -- Enterprise support - -**Pricing:** -- g4dn.xlarge (T4 16GB): ~$0.526/hour -- p3.2xlarge (V100 16GB): ~$3.06/hour -- p4d.24xlarge (8x A100 40GB): ~$32.77/hour - -**Best for:** Enterprise, existing AWS infrastructure - ---- - -#### 6. 
**Hugging Face Spaces / Inference Endpoints** -**Pros:** -- Managed model hosting -- Auto-scaling -- Simple deployment -- Community models - -**Pricing:** -- CPU: $0.03/hour -- T4: $0.60/hour -- A10G: $1.00/hour -- A100: $4.00/hour - -**Best for:** Quick model deployment, serverless inference - ---- - -### Recommendation: **RunPod** for Primary GPU Server - -**Rationale:** -1. **Cost-effective**: Pay-per-second billing, ~$0.50/hour for RTX 4090 -2. **Docker-native**: Easy integration with existing compose stack -3. **Flexibility**: Start/stop as needed, scale up for training -4. **Community**: Large user base, good documentation -5. **Network**: Built-in HTTP/SSH tunneling - -**Supplementary**: Use Hugging Face for specific model hosting if needed - ---- - -## Phase 3: Architecture Design - -### Network Topology - -``` -┌─────────────────────────────────────────────────────────────┐ -│ pivoine.art VPS (CPU-based) │ -├─────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ -│ │ Open │─────▶│ LiteLLM │◀────▶│ ai_ │ │ -│ │ WebUI │ │ Proxy │ │ postgres │ │ -│ └──────────┘ └──────────┘ └──────────┘ │ -│ │ │ │ -│ │ │ │ -└───────┼──────────────────┼──────────────────────────────────┘ - │ │ - │ ▼ - │ ┌─────────────────┐ - │ │ Anthropic API │ - │ │ (Claude) │ - │ └─────────────────┘ - │ - ▼ -┌────────────────────────────────────────────────────────────┐ -│ GPU Server (RunPod) │ -├────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ -│ │ vLLM │ │ ComfyUI │ │ Model │ │ JupyterLab│ │ -│ │ (LLMs) │ │ (SD/FLUX)│ │ Training │ │ │ │ -│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ -│ │ │ │ │ │ -│ └──────────────┴─────────────┴──────────────┘ │ -│ │ │ -│ ┌───────────────┐ │ -│ │ Model Storage │ │ -│ │ (Persistent) │ │ -│ └───────────────┘ │ -│ │ -└────────────────────────────────────────────────────────────┘ - │ - ▼ (Tunneled via WireGuard or Tailscale) -┌────────────────────────────────────────────────────────────┐ -│ Integration Options: │ -├────────────────────────────────────────────────────────────┤ -│ 1. LiteLLM adds vLLM endpoint (http://gpu.internal:8000) │ -│ 2. ComfyUI exposed via subdomain (comfy.ai.pivoine.art) │ -│ 3. Model storage synced via rclone/restic │ -└────────────────────────────────────────────────────────────┘ -``` - -### Connection Methods - -#### Option A: WireGuard VPN (RECOMMENDED) -- Create WireGuard tunnel between VPS and GPU server -- GPU services accessible via private IPs -- Secure, low overhead, easy to manage -- Already have wg-easy in your stack - -**Setup:** -1. Deploy WireGuard on GPU server -2. Add GPU server as VPN peer -3. Configure LiteLLM to use VPN IPs - -#### Option B: SSH Tunnel -- SSH reverse tunnel from GPU to VPS -- Simple, no additional software -- Higher latency - -#### Option C: Tailscale -- Zero-config VPN mesh -- Easy setup, good UX -- Proprietary (but free tier available) - ---- - -## Phase 4: Service Implementation Plans - -### 4.1 LLM Hosting with vLLM - -**vLLM** is the industry-standard for high-performance LLM inference. 
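Because it exposes the OpenAI wire format, existing clients only need a new base URL. As a rough sketch of the kind of request LiteLLM will forward once the service below is running (model name taken from the compose command, host assumed to be the GPU server itself):

```bash
# Chat completion against the self-hosted vLLM endpoint (run on the GPU server)
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "Explain PagedAttention in one sentence."}],
    "max_tokens": 64
  }'
```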
- -#### Features: -- PagedAttention for efficient KV cache -- Continuous batching -- OpenAI-compatible API -- Tensor parallelism for multi-GPU -- Quantization support (AWQ, GPTQ) - -#### Docker Compose Configuration: - -```yaml -services: - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: 0 - volumes: - - vllm_models:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct # or any model - - --host - - 0.0.0.0 - - --port - - 8000 - - --tensor-parallel-size - - '1' - - --gpu-memory-utilization - - '0.9' - - --max-model-len - - '8192' - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Recommended Models for RTX 4090 (24GB): - -**Text Generation:** -- Llama 3.1 8B Instruct (8GB VRAM, fast) -- Qwen2.5 14B Instruct (14GB VRAM, multilingual) -- Mistral 7B Instruct v0.3 (7GB VRAM) -- Nous Hermes 2 Mixtral 8x7B (with quantization, 16GB) - -**Code:** -- DeepSeek Coder 6.7B (7GB VRAM) -- CodeLlama 13B (13GB VRAM) -- Qwen2.5-Coder 14B (14GB VRAM) - -#### Integration with LiteLLM: - -Add to `ai/litellm-config.yaml`: - -```yaml -model_list: - # Existing Anthropic - - model_name: claude-sonnet-4-5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - # New vLLM models - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://gpu.internal:8000/v1 - api_key: dummy - - - model_name: qwen-2.5-14b - litellm_params: - model: openai/Qwen/Qwen2.5-14B-Instruct - api_base: http://gpu.internal:8000/v1 - api_key: dummy -``` - ---- - -### 4.2 ComfyUI for Image/Video Generation - -**ComfyUI** is a node-based UI for Stable Diffusion with advanced workflows. - -#### Features: -- Node-based workflow editor -- Support for SD 1.5, SDXL, SD3, FLUX -- ControlNet, LoRA, embeddings -- Video generation (AnimateDiff, SVD) -- API for automation - -#### Docker Compose Configuration: - -```yaml -services: - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - # Custom nodes auto-install - COMFYUI_FLAGS: --listen 0.0.0.0 --port 8188 - volumes: - - comfyui_data:/data - - comfyui_models:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - ports: - - "8188:8188" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Model Downloads (via ComfyUI Manager): - -**Stable Diffusion Models:** -- FLUX.1-dev (12GB, newest, best quality) -- FLUX.1-schnell (12GB, fast) -- SDXL Base 1.0 (6.9GB) -- SD 1.5 (4GB, fast, wide LoRA support) - -**ControlNet Models:** -- controlnet-canny-sdxl -- controlnet-depth-sdxl -- controlnet-openpose-sdxl - -**LoRA Models** (download from Civitai): -- Style LoRAs (anime, realistic, etc.) 
-- Character LoRAs -- Concept LoRAs - -#### Traefik Integration: - -Add subdomain routing for ComfyUI: - -```yaml -labels: - - 'traefik.enable=true' - - 'traefik.http.routers.comfyui-web-secure.rule=Host(`comfy.ai.pivoine.art`)' - - 'traefik.http.routers.comfyui-web-secure.tls.certresolver=resolver' - - 'traefik.http.routers.comfyui-web-secure.entrypoints=web-secure' - - 'traefik.http.routers.comfyui-web-secure.middlewares=net-authelia,security-headers@file' - - 'traefik.http.services.comfyui.loadbalancer.server.port=8188' -``` - -#### Open WebUI Integration: - -ComfyUI has a REST API that can be called from Open WebUI using function calling. - -Example workflow API call: -```python -import requests - -def generate_image(prompt: str, negative_prompt: str = ""): - workflow = { - # ComfyUI workflow JSON - } - response = requests.post( - "http://comfyui:8188/prompt", - json={"prompt": workflow} - ) - return response.json() -``` - ---- - -### 4.3 Model Training Infrastructure - -For fine-tuning LLMs and training custom models. - -#### Option A: Axolotl (Recommended) - -**Axolotl** is a user-friendly fine-tuning framework supporting: -- LoRA, QLoRA -- Full fine-tuning -- RLHF/DPO -- Multi-GPU training - -```yaml -services: - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - WANDB_API_KEY: ${WANDB_API_KEY:-} # Optional: Weights & Biases logging - command: | - bash -c " - accelerate launch -m axolotl.cli.train /workspace/configs/config.yaml - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Training Workflow: -1. Prepare dataset (JSONL format) -2. Create Axolotl config (LoRA, batch size, epochs) -3. Start training container -4. Monitor via Weights & Biases or TensorBoard -5. Export LoRA adapters -6. 
Merge with base model or use in vLLM - -#### Example Config: -```yaml -# training/configs/lora-llama3.yaml -base_model: meta-llama/Meta-Llama-3.1-8B-Instruct -model_type: AutoModelForCausalLM -tokenizer_type: AutoTokenizer - -load_in_8bit: false -load_in_4bit: true -strict: false - -datasets: - - path: /workspace/data/train.jsonl - type: completion - field: text - -output_dir: /workspace/output/llama3-lora - -adapter: lora -lora_r: 16 -lora_alpha: 32 -lora_dropout: 0.05 -lora_target_modules: - - q_proj - - v_proj - - k_proj - - o_proj - -gradient_accumulation_steps: 4 -micro_batch_size: 2 -num_epochs: 3 -learning_rate: 0.0002 - -optimizer: adamw_bnb_8bit -lr_scheduler: cosine -warmup_steps: 100 -``` - -#### Option B: JupyterLab for Custom Training - -For research and custom training scripts: - -```yaml -services: - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - runtime: nvidia - volumes: - - ./notebooks:/workspace - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - command: | - bash -c " - pip install jupyterlab transformers datasets accelerate bitsandbytes peft && - jupyter lab --ip=0.0.0.0 --allow-root --no-browser --NotebookApp.token='' - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - ---- - -### 4.4 Model Storage Strategy - -#### Storage Requirements: - -**Per Model Type:** -- LLM 7B: ~14GB (FP16) -- LLM 13B: ~26GB -- SDXL: ~7GB -- FLUX: ~12GB -- ControlNet: ~2.5GB each -- LoRA: ~100-500MB each - -**Total Estimated:** -- 3-4 LLMs: ~80GB -- SD models + LoRAs: ~50GB -- Training checkpoints: ~100GB -- **Total: 250-300GB minimum** - -#### RunPod Storage Options: - -1. **Network Volume** (Recommended) - - Persistent across pod restarts - - Shared between multiple pods - - ~$0.10/GB/month - - 500GB = $50/month - -2. **Container Disk** - - Included with pod - - Lost when pod stops - - Good for temporary storage - -3. 
**External Storage (rclone)** - - Sync to/from VPS or cloud storage - - Backup models to Backblaze B2 or Wasabi - - Good for disaster recovery - -#### Model Management: - -Use **Hugging Face Hub** as model cache: - -```bash -# Download models on first run -huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct \ - --local-dir /models/llama-3.1-8b - -# Or let vLLM/ComfyUI auto-download -``` - -**Model Sync Script:** -```bash -#!/bin/bash -# sync-models.sh - Sync models from VPS to GPU server - -rclone sync \ - /mnt/hidrive/AI/models \ - gpu:/workspace/models \ - --progress \ - --transfers 4 -``` - ---- - -## Phase 5: Implementation Roadmap - -### Week 1: Infrastructure Setup - -**Day 1-2: RunPod Account & GPU Server** -- [ ] Create RunPod account -- [ ] Deploy RTX 4090 pod with Ubuntu 22.04 + PyTorch template -- [ ] Configure persistent network volume (500GB) -- [ ] Set up SSH access - -**Day 3-4: Network Configuration** -- [ ] Deploy WireGuard on GPU server -- [ ] Add GPU server as peer to existing VPN (vpn/compose.yaml) -- [ ] Test connectivity between VPS and GPU server -- [ ] Configure firewall rules - -**Day 5: Docker Setup on GPU Server** -- [ ] Install Docker + NVIDIA Container Toolkit -- [ ] Create docker-compose.yaml for GPU services -- [ ] Test GPU access in containers - ---- - -### Week 2: LLM Hosting - -**Day 1-2: vLLM Deployment** -- [ ] Deploy vLLM container -- [ ] Download Llama 3.1 8B Instruct -- [ ] Test inference locally -- [ ] Benchmark performance (tokens/sec) - -**Day 3-4: LiteLLM Integration** -- [ ] Update litellm-config.yaml with vLLM endpoint -- [ ] Test via Open WebUI -- [ ] Configure model routing (cheap models → vLLM, complex → Claude) -- [ ] Set up usage monitoring - -**Day 5: Model Expansion** -- [ ] Download Qwen 2.5 14B -- [ ] Download Mistral 7B Instruct -- [ ] Test model switching in Open WebUI -- [ ] Document performance characteristics - ---- - -### Week 3: Image Generation - -**Day 1-2: ComfyUI Setup** -- [ ] Deploy ComfyUI container -- [ ] Download FLUX.1-schnell -- [ ] Download SDXL -- [ ] Install ComfyUI Manager - -**Day 3-4: Model Downloads** -- [ ] Download ControlNet models -- [ ] Download VAE models -- [ ] Download popular LoRAs from Civitai -- [ ] Organize model directory - -**Day 5: Integration & Workflows** -- [ ] Create basic text-to-image workflow -- [ ] Create ControlNet workflow -- [ ] Test API access -- [ ] Add Traefik subdomain (comfy.ai.pivoine.art) - ---- - -### Week 4: Training Infrastructure - -**Day 1-2: Axolotl Setup** -- [ ] Deploy Axolotl container -- [ ] Create sample dataset -- [ ] Test LoRA fine-tuning with tiny model -- [ ] Verify GPU utilization - -**Day 3-4: JupyterLab Setup** -- [ ] Deploy JupyterLab container -- [ ] Install ML libraries -- [ ] Create example notebooks -- [ ] Test custom training scripts - -**Day 5: Documentation & Testing** -- [ ] Write training guides -- [ ] Test end-to-end workflows -- [ ] Benchmark training speeds -- [ ] Document best practices - ---- - -### Ongoing: Optimization & Expansion - -**Month 2:** -- Monitor costs and optimize GPU utilization -- Implement model caching strategies -- Add more models based on usage patterns -- Set up automated model updates -- Implement usage quotas per user - -**Month 3+:** -- Consider multi-GPU setup for larger models -- Implement model quantization (AWQ/GPTQ) -- Add video generation (AnimateDiff, CogVideo) -- Explore voice synthesis (XTTS, Bark) -- Custom model training for specific use cases - ---- - -## Phase 6: Cost Analysis - -### Scenario A: 
Single RTX 4090 (24/7) - -**GPU Server (RunPod):** -- RTX 4090 pod: $0.50/hour × 720 hours = $360/month -- 500GB network volume: $50/month -- **Subtotal: $410/month** - -**VPS (Existing):** -- No change in cost - -**Total: ~$410/month** - -**Savings:** -- Claude API costs reduced by ~80% (self-hosted for routine tasks) -- Break-even if currently spending >$500/month on API calls - ---- - -### Scenario B: Pay-as-you-go (8 hours/day) - -**GPU Server (RunPod):** -- RTX 4090: $0.50/hour × 8 hours × 30 days = $120/month -- Storage: $50/month -- **Subtotal: $170/month** - -**Best for:** -- Development/experimentation -- Burst workloads -- Image generation on-demand - ---- - -### Scenario C: Dual GPU (Training + Inference) - -**GPU Server 1 (Inference):** -- RTX 4090 24/7: $360/month - -**GPU Server 2 (Training - On-demand):** -- A100 40GB: $1.50/hour × 40 hours/month = $60/month -- Used only for fine-tuning sessions - -**Storage:** -- 1TB network volume: $100/month - -**Total: ~$520/month** - ---- - -### Cost Optimization Tips - -1. **Auto-stop idle pods**: RunPod can auto-stop after X minutes idle -2. **Use spot instances**: ~50% cheaper but can be interrupted -3. **Quantized models**: 4-bit models use 4x less VRAM → cheaper GPUs -4. **Batch processing**: Queue image gen jobs to maximize GPU usage -5. **Model sharing**: One vLLM instance can serve multiple models via adapters -6. **Monitoring**: Track per-model costs to optimize routing - ---- - -## Phase 7: Monitoring & Operations - -### Metrics to Track - -**GPU Utilization:** -- nvidia-smi metrics (utilization %, memory usage) -- Temperature and power draw -- Per-process GPU usage - -**Model Performance:** -- Tokens per second (LLM inference) -- Images per second (SD/FLUX) -- Training time per epoch - -**Costs:** -- GPU hours consumed -- Storage usage -- API vs self-hosted breakdown - -### Monitoring Stack - -**Option A: Netdata (Already deployed)** - -Add GPU monitoring to existing Netdata: - -```yaml -# On GPU server -services: - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - command: | - bash -c " - # Enable nvidia_smi plugin - /usr/libexec/netdata/plugins.d/charts.d.plugin nvidia_smi - " -``` - -**Option B: Prometheus + Grafana** - -For detailed metrics: - -```yaml -services: - prometheus: - image: prom/prometheus:latest - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus_data:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - dcgm-exporter: - image: nvidia/dcgm-exporter:latest - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - - grafana: - image: grafana/grafana:latest - ports: - - "3000:3000" - volumes: - - grafana_data:/var/lib/grafana -``` - -Import Grafana dashboard #12219 for GPU metrics. - ---- - -## Phase 8: Backup & Disaster Recovery - -### What to Backup - -1. **Models** (250-300GB) - - Base models can be re-downloaded - - Custom fine-tuned models: CRITICAL - - LoRAs: CRITICAL - -2. **Training Data** (~10-50GB) - - Datasets - - Preprocessing scripts - -3. 
**Configurations** (<1GB) - - Docker compose files - - Training configs - - Workflow JSONs - -### Backup Strategy - -**Tier 1: Critical (Daily)** -- Fine-tuned models -- Training checkpoints -- Custom datasets - -**Backup to:** -- Restic → HiDrive (already configured) -- Backblaze B2 (~$6/TB/month) - -```bash -# Add to core/compose.yaml backrest config -- gpu_models:/volumes/gpu_models:ro -- gpu_checkpoints:/volumes/gpu_checkpoints:ro -``` - -**Tier 2: Nice-to-have (Weekly)** -- Base models (can re-download) -- ComfyUI outputs - -**Tier 3: Ephemeral (No backup)** -- Inference cache -- Temporary generations - ---- - -## Phase 9: Security Considerations - -### GPU Server Security - -1. **Firewall:** - - Only allow WireGuard port (51820) - - All services accessed via VPN - - No public exposure - -2. **SSH:** - - Key-based auth only - - Disable password auth - - Change default port - -3. **Docker:** - - Rootless Docker (optional but recommended) - - Limited container capabilities - - No privileged containers except for nvidia-runtime - -4. **Secrets:** - - Store API keys in .env - - Use Docker secrets for sensitive data - - Rotate keys periodically - -### Access Control - -- **ComfyUI**: Protected by Authelia SSO (already configured) -- **vLLM**: Internal only, accessed via LiteLLM proxy -- **JupyterLab**: Password-protected or Authelia -- **Training**: No public access, VPN only - ---- - -## Phase 10: Advanced Features (Future) - -### Multi-GPU Scaling - -**Tensor Parallelism** (vLLM): -- Split large models across multiple GPUs -- Example: 70B model on 2x A100s - -```yaml -command: - - --model - - meta-llama/Meta-Llama-3.1-70B-Instruct - - --tensor-parallel-size - - '2' # Use 2 GPUs -``` - -**Pipeline Parallelism** (training): -- Split model layers across GPUs -- Useful for very large models - -### Model Serving Optimization - -**vLLM Features:** -- Speculative decoding (faster generation) -- Prefix caching (faster for repeated prompts) -- Multi-LoRA serving (multiple adapters, one base model) - -**Example multi-LoRA:** -```yaml -command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct - - --enable-lora - - --max-loras - - '4' - - --lora-modules - - customer-support=/models/loras/support-lora - - creative-writing=/models/loras/writing-lora -``` - -### Video Generation - -**AnimateDiff in ComfyUI:** -- Generate short videos from text prompts -- Animate static images -- ~8GB VRAM for 512x512 16-frame videos - -**CogVideo:** -- High-quality video generation -- Requires A100 or H100 -- 5-second clips at 720p - -### Voice Synthesis - -**XTTS v2:** -- High-quality voice cloning -- Multi-language support -- ~6GB VRAM - -**Bark:** -- Text-to-speech with emotions -- Sound effects -- ~10GB VRAM - ---- - -## Appendix A: Quick Start Commands - -### Initial GPU Server Setup - -```bash -# SSH into RunPod instance -ssh root@gpu.runpod.io -p 12345 - -# Install Docker -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# Install NVIDIA Container Toolkit -distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg -curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list -apt-get update -apt-get install -y nvidia-container-toolkit -systemctl restart docker - -# Test GPU access -docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -``` - -### Deploy vLLM (Quick Test) - -```bash -# Create directory -mkdir -p /workspace/vllm -cd /workspace/vllm - -# Run vLLM -docker run -d \ - --name vllm \ - --runtime=nvidia \ - --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - vllm/vllm-openai:latest \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ - --dtype auto \ - --max-model-len 8192 - -# Test inference -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": "Once upon a time", - "max_tokens": 50 - }' -``` - -### Deploy ComfyUI (Quick Test) - -```bash -docker run -d \ - --name comfyui \ - --runtime=nvidia \ - --gpus all \ - -v /workspace/comfyui:/data \ - -p 8188:8188 \ - ghcr.io/ai-dock/comfyui:latest - -# Access at http://gpu-ip:8188 -``` - ---- - -## Appendix B: Sample Docker Compose (Full GPU Stack) - -```yaml -# gpu-server/compose.yaml -version: '3.8' - -services: - # vLLM for LLM inference - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: 0 - volumes: - - vllm_models:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct - - --host - - 0.0.0.0 - - --port - - 8000 - - --gpu-memory-utilization - - '0.9' - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # ComfyUI for image generation - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - comfyui_data:/data - - comfyui_models:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - ports: - - "8188:8188" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # Axolotl for model training - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - # Only start when training - profiles: - - training - - # JupyterLab for research - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - restart: unless-stopped - runtime: nvidia - volumes: - - ./notebooks:/workspace - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - command: | - bash -c " - pip install jupyterlab transformers datasets accelerate bitsandbytes peft && - 
jupyter lab --ip=0.0.0.0 --allow-root --no-browser - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # Netdata monitoring - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - ports: - - "19999:19999" - cap_add: - - SYS_PTRACE - - SYS_ADMIN - security_opt: - - apparmor:unconfined - -volumes: - vllm_models: - comfyui_data: - comfyui_models: - comfyui_output: - training_cache: - jupyter_cache: -``` - ---- - -## Appendix C: Cost Calculator - -**Monthly GPU Costs:** - -| GPU Model | VRAM | $/hour | 24/7 Month | 8hr/day | Use Case | -|-----------|------|--------|------------|---------|----------| -| RTX 3090 | 24GB | $0.35 | $252 | $84 | Development, small models | -| RTX 4090 | 24GB | $0.50 | $360 | $120 | Production inference, SD | -| A6000 | 48GB | $0.80 | $576 | $192 | Large models, training | -| A100 40GB | 40GB | $1.50 | $1,080 | $360 | Enterprise, training | -| A100 80GB | 80GB | $2.50 | $1,800 | $600 | Massive models, research | - -**Storage Costs:** -- Network volume: $0.10/GB/month -- 500GB = $50/month -- 1TB = $100/month - -**Total Estimated Monthly:** -- RTX 4090 + 500GB storage = $410/month (24/7) -- RTX 4090 + 500GB storage = $170/month (8hr/day) - -**Break-even Analysis:** -- If spending >$500/month on API calls → GPU server saves money -- If spending <$200/month → stick with APIs - ---- - -## Appendix D: Model Recommendations by Use Case - -### General Chat (24/7 Inference) -**Best:** Qwen 2.5 14B Instruct -- Excellent multilingual support -- Fast inference -- Good reasoning - -**Alternative:** Mistral 7B Instruct v0.3 -- Fastest inference -- Lower VRAM - -### Code Generation -**Best:** Qwen 2.5 Coder 14B -- SOTA coding performance -- Multi-language support - -**Alternative:** DeepSeek Coder 6.7B -- Faster, lighter - -### Creative Writing -**Best:** Nous Hermes 2 Mixtral 8x7B (quantized) -- Creative, engaging -- Follows instructions well - -### Image Generation (Realistic) -**Best:** FLUX.1-dev -- Highest quality -- Best prompt following - -**Alternative:** SDXL + RealVisXL LoRA -- Faster generation -- Good quality - -### Image Generation (Anime) -**Best:** SDXL + AnimagineXL LoRA -- Anime-specific training -- Vibrant colors - -### Video Generation -**Best:** AnimateDiff + SDXL -- 16-frame clips -- Good quality - -**Needs:** A100 40GB or better - ---- - -## Next Steps - -1. **Review this plan** and provide feedback -2. **Set budget** for GPU infrastructure -3. **Choose provider** (recommend RunPod) -4. **Define priority services** (LLM hosting first? Image gen first?) -5. **Schedule implementation** (4-week timeline above) - -Would you like me to: -- Create the detailed Docker Compose configurations? -- Set up a cost estimation spreadsheet? -- Research specific models for your use cases? -- Begin implementation with Phase 1? - -Let me know how you'd like to proceed! 🚀 diff --git a/ai/README_GPU_SETUP.md b/ai/README_GPU_SETUP.md deleted file mode 100644 index 34974f0..0000000 --- a/ai/README_GPU_SETUP.md +++ /dev/null @@ -1,444 +0,0 @@ -# GPU-Enhanced AI Stack - Implementation Guide - -Welcome to your GPU expansion setup! This directory contains everything you need to deploy a production-ready GPU server for LLM hosting, image generation, and model training. 
- -## 📚 Documentation Files - -### Planning & Architecture -- **`GPU_EXPANSION_PLAN.md`** - Complete 70-page plan with provider comparison, architecture, and roadmap -- **`README_GPU_SETUP.md`** - This file - -### Step-by-Step Setup Guides -1. **`SETUP_GUIDE.md`** - Day 1-2: RunPod account & GPU server deployment -2. **`WIREGUARD_SETUP.md`** - Day 3-4: VPN connection between VPS and GPU server -3. **`DOCKER_GPU_SETUP.md`** - Day 5: Docker + NVIDIA Container Toolkit configuration - -### Configuration Files -- **`gpu-server-compose.yaml`** - Production Docker Compose for GPU server -- **`litellm-config-gpu.yaml`** - Updated LiteLLM config with self-hosted models -- **`deploy-gpu-stack.sh`** - Automated deployment script - ---- - -## 🚀 Quick Start (Week 1 Checklist) - -### Day 1-2: RunPod & GPU Server ✓ -- [ ] Create RunPod account at https://www.runpod.io/ -- [ ] Add billing method ($50 initial credit recommended) -- [ ] Deploy RTX 4090 pod with PyTorch template -- [ ] Configure 500GB network volume -- [ ] Verify SSH access -- [ ] Test GPU with `nvidia-smi` -- [ ] **Guide:** `SETUP_GUIDE.md` - -### Day 3-4: Network Configuration ✓ -- [ ] Install Tailscale on VPS -- [ ] Install Tailscale on GPU server -- [ ] Authenticate both devices -- [ ] Test VPN connectivity -- [ ] Configure firewall rules -- [ ] Verify VPS can reach GPU server -- [ ] **Guide:** `TAILSCALE_SETUP.md` - -### Day 5: Docker & GPU Setup ✓ -- [ ] Install Docker on GPU server -- [ ] Install NVIDIA Container Toolkit -- [ ] Test GPU access in containers -- [ ] Create /workspace/gpu-stack directory -- [ ] Copy configuration files -- [ ] **Guide:** `DOCKER_GPU_SETUP.md` - -### Day 6-7: Deploy Services ✓ -- [ ] Copy `gpu-server-compose.yaml` to GPU server -- [ ] Edit `.env` with your settings -- [ ] Run `./deploy-gpu-stack.sh` -- [ ] Wait for vLLM to load model (~5 minutes) -- [ ] Test vLLM: `curl http://localhost:8000/v1/models` -- [ ] Access ComfyUI: `http://[tailscale-ip]:8188` -- [ ] **Script:** `deploy-gpu-stack.sh` - ---- - -## 📦 Services Included - -### vLLM (http://[tailscale-ip]:8000) -**Purpose:** High-performance LLM inference -**Default Model:** Llama 3.1 8B Instruct -**Performance:** 50-80 tokens/second on RTX 4090 -**Use for:** General chat, Q&A, code generation, summarization - -**Switch models:** -Edit `gpu-server-compose.yaml`, change `--model` parameter, restart: -```bash -docker compose restart vllm -``` - -### ComfyUI (http://[tailscale-ip]:8188) -**Purpose:** Advanced Stable Diffusion interface -**Features:** FLUX, SDXL, ControlNet, LoRA -**Use for:** Image generation, img2img, inpainting - -**Download models:** -Access web UI → ComfyUI Manager → Install Models - -### JupyterLab (http://[tailscale-ip]:8888) -**Purpose:** Interactive development environment -**Token:** `pivoine-ai-2025` (change in `.env`) -**Use for:** Research, experimentation, custom training scripts - -### Axolotl (Training - on-demand) -**Purpose:** LLM fine-tuning framework -**Start:** `docker compose --profile training up -d axolotl` -**Use for:** LoRA training, full fine-tuning, RLHF - -### Netdata (http://[tailscale-ip]:19999) -**Purpose:** System & GPU monitoring -**Features:** Real-time metrics, GPU utilization, memory usage -**Use for:** Performance monitoring, troubleshooting - ---- - -## 🔧 Configuration - -### Environment Variables (.env) - -```bash -# VPN Network (Tailscale) -VPS_IP=100.x.x.x # Your VPS Tailscale IP (get with: tailscale ip -4) -GPU_IP=100.x.x.x # GPU server Tailscale IP (get with: tailscale ip -4) - -# Model Storage 
-MODELS_PATH=/workspace/models - -# Hugging Face Token (for gated models like Llama) -HF_TOKEN=hf_xxxxxxxxxxxxx - -# Weights & Biases (for training logging) -WANDB_API_KEY= - -# JupyterLab Access -JUPYTER_TOKEN=pivoine-ai-2025 - -# PostgreSQL (on VPS) -DB_HOST=100.x.x.x # Your VPS Tailscale IP -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui -``` - -### Updating LiteLLM on VPS - -After GPU server is running, update your VPS LiteLLM config: - -```bash -# On VPS -cd ~/Projects/docker-compose/ai - -# Backup current config -cp litellm-config.yaml litellm-config.yaml.backup - -# Copy new config with GPU models -cp litellm-config-gpu.yaml litellm-config.yaml - -# Restart LiteLLM -arty restart litellm -``` - -Now Open WebUI will have access to both Claude (API) and Llama (self-hosted)! - ---- - -## 💰 Cost Management - -### Current Costs (24/7 Operation) -- **GPU Server:** RTX 4090 @ $0.50/hour = $360/month -- **Storage:** 500GB network volume = $50/month -- **Total:** **$410/month** - -### Cost-Saving Options - -**1. Pay-as-you-go (8 hours/day)** -- GPU: $0.50 × 8 × 30 = $120/month -- Storage: $50/month -- **Total: $170/month** - -**2. Auto-stop idle pods** -RunPod can auto-stop after X minutes idle: -- Dashboard → Pod Settings → Auto-stop after 30 minutes - -**3. Use smaller models** -- Mistral 7B instead of Llama 8B: Faster, cheaper GPU -- Quantized models: 4-bit = 1/4 the VRAM - -**4. Batch image generation** -- Generate multiple images at once -- Use scheduled jobs (cron) during off-peak hours - -### Cost Tracking - -**Check GPU usage:** -```bash -# On RunPod dashboard -Billing → Usage History - -# See hourly costs, total spent -``` - -**Check API vs GPU savings:** -```bash -# On VPS, check LiteLLM logs -docker logs ai_litellm | grep "model=" - -# Count requests to llama-3.1-8b vs claude-* -``` - -**Expected savings:** -- 80% of requests → self-hosted = $0 cost -- 20% of requests → Claude = API cost -- Break-even if currently spending >$500/month on APIs - ---- - -## 🔍 Monitoring & Troubleshooting - -### Check Service Status - -```bash -# On GPU server -cd /workspace/gpu-stack - -# View all services -docker compose ps - -# Check specific service logs -docker compose logs -f vllm -docker compose logs -f comfyui -docker compose logs -f jupyter - -# Check GPU usage -nvidia-smi -# or prettier: -nvtop -``` - -### Common Issues - -**vLLM not loading model:** -```bash -# Check logs -docker compose logs vllm - -# Common causes: -# - Model download in progress (wait 5-10 minutes) -# - Out of VRAM (try smaller model) -# - Missing HF_TOKEN (for gated models like Llama) -``` - -**ComfyUI slow/crashing:** -```bash -# Check GPU memory -nvidia-smi - -# If VRAM full: -# - Close vLLM temporarily -# - Use smaller models -# - Reduce batch size in ComfyUI -``` - -**Can't access from VPS:** -```bash -# Test VPN -ping [tailscale-ip] - -# If fails: -# - Check Tailscale status: tailscale status -# - Restart Tailscale: tailscale down && tailscale up -# - Check firewall: ufw status -``` - -**Docker can't see GPU:** -```bash -# Test GPU access -docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base nvidia-smi - -# If fails: -# - Check NVIDIA driver: nvidia-smi -# - Check nvidia-docker: nvidia-ctk --version -# - Restart Docker: systemctl restart docker -``` - ---- - -## 📊 Performance Benchmarks - -### Expected Performance (RTX 4090) - -**LLM Inference (vLLM):** -- Llama 3.1 8B: 50-80 tokens/second -- Qwen 2.5 14B: 30-50 tokens/second -- Batch size 32: ~1500 tokens/second - -**Image 
Generation (ComfyUI):** -- SDXL (1024×1024): ~4-6 seconds -- FLUX (1024×1024): ~8-12 seconds -- SD 1.5 (512×512): ~1-2 seconds - -**Training (Axolotl):** -- LoRA fine-tuning (8B model): ~3-5 hours for 3 epochs -- Full fine-tuning: Not recommended on 24GB VRAM - ---- - -## 🔐 Security Best Practices - -### Network Security -✅ All services behind Tailscale VPN (end-to-end encrypted) -✅ No public exposure (except RunPod's SSH) -✅ Firewall configured (no additional ports needed) - -### Access Control -✅ JupyterLab password-protected -✅ ComfyUI accessible via VPN only -✅ vLLM internal API (no auth needed) - -### SSH Security -```bash -# On GPU server, harden SSH -nano /etc/ssh/sshd_config - -# Set: -PermitRootLogin prohibit-password -PasswordAuthentication no -PubkeyAuthentication yes - -systemctl restart sshd -``` - -### Regular Updates -```bash -# Weekly updates -apt update && apt upgrade -y - -# Update Docker images -docker compose pull -docker compose up -d -``` - ---- - -## 📈 Scaling Up - -### When to Add More GPUs - -**Current limitations (1× RTX 4090):** -- Can run ONE of these at a time: - - 8B LLM at full speed - - 14B LLM at moderate speed - - SDXL image generation - - Training job - -**Add 2nd GPU if:** -- You want LLM + image gen simultaneously -- Training + inference at same time -- Multiple users with high demand - -**Multi-GPU options:** -- 2× RTX 4090: Run vLLM + ComfyUI separately ($720/month) -- 1× A100 40GB: Larger models (70B with quantization) ($1,080/month) -- Mix: RTX 4090 (inference) + A100 (training) (~$1,300/month) - -### Deploying Larger Models - -**70B models (need 2× A100 or 4× RTX 4090):** -```yaml -# In gpu-server-compose.yaml -vllm: - command: - - --model - - meta-llama/Meta-Llama-3.1-70B-Instruct - - --tensor-parallel-size - - "2" # Split across 2 GPUs - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 2 # Use 2 GPUs - capabilities: [gpu] -``` - ---- - -## 🎯 Next Steps (Week 2+) - -### Week 2: LLM Production Deployment -- [ ] Test Llama 3.1 8B performance -- [ ] Download additional models (Qwen, Mistral) -- [ ] Configure model routing in LiteLLM -- [ ] Set up usage monitoring -- [ ] Benchmark tokens/second for each model - -### Week 3: Image Generation -- [ ] Download FLUX and SDXL models -- [ ] Install ComfyUI Manager -- [ ] Download ControlNet models -- [ ] Create sample workflows -- [ ] Test API integration with Open WebUI - -### Week 4: Training Infrastructure -- [ ] Prepare a sample dataset -- [ ] Test LoRA fine-tuning with Axolotl -- [ ] Set up Weights & Biases logging -- [ ] Create training documentation -- [ ] Benchmark training speed - ---- - -## 🆘 Getting Help - -### Resources -- **RunPod Docs:** https://docs.runpod.io/ -- **vLLM Docs:** https://docs.vllm.ai/ -- **ComfyUI Wiki:** https://github.com/comfyanonymous/ComfyUI/wiki -- **Axolotl Docs:** https://github.com/OpenAccess-AI-Collective/axolotl - -### Community -- **RunPod Discord:** https://discord.gg/runpod -- **vLLM Discord:** https://discord.gg/vllm -- **r/LocalLLaMA:** https://reddit.com/r/LocalLLaMA - -### Support -If you encounter issues: -1. Check logs: `docker compose logs -f [service]` -2. Check GPU: `nvidia-smi` -3. Check VPN: `wg show` -4. Restart service: `docker compose restart [service]` -5. 
Full restart: `docker compose down && docker compose up -d` - ---- - -## ✅ Success Criteria - -You're ready to proceed when: -- [ ] GPU server responds to `ping [tailscale-ip]` from VPS -- [ ] vLLM returns models: `curl http://[tailscale-ip]:8000/v1/models` -- [ ] ComfyUI web interface loads: `http://[tailscale-ip]:8188` -- [ ] JupyterLab accessible with token -- [ ] Netdata shows GPU metrics -- [ ] Open WebUI shows both Claude and Llama models - -**Total setup time:** 4-6 hours (if following guides sequentially) - ---- - -## 🎉 You're All Set! - -Your GPU-enhanced AI stack is ready. You now have: -- ✅ Self-hosted LLM inference (saves $$$) -- ✅ Advanced image generation (FLUX, SDXL) -- ✅ Model training capabilities (LoRA, fine-tuning) -- ✅ Secure VPN connection -- ✅ Full monitoring and logging - -Enjoy building with your new AI infrastructure! 🚀 diff --git a/ai/SETUP_GUIDE.md b/ai/SETUP_GUIDE.md deleted file mode 100644 index 1d14145..0000000 --- a/ai/SETUP_GUIDE.md +++ /dev/null @@ -1,261 +0,0 @@ -# GPU Server Setup Guide - Week 1 - -## Day 1-2: RunPod Account & GPU Server - -### Step 1: Create RunPod Account - -1. **Go to RunPod**: https://www.runpod.io/ -2. **Sign up** with email or GitHub -3. **Add billing method**: - - Credit card required - - No charges until you deploy a pod - - Recommended: Add $50 initial credit - -4. **Verify email** and complete account setup - -### Step 2: Deploy Your First GPU Pod - -#### 2.1 Navigate to Pods - -1. Click **"Deploy"** in top menu -2. Select **"GPU Pods"** - -#### 2.2 Choose GPU Type - -**Recommended: RTX 4090** -- 24GB VRAM -- ~$0.50/hour -- Perfect for LLMs up to 14B params -- Great for SDXL/FLUX - -**Filter options:** -- GPU Type: RTX 4090 -- GPU Count: 1 -- Sort by: Price (lowest first) -- Region: Europe (lower latency to Germany) - -#### 2.3 Select Template - -Choose: **"RunPod PyTorch"** template -- Includes: CUDA, PyTorch, Python -- Pre-configured for GPU workloads -- Docker pre-installed - -**Alternative**: "Ubuntu 22.04 with CUDA 12.1" (more control) - -#### 2.4 Configure Pod - -**Container Settings:** -- **Container Disk**: 50GB (temporary, auto-included) -- **Expose Ports**: - - Add: 22 (SSH) - - Add: 8000 (vLLM) - - Add: 8188 (ComfyUI) - - Add: 8888 (JupyterLab) - -**Volume Settings:** -- Click **"+ Network Volume"** -- **Name**: `gpu-models-storage` -- **Size**: 500GB -- **Region**: Same as pod -- **Cost**: ~$50/month - -**Environment Variables:** -- Add later (not needed for initial setup) - -#### 2.5 Deploy Pod - -1. Review configuration -2. Click **"Deploy On-Demand"** (not Spot for reliability) -3. Wait 2-3 minutes for deployment - -**Expected cost:** -- GPU: $0.50/hour = $360/month (24/7) -- Storage: $50/month -- **Total: $410/month** - -### Step 3: Access Your GPU Server - -#### 3.1 Get Connection Info - -Once deployed, you'll see: -- **Pod ID**: e.g., `abc123def456` -- **SSH Command**: `ssh root@.runpod.io -p 12345` -- **Public IP**: May not be directly accessible (use SSH) - -#### 3.2 SSH Access - -RunPod automatically generates SSH keys for you: - -```bash -# Copy the SSH command from RunPod dashboard -ssh root@abc123def456.runpod.io -p 12345 - -# First time: Accept fingerprint -# You should now be in the GPU server! 
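# Optional sanity checks after connecting (paths assume the RunPod defaults used in this guide)
hostname           # confirm you are on the pod, not your local machine
df -h /workspace   # the 500GB network volume should be mounted here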
-``` - -**Verify GPU:** -```bash -nvidia-smi -``` - -Expected output: -``` -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 535.xx Driver Version: 535.xx CUDA Version: 12.1 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -|===============================+======================+======================| -| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | -| 30% 45C P0 50W / 450W | 0MiB / 24564MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -``` - -### Step 4: Initial Server Configuration - -#### 4.1 Update System - -```bash -# Update package lists -apt update - -# Upgrade existing packages -apt upgrade -y - -# Install essential tools -apt install -y \ - vim \ - htop \ - tmux \ - curl \ - wget \ - git \ - net-tools \ - iptables-persistent -``` - -#### 4.2 Set Timezone - -```bash -timedatectl set-timezone Europe/Berlin -date # Verify -``` - -#### 4.3 Create Working Directory - -```bash -# Create workspace -mkdir -p /workspace/{models,configs,data,scripts} - -# Check network volume mount -ls -la /workspace -# Should show your 500GB volume -``` - -#### 4.4 Configure SSH (Optional but Recommended) - -**Generate your own SSH key on your local machine:** - -```bash -# On your local machine (not GPU server) -ssh-keygen -t ed25519 -C "gpu-server-pivoine" -f ~/.ssh/gpu_pivoine - -# Copy public key to GPU server -ssh-copy-id -i ~/.ssh/gpu_pivoine.pub root@abc123def456.runpod.io -p 12345 -``` - -**Add to your local ~/.ssh/config:** - -```bash -Host gpu-pivoine - HostName abc123def456.runpod.io - Port 12345 - User root - IdentityFile ~/.ssh/gpu_pivoine -``` - -Now you can connect with: `ssh gpu-pivoine` - -### Step 5: Verify GPU Access - -Run this test: - -```bash -# Test CUDA -python3 -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('GPU count:', torch.cuda.device_count())" -``` - -Expected output: -``` -CUDA available: True -GPU count: 1 -``` - -### Troubleshooting - -**Problem: Can't connect via SSH** -- Check pod is running (not stopped) -- Verify port number in SSH command -- Try web terminal in RunPod dashboard - -**Problem: GPU not detected** -- Run `nvidia-smi` -- Check RunPod selected correct GPU type -- Restart pod if needed - -**Problem: Network volume not mounted** -- Check RunPod dashboard → Volume tab -- Verify volume is attached to pod -- Try: `df -h` to see mounts - -### Next Steps - -Once SSH access works and GPU is verified: -✅ Proceed to **Day 3-4: Network Configuration (Tailscale VPN)** - -### Save Important Info - -Create a file to track your setup: - -```bash -# On GPU server -cat > /workspace/SERVER_INFO.md << 'EOF' -# GPU Server Information - -## Connection -- SSH: ssh root@abc123def456.runpod.io -p 12345 -- Pod ID: abc123def456 -- Region: [YOUR_REGION] - -## Hardware -- GPU: RTX 4090 24GB -- CPU: [Check with: lscpu] -- RAM: [Check with: free -h] -- Storage: 500GB network volume at /workspace - -## Costs -- GPU: $0.50/hour -- Storage: $50/month -- Total: ~$410/month (24/7) - -## Deployed: [DATE] -EOF -``` - ---- - -## Checkpoint ✓ - -Before moving to Day 3, verify: -- [ ] RunPod account created and billing added -- [ ] RTX 4090 pod deployed successfully -- [ ] 500GB network volume attached -- [ ] SSH access working -- [ ] `nvidia-smi` shows GPU -- [ ] `torch.cuda.is_available()` 
returns True -- [ ] Timezone set to Europe/Berlin -- [ ] Essential tools installed - -**Ready for Tailscale setup? Let's go!** diff --git a/ai/TAILSCALE_SETUP.md b/ai/TAILSCALE_SETUP.md deleted file mode 100644 index 9950469..0000000 --- a/ai/TAILSCALE_SETUP.md +++ /dev/null @@ -1,417 +0,0 @@ -# Tailscale VPN Setup - Better Alternative to WireGuard - -## Why Tailscale? - -RunPod doesn't support UDP ports, which blocks WireGuard. Tailscale solves this by: -- ✅ Works over HTTPS (TCP) - no UDP needed -- ✅ Zero configuration - automatic setup -- ✅ Free for personal use -- ✅ Built on WireGuard (same security) -- ✅ Automatic NAT traversal -- ✅ Peer-to-peer when possible (low latency) - ---- - -## Step 1: Create Tailscale Account - -1. Go to: https://tailscale.com/ -2. Click **"Get Started"** -3. Sign up with **GitHub** or **Google** (easiest) -4. You'll be redirected to the Tailscale admin console - -**No credit card required!** Free tier is perfect for our use case. - ---- - -## Step 2: Install Tailscale on VPS - -**SSH into your VPS:** - -```bash -ssh root@vps -``` - -**Install Tailscale:** - -```bash -# Download and run install script -curl -fsSL https://tailscale.com/install.sh | sh - -# Start Tailscale -tailscale up - -# You'll see a URL like: -# https://login.tailscale.com/a/xxxxxxxxxx -``` - -**Authenticate:** -1. Copy the URL and open in browser -2. Click **"Connect"** to authorize the device -3. Name it: `pivoine-vps` - -**Check status:** -```bash -tailscale status -``` - -You should see your VPS listed with an IP like `100.x.x.x` - -**Save your VPS Tailscale IP:** -```bash -tailscale ip -4 -# Example output: 100.101.102.103 -``` - -**Write this down - you'll need it!** - ---- - -## Step 3: Install Tailscale on GPU Server - -**SSH into your RunPod GPU server:** - -```bash -ssh root@abc123def456-12345678.runpod.io -p 12345 -``` - -**Install Tailscale:** - -```bash -# Download and run install script -curl -fsSL https://tailscale.com/install.sh | sh - -# Start Tailscale -tailscale up --advertise-tags=tag:gpu - -# You'll see another URL -``` - -**Authenticate:** -1. Copy the URL and open in browser -2. Click **"Connect"** -3. Name it: `gpu-runpod` - -**Check status:** -```bash -tailscale status -``` - -You should now see BOTH devices: -- `pivoine-vps` - 100.x.x.x -- `gpu-runpod` - 100.x.x.x - -**Save your GPU server Tailscale IP:** -```bash -tailscale ip -4 -# Example output: 100.104.105.106 -``` - ---- - -## Step 4: Test Connectivity - -**From VPS, ping GPU server:** - -```bash -# SSH into VPS -ssh root@vps - -# Ping GPU server (use its Tailscale IP) -ping 100.104.105.106 -c 4 -``` - -Expected output: -``` -PING 100.104.105.106 (100.104.105.106) 56(84) bytes of data. -64 bytes from 100.104.105.106: icmp_seq=1 ttl=64 time=15.3 ms -64 bytes from 100.104.105.106: icmp_seq=2 ttl=64 time=14.8 ms -... -``` - -**From GPU server, ping VPS:** - -```bash -# SSH into GPU server -ssh root@abc123def456-12345678.runpod.io -p 12345 - -# Ping VPS (use its Tailscale IP) -ping 100.101.102.103 -c 4 -``` - -**Both should work!** ✅ - ---- - -## Step 5: Update Configuration Files - -Now update the IP addresses in your configs to use Tailscale IPs. 
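
The subsections below show the manual edits. If you prefer to script the substitution, here is a minimal sketch, assuming the stack lives in `/workspace/gpu-stack` on the GPU server, Tailscale is already up on both machines, and you can SSH to the VPS (the `root@vps` alias is an assumption):

```bash
# Run on the GPU server
GPU_IP=$(tailscale ip -4)                  # this machine's Tailscale IP
VPS_IP=$(ssh root@vps tailscale ip -4)     # VPS Tailscale IP (SSH alias is an assumption)

cd /workspace/gpu-stack
cp .env .env.backup                        # keep a backup before editing

sed -i "s/^VPS_IP=.*/VPS_IP=${VPS_IP}/"   .env
sed -i "s/^GPU_IP=.*/GPU_IP=${GPU_IP}/"   .env
sed -i "s/^DB_HOST=.*/DB_HOST=${VPS_IP}/" .env

grep -E '^(VPS_IP|GPU_IP|DB_HOST)=' .env   # verify the result
```

Either way, double-check the values afterwards; the manual steps below cover the same changes plus the LiteLLM config on the VPS.
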
- -### On GPU Server (.env file) - -**Edit your .env file:** - -```bash -# On GPU server -cd /workspace/gpu-stack - -nano .env -``` - -**Update these lines:** -```bash -# VPN Network (use your actual Tailscale IPs) -VPS_IP=100.101.102.103 # Your VPS Tailscale IP -GPU_IP=100.104.105.106 # Your GPU Tailscale IP - -# PostgreSQL (on VPS) -DB_HOST=100.101.102.103 # Your VPS Tailscale IP -DB_PORT=5432 -``` - -Save and exit (Ctrl+X, Y, Enter) - -### On VPS (LiteLLM config) - -**Edit your LiteLLM config:** - -```bash -# On VPS -ssh root@vps -cd ~/Projects/docker-compose/ai - -nano litellm-config-gpu.yaml -``` - -**Update the GPU server IP:** - -```yaml -# Find this section and update IP: - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://100.104.105.106:8000/v1 # Use GPU Tailscale IP - api_key: dummy -``` - -Save and exit. - ---- - -## Step 6: Verify PostgreSQL Access - -**From GPU server, test database connection:** - -```bash -# Install PostgreSQL client -apt install -y postgresql-client - -# Test connection (use your VPS Tailscale IP) -psql -h 100.101.102.103 -U valknar -d openwebui -c "SELECT 1;" -``` - -**If this fails, allow Tailscale network on VPS PostgreSQL:** - -```bash -# On VPS -ssh root@vps - -# Check if postgres allows Tailscale network -docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100 - -# If not present, add it: -docker exec -it core_postgres bash - -# Inside container: -echo "host all all 100.0.0.0/8 scram-sha-256" >> /var/lib/postgresql/data/pg_hba.conf - -# Restart postgres -exit -docker restart core_postgres -``` - -Try connecting again - should work now! - ---- - -## Tailscale Management - -### View Connected Devices - -**Web dashboard:** -https://login.tailscale.com/admin/machines - -You'll see all your devices with their Tailscale IPs. - -**Command line:** -```bash -tailscale status -``` - -### Disconnect/Reconnect - -```bash -# Stop Tailscale -tailscale down - -# Start Tailscale -tailscale up -``` - -### Remove Device - -From web dashboard: -1. Click on device -2. Click "..." menu -3. Select "Disable" or "Delete" - ---- - -## Advantages Over WireGuard - -✅ **Works anywhere** - No UDP ports needed -✅ **Auto-reconnect** - Survives network changes -✅ **Multiple devices** - Easy to add laptop, phone, etc. 
-✅ **NAT traversal** - Direct peer-to-peer when possible -✅ **Access Control** - Manage from web dashboard -✅ **Monitoring** - See connection status in real-time - ---- - -## Security Notes - -🔒 **Tailscale is secure:** -- End-to-end encrypted (WireGuard) -- Zero-trust architecture -- No Tailscale servers can see your traffic -- Only authenticated devices can connect - -🔒 **Access control:** -- Only devices you authorize can join -- Revoke access anytime from dashboard -- Set ACLs for fine-grained control - ---- - -## Network Reference (Updated) - -**Old (WireGuard):** -- VPS: `10.8.0.1` -- GPU: `10.8.0.2` - -**New (Tailscale):** -- VPS: `100.101.102.103` (example - use your actual IP) -- GPU: `100.104.105.106` (example - use your actual IP) - -**All services now accessible via Tailscale:** - -**From VPS to GPU:** -- vLLM: `http://100.104.105.106:8000` -- ComfyUI: `http://100.104.105.106:8188` -- JupyterLab: `http://100.104.105.106:8888` -- Netdata: `http://100.104.105.106:19999` - -**From GPU to VPS:** -- PostgreSQL: `100.101.102.103:5432` -- Redis: `100.101.102.103:6379` -- LiteLLM: `http://100.101.102.103:4000` - ---- - -## Troubleshooting - -### Can't ping between devices - -**Check Tailscale status:** -```bash -tailscale status -``` - -Both devices should show "active" or "online". - -**Check connectivity:** -```bash -tailscale ping 100.104.105.106 -``` - -**Restart Tailscale:** -```bash -tailscale down && tailscale up -``` - -### PostgreSQL connection refused - -**Check if postgres is listening on all interfaces:** -```bash -# On VPS -docker exec core_postgres cat /var/lib/postgresql/data/postgresql.conf | grep listen_addresses -``` - -Should show: `listen_addresses = '*'` - -**Check pg_hba.conf allows Tailscale network:** -```bash -docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100 -``` - -Should have line: -``` -host all all 100.0.0.0/8 scram-sha-256 -``` - -### Device not showing in network - -**Re-authenticate:** -```bash -tailscale logout -tailscale up -# Click the new URL to re-authenticate -``` - ---- - -## Verification Checklist - -Before proceeding: -- [ ] Tailscale account created -- [ ] Tailscale installed on VPS -- [ ] Tailscale installed on GPU server -- [ ] Both devices visible in `tailscale status` -- [ ] VPS can ping GPU server (via Tailscale IP) -- [ ] GPU server can ping VPS (via Tailscale IP) -- [ ] PostgreSQL accessible from GPU server -- [ ] .env file updated with Tailscale IPs -- [ ] LiteLLM config updated with GPU Tailscale IP - ---- - -## Next Steps - -✅ **Network configured!** Proceed to Docker & GPU setup: - -```bash -cat /home/valknar/Projects/docker-compose/ai/DOCKER_GPU_SETUP.md -``` - -**Your Tailscale IPs (save these!):** -- VPS: `__________________` (from `tailscale ip -4` on VPS) -- GPU: `__________________` (from `tailscale ip -4` on GPU server) - ---- - -## Bonus: Add Your Local Machine - -Want to access GPU server from your laptop? - -```bash -# On your local machine -curl -fsSL https://tailscale.com/install.sh | sh -tailscale up - -# Now you can SSH directly via Tailscale: -ssh root@100.104.105.106 - -# Or access ComfyUI in browser: -# http://100.104.105.106:8188 -``` - -No more port forwarding needed! 
🎉 diff --git a/ai/WIREGUARD_SETUP.md b/ai/WIREGUARD_SETUP.md deleted file mode 100644 index 0f274fa..0000000 --- a/ai/WIREGUARD_SETUP.md +++ /dev/null @@ -1,393 +0,0 @@ -# WireGuard VPN Setup - Connecting GPU Server to VPS - -## Day 3-4: Network Configuration - -This guide connects your RunPod GPU server to your VPS via WireGuard VPN, enabling secure, low-latency communication. - -### Architecture - -``` -┌─────────────────────────────┐ ┌──────────────────────────────┐ -│ VPS (pivoine.art) │ │ GPU Server (RunPod) │ -│ 10.8.0.1 (WireGuard) │◄───────►│ 10.8.0.2 (WireGuard) │ -├─────────────────────────────┤ ├──────────────────────────────┤ -│ - LiteLLM Proxy │ │ - vLLM (10.8.0.2:8000) │ -│ - Open WebUI │ │ - ComfyUI (10.8.0.2:8188) │ -│ - PostgreSQL │ │ - Training │ -└─────────────────────────────┘ └──────────────────────────────┘ -``` - -### Prerequisites - -- ✅ VPS with root access -- ✅ GPU server with root access -- ✅ Both servers have public IPs - ---- - -## Method 1: Using Existing wg-easy (Recommended) - -You already have `wg-easy` running on your VPS. Let's use it! - -### Step 1: Access wg-easy Dashboard - -**On your local machine:** - -1. Open browser: https://vpn.pivoine.art (or whatever your wg-easy URL is) -2. Login with admin password - -**Don't have wg-easy set up? Skip to Method 2.** - -### Step 2: Create GPU Server Client - -1. In wg-easy dashboard, click **"+ New Client"** -2. **Name**: `gpu-server-runpod` -3. Click **"Create"** -4. **Download** configuration file (or copy QR code data) - -You'll get a file like: `gpu-server-runpod.conf` - -### Step 3: Install WireGuard on GPU Server - -**SSH into GPU server:** - -```bash -ssh gpu-pivoine # or your SSH command - -# Install WireGuard -apt update -apt install -y wireguard wireguard-tools -``` - -### Step 4: Configure WireGuard on GPU Server - -**Upload the config file:** - -```bash -# On your local machine, copy the config to GPU server -scp gpu-server-runpod.conf gpu-pivoine:/etc/wireguard/wg0.conf - -# Or manually create it on GPU server: -nano /etc/wireguard/wg0.conf -# Paste the configuration from wg-easy -``` - -**Example config (yours will be different):** -```ini -[Interface] -PrivateKey = -Address = 10.8.0.2/24 -DNS = 10.8.0.1 - -[Peer] -PublicKey = -PresharedKey = -AllowedIPs = 10.8.0.0/24 -Endpoint = :51820 -PersistentKeepalive = 25 -``` - -### Step 5: Start WireGuard - -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Set permissions -chmod 600 /etc/wireguard/wg0.conf - -# Start WireGuard -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 - -# Check status -systemctl status wg-quick@wg0 -wg show -``` - -Expected output: -``` -interface: wg0 - public key: - private key: (hidden) - listening port: 51820 - -peer: - endpoint: :51820 - allowed ips: 10.8.0.0/24 - latest handshake: 1 second ago - transfer: 1.2 KiB received, 892 B sent - persistent keepalive: every 25 seconds -``` - -### Step 6: Test Connectivity - -**From GPU server, ping VPS:** - -```bash -ping 10.8.0.1 -c 4 -``` - -Expected output: -``` -PING 10.8.0.1 (10.8.0.1) 56(84) bytes of data. -64 bytes from 10.8.0.1: icmp_seq=1 ttl=64 time=25.3 ms -64 bytes from 10.8.0.1: icmp_seq=2 ttl=64 time=24.8 ms -... 
-``` - -**From VPS, ping GPU server:** - -```bash -ssh root@vps -ping 10.8.0.2 -c 4 -``` - -**Test PostgreSQL access from GPU server:** - -```bash -# On GPU server -apt install -y postgresql-client - -# Try connecting to VPS postgres -psql -h 10.8.0.1 -U valknar -d openwebui -c "SELECT 1;" -# Should work if postgres allows 10.8.0.0/24 -``` - ---- - -## Method 2: Manual WireGuard Setup (If no wg-easy) - -### Step 1: Install WireGuard on Both Servers - -**On VPS:** -```bash -ssh root@vps -apt update -apt install -y wireguard wireguard-tools -``` - -**On GPU Server:** -```bash -ssh gpu-pivoine -apt update -apt install -y wireguard wireguard-tools -``` - -### Step 2: Generate Keys - -**On VPS:** -```bash -cd /etc/wireguard -umask 077 -wg genkey | tee vps-private.key | wg pubkey > vps-public.key -``` - -**On GPU Server:** -```bash -cd /etc/wireguard -umask 077 -wg genkey | tee gpu-private.key | wg pubkey > gpu-public.key -``` - -### Step 3: Create Config on VPS - -**On VPS (`/etc/wireguard/wg0.conf`):** - -```bash -cat > /etc/wireguard/wg0.conf << 'EOF' -[Interface] -PrivateKey = -Address = 10.8.0.1/24 -ListenPort = 51820 -SaveConfig = false - -# GPU Server Peer -[Peer] -PublicKey = -AllowedIPs = 10.8.0.2/32 -PersistentKeepalive = 25 -EOF -``` - -Replace `` with contents of `vps-private.key` -Replace `` with contents from GPU server's `gpu-public.key` - -### Step 4: Create Config on GPU Server - -**On GPU Server (`/etc/wireguard/wg0.conf`):** - -```bash -cat > /etc/wireguard/wg0.conf << 'EOF' -[Interface] -PrivateKey = -Address = 10.8.0.2/24 - -[Peer] -PublicKey = -AllowedIPs = 10.8.0.0/24 -Endpoint = :51820 -PersistentKeepalive = 25 -EOF -``` - -Replace: -- `` with contents of `gpu-private.key` -- `` with contents from VPS's `vps-public.key` -- `` with your VPS's public IP address - -### Step 5: Start WireGuard on Both - -**On VPS:** -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Start WireGuard -chmod 600 /etc/wireguard/wg0.conf -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 -``` - -**On GPU Server:** -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Start WireGuard -chmod 600 /etc/wireguard/wg0.conf -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 -``` - -### Step 6: Configure Firewall - -**On VPS:** -```bash -# Allow WireGuard port -ufw allow 51820/udp -ufw reload - -# Or with iptables -iptables -A INPUT -p udp --dport 51820 -j ACCEPT -iptables-save > /etc/iptables/rules.v4 -``` - -**On GPU Server (RunPod):** -```bash -# Allow WireGuard -ufw allow 51820/udp -ufw reload -``` - -### Step 7: Test Connection - -Same as Method 1 Step 6. - ---- - -## Troubleshooting - -### No handshake - -**Check:** -```bash -wg show -``` - -If "latest handshake" shows "never": -1. Verify public keys are correct (easy to swap them!) -2. Check firewall allows UDP 51820 -3. Verify endpoint IP is correct -4. 
Check `systemctl status wg-quick@wg0` for errors - -### Can ping but can't access services - -**On VPS, check PostgreSQL allows 10.8.0.0/24:** - -```bash -# Edit postgresql.conf -nano /var/lib/postgresql/data/postgresql.conf -# Add or modify: -listen_addresses = '*' - -# Edit pg_hba.conf -nano /var/lib/postgresql/data/pg_hba.conf -# Add: -host all all 10.8.0.0/24 scram-sha-256 - -# Restart -docker restart core_postgres -``` - -### WireGuard won't start - -```bash -# Check logs -journalctl -u wg-quick@wg0 -n 50 - -# Common issues: -# - Wrong permissions: chmod 600 /etc/wireguard/wg0.conf -# - Invalid keys: regenerate with wg genkey -# - Port already in use: lsof -i :51820 -``` - ---- - -## Verification Checklist - -Before proceeding to Day 5: - -- [ ] WireGuard installed on both VPS and GPU server -- [ ] VPN tunnel established (wg show shows handshake) -- [ ] GPU server can ping VPS (10.8.0.1) -- [ ] VPS can ping GPU server (10.8.0.2) -- [ ] Firewall allows WireGuard (UDP 51820) -- [ ] PostgreSQL accessible from GPU server -- [ ] WireGuard starts on boot (systemctl enable) - ---- - -## Network Reference - -**VPN IPs:** -- VPS: `10.8.0.1` -- GPU Server: `10.8.0.2` - -**Service Access from GPU Server:** -- PostgreSQL: `postgresql://valknar:password@10.8.0.1:5432/dbname` -- Redis: `10.8.0.1:6379` -- LiteLLM: `http://10.8.0.1:4000` -- Mailpit: `10.8.0.1:1025` - -**Service Access from VPS:** -- vLLM: `http://10.8.0.2:8000` -- ComfyUI: `http://10.8.0.2:8188` -- JupyterLab: `http://10.8.0.2:8888` - ---- - -## Next: Docker & GPU Setup - -Once VPN is working, proceed to **Day 5: Docker & NVIDIA Container Toolkit Setup**. - -**Save connection info:** - -```bash -# On GPU server -cat >> /workspace/SERVER_INFO.md << 'EOF' - -## VPN Configuration -- VPN IP: 10.8.0.2 -- VPS VPN IP: 10.8.0.1 -- WireGuard Status: Active -- Latest Handshake: [Check with: wg show] - -## Network Access -- Can reach VPS services: ✓ -- VPS can reach GPU services: ✓ -EOF -``` diff --git a/ai/deploy-gpu-stack.sh b/ai/deploy-gpu-stack.sh deleted file mode 100755 index f770946..0000000 --- a/ai/deploy-gpu-stack.sh +++ /dev/null @@ -1,229 +0,0 @@ -#!/bin/bash -# GPU Stack Deployment Script -# Run this on the GPU server after SSH access is established - -set -e # Exit on error - -echo "==================================" -echo "GPU Stack Deployment Script" -echo "==================================" -echo "" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# Functions -print_success() { - echo -e "${GREEN}✓ $1${NC}" -} - -print_error() { - echo -e "${RED}✗ $1${NC}" -} - -print_info() { - echo -e "${YELLOW}→ $1${NC}" -} - -# Check if running as root -if [[ $EUID -ne 0 ]]; then - print_error "This script must be run as root (use sudo)" - exit 1 -fi - -# Step 1: Check prerequisites -print_info "Checking prerequisites..." - -if ! command -v docker &> /dev/null; then - print_error "Docker is not installed. Please run DOCKER_GPU_SETUP.md first." - exit 1 -fi -print_success "Docker installed" - -if ! command -v nvidia-smi &> /dev/null; then - print_error "nvidia-smi not found. Is this a GPU server?" - exit 1 -fi -print_success "NVIDIA GPU detected" - -if ! docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then - print_error "Docker cannot access GPU. Please configure NVIDIA Container Toolkit." 
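    # Hint (assumption): if this check fails, `nvidia-ctk runtime configure --runtime=docker`
    # followed by `systemctl restart docker` usually restores GPU access; then re-run this script.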
- exit 1 -fi -print_success "Docker GPU access working" - -# Step 2: Create directory structure -print_info "Creating directory structure..." - -mkdir -p /workspace/gpu-stack/{vllm,comfyui,training/{configs,data,output},notebooks,monitoring} -cd /workspace/gpu-stack - -print_success "Directory structure created" - -# Step 3: Create .env file -if [ ! -f .env ]; then - print_info "Creating .env file..." - - cat > .env << 'EOF' -# GPU Stack Environment Variables - -# Timezone -TIMEZONE=Europe/Berlin - -# VPN Network -VPS_IP=10.8.0.1 -GPU_IP=10.8.0.2 - -# Model Storage (network volume) -MODELS_PATH=/workspace/models - -# Hugging Face Token (optional, for gated models like Llama) -# Get from: https://huggingface.co/settings/tokens -HF_TOKEN= - -# Weights & Biases (optional, for training logging) -# Get from: https://wandb.ai/authorize -WANDB_API_KEY= - -# JupyterLab Access Token -JUPYTER_TOKEN=pivoine-ai-2025 - -# PostgreSQL (on VPS) -DB_HOST=10.8.0.1 -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui -EOF - - chmod 600 .env - print_success ".env file created (please edit with your tokens)" -else - print_success ".env file already exists" -fi - -# Step 4: Download docker-compose.yaml -print_info "Downloading docker-compose.yaml..." - -# In production, this would be copied from the repo -# For now, assume it's already in the current directory -if [ ! -f docker-compose.yaml ]; then - print_error "docker-compose.yaml not found. Please copy gpu-server-compose.yaml to docker-compose.yaml" - exit 1 -fi - -print_success "docker-compose.yaml found" - -# Step 5: Pre-download models (optional but recommended) -print_info "Do you want to pre-download models? (y/n)" -read -r response - -if [[ "$response" =~ ^[Yy]$ ]]; then - print_info "Downloading Llama 3.1 8B Instruct (this will take a while)..." - - mkdir -p /workspace/models - - # Use huggingface-cli to download - pip install -q huggingface-hub - - huggingface-cli download \ - meta-llama/Meta-Llama-3.1-8B-Instruct \ - --local-dir /workspace/models/Meta-Llama-3.1-8B-Instruct \ - --local-dir-use-symlinks False || print_error "Model download failed (may need HF_TOKEN)" - - print_success "Model downloaded to /workspace/models" -fi - -# Step 6: Start services -print_info "Starting GPU stack services..." - -docker compose up -d vllm comfyui jupyter netdata - -print_success "Services starting (this may take a few minutes)..." - -# Step 7: Wait for services -print_info "Waiting for services to be ready..." - -sleep 10 - -# Check service health -print_info "Checking service status..." - -if docker ps | grep -q gpu_vllm; then - print_success "vLLM container running" -else - print_error "vLLM container not running" -fi - -if docker ps | grep -q gpu_comfyui; then - print_success "ComfyUI container running" -else - print_error "ComfyUI container not running" -fi - -if docker ps | grep -q gpu_jupyter; then - print_success "JupyterLab container running" -else - print_error "JupyterLab container not running" -fi - -if docker ps | grep -q gpu_netdata; then - print_success "Netdata container running" -else - print_error "Netdata container not running" -fi - -# Step 8: Display access information -echo "" -echo "==================================" -echo "Deployment Complete!" 
-echo "==================================" -echo "" -echo "Services accessible via VPN (from VPS):" -echo " - vLLM API: http://10.8.0.2:8000" -echo " - ComfyUI: http://10.8.0.2:8188" -echo " - JupyterLab: http://10.8.0.2:8888 (token: pivoine-ai-2025)" -echo " - Netdata: http://10.8.0.2:19999" -echo "" -echo "Local access (from GPU server):" -echo " - vLLM API: http://localhost:8000" -echo " - ComfyUI: http://localhost:8188" -echo " - JupyterLab: http://localhost:8888" -echo " - Netdata: http://localhost:19999" -echo "" -echo "Useful commands:" -echo " - View logs: docker compose logs -f" -echo " - Check status: docker compose ps" -echo " - Stop all: docker compose down" -echo " - Restart service: docker compose restart vllm" -echo " - Start training: docker compose --profile training up -d axolotl" -echo "" -echo "Next steps:" -echo " 1. Wait for vLLM to load model (check logs: docker compose logs -f vllm)" -echo " 2. Test vLLM: curl http://localhost:8000/v1/models" -echo " 3. Configure LiteLLM on VPS to use http://10.8.0.2:8000" -echo " 4. Download ComfyUI models via web interface" -echo "" - -# Step 9: Create helpful aliases -print_info "Creating helpful aliases..." - -cat >> ~/.bashrc << 'EOF' - -# GPU Stack Aliases -alias gpu-logs='cd /workspace/gpu-stack && docker compose logs -f' -alias gpu-ps='cd /workspace/gpu-stack && docker compose ps' -alias gpu-restart='cd /workspace/gpu-stack && docker compose restart' -alias gpu-down='cd /workspace/gpu-stack && docker compose down' -alias gpu-up='cd /workspace/gpu-stack && docker compose up -d' -alias gpu-stats='watch -n 1 nvidia-smi' -alias gpu-top='nvtop' -EOF - -print_success "Aliases added to ~/.bashrc (reload with: source ~/.bashrc)" - -echo "" -print_success "All done! 🚀" diff --git a/ai/docker-compose.gpu.yaml b/ai/docker-compose.gpu.yaml deleted file mode 100644 index 9ddfe84..0000000 --- a/ai/docker-compose.gpu.yaml +++ /dev/null @@ -1,104 +0,0 @@ -version: '3.8' - -# Multi-Modal AI Orchestration for RunPod RTX 4090 -# Manages text, image, and music generation with sequential model loading - -services: - # ============================================================================ - # ORCHESTRATOR (Always Running) - # ============================================================================ - orchestrator: - build: ./model-orchestrator - container_name: ai_orchestrator - ports: - - "9000:9000" - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./model-orchestrator/models.yaml:/app/models.yaml:ro - environment: - - MODELS_CONFIG=/app/models.yaml - - COMPOSE_PROJECT_NAME=ai - - GPU_MEMORY_GB=24 - restart: unless-stopped - network_mode: host - - # ============================================================================ - # TEXT GENERATION (vLLM + Qwen 2.5 7B) - # ============================================================================ - vllm-qwen: - build: ./vllm - container_name: ai_vllm-qwen_1 - ports: - - "8001:8000" - volumes: - - /workspace/huggingface_cache:/workspace/huggingface_cache - environment: - - HF_TOKEN=${HF_TOKEN} - - VLLM_HOST=0.0.0.0 - - VLLM_PORT=8000 - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["text"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - - # ============================================================================ - # IMAGE GENERATION (Flux.1 Schnell) - # ============================================================================ - flux: - image: 
ghcr.io/matatonic/openedai-images-flux:latest - container_name: ai_flux_1 - ports: - - "8002:5005" - volumes: - - /workspace/flux/models:/app/models - - ./flux/config:/app/config:ro - environment: - - HF_TOKEN=${HF_TOKEN} - - CONFIG_PATH=/app/config/config.json - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["image"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - - # ============================================================================ - # MUSIC GENERATION (MusicGen Medium) - # ============================================================================ - musicgen: - build: ./musicgen - container_name: ai_musicgen_1 - ports: - - "8003:8000" - volumes: - - /workspace/musicgen/models:/app/models - environment: - - HF_TOKEN=${HF_TOKEN} - - MODEL_NAME=facebook/musicgen-medium - - HOST=0.0.0.0 - - PORT=8000 - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["audio"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - -# ============================================================================ -# VOLUMES -# ============================================================================ -# Model caches are stored on RunPod's /workspace directory (922TB network volume) -# This persists across pod restarts and reduces model download times - -# No named volumes - using host paths on RunPod /workspace diff --git a/ai/flux/config/config.json b/ai/flux/config/config.json deleted file mode 100644 index 50d9669..0000000 --- a/ai/flux/config/config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "model": "flux-schnell", - "offload": true, - "sequential_cpu_offload": false, - "vae_tiling": true, - "enable_model_cpu_offload": true, - "low_vram_mode": false, - "torch_compile": false, - "safety_checker": false, - "watermark": false, - "flux_device": "cuda", - "compile": false -} diff --git a/ai/gpu-server-compose.yaml b/ai/gpu-server-compose.yaml deleted file mode 100644 index 9cb2f70..0000000 --- a/ai/gpu-server-compose.yaml +++ /dev/null @@ -1,237 +0,0 @@ -# GPU Server Docker Compose Configuration -# Deploy on RunPod GPU server (10.8.0.2) -# Services accessible from VPS (10.8.0.1) via WireGuard VPN - -version: '3.8' - -services: - # ============================================================================= - # vLLM - High-performance LLM Inference Server - # ============================================================================= - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: "0" - HF_TOKEN: ${HF_TOKEN:-} - volumes: - - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct # Change model here - - --host - - 0.0.0.0 - - --port - - 8000 - - --tensor-parallel-size - - "1" - - --gpu-memory-utilization - - "0.85" # Leave 15% for other tasks - - --max-model-len - - "8192" - - --dtype - - auto - - --trust-remote-code - ports: - - "8000:8000" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 120s # Model loading takes time - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=vllm" - - "stack=gpu-ai" - - # 
============================================================================= - # ComfyUI - Advanced Stable Diffusion Interface - # ============================================================================= - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - TZ: ${TIMEZONE:-Europe/Berlin} - # ComfyUI auto-installs custom nodes on first run - COMFYUI_FLAGS: "--listen 0.0.0.0 --port 8188" - volumes: - - comfyui_data:/data - - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - - comfyui_input:/opt/ComfyUI/input - ports: - - "8188:8188" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8188/"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 60s - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=comfyui" - - "stack=gpu-ai" - - # ============================================================================= - # Axolotl - LLM Fine-tuning Framework - # ============================================================================= - # Note: This service uses "profiles" - only starts when explicitly requested - # Start with: docker compose --profile training up -d axolotl - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - ${MODELS_PATH:-/workspace/models}:/workspace/models - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - WANDB_API_KEY: ${WANDB_API_KEY:-} - HF_TOKEN: ${HF_TOKEN:-} - working_dir: /workspace - # Default command - override when running specific training - command: sleep infinity - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: - - training - labels: - - "service=axolotl" - - "stack=gpu-ai" - - # ============================================================================= - # JupyterLab - Interactive Development Environment - # ============================================================================= - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - restart: unless-stopped - runtime: nvidia - volumes: - - ./notebooks:/workspace/notebooks - - ${MODELS_PATH:-/workspace/models}:/workspace/models - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025} - HF_TOKEN: ${HF_TOKEN:-} - command: | - bash -c " - pip install --quiet jupyterlab transformers datasets accelerate bitsandbytes peft trl sentencepiece protobuf && - jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser --NotebookApp.token='${JUPYTER_TOKEN:-pivoine-ai-2025}' - " - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8888/"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 60s - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=jupyter" - - "stack=gpu-ai" - - # ============================================================================= - # Netdata - System & GPU Monitoring - # ============================================================================= - netdata: - image: netdata/netdata:latest - 
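    # Note (assumption): runtime: nvidia plus NVIDIA_VISIBLE_DEVICES below is what lets Netdata's
    # nvidia_smi collector read GPU utilization and memory for the dashboard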
container_name: gpu_netdata - restart: unless-stopped - runtime: nvidia - hostname: gpu-runpod - cap_add: - - SYS_PTRACE - - SYS_ADMIN - security_opt: - - apparmor:unconfined - environment: - NVIDIA_VISIBLE_DEVICES: all - TZ: ${TIMEZONE:-Europe/Berlin} - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - - /etc/os-release:/host/etc/os-release:ro - - netdata_config:/etc/netdata - - netdata_cache:/var/cache/netdata - - netdata_lib:/var/lib/netdata - ports: - - "19999:19999" - labels: - - "service=netdata" - - "stack=gpu-ai" - -# ============================================================================= -# Volumes -# ============================================================================= -volumes: - # ComfyUI data - comfyui_data: - driver: local - comfyui_output: - driver: local - comfyui_input: - driver: local - - # Training data - training_cache: - driver: local - - # Jupyter data - jupyter_cache: - driver: local - - # Netdata data - netdata_config: - driver: local - netdata_cache: - driver: local - netdata_lib: - driver: local - -# ============================================================================= -# Networks -# ============================================================================= -networks: - default: - driver: bridge - ipam: - config: - - subnet: 172.25.0.0/24 diff --git a/ai/litellm-config-gpu.yaml b/ai/litellm-config-gpu.yaml deleted file mode 100644 index 5313d64..0000000 --- a/ai/litellm-config-gpu.yaml +++ /dev/null @@ -1,199 +0,0 @@ -# LiteLLM Configuration with GPU Server Integration -# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server) - -model_list: - # ============================================================================= - # Anthropic Claude Models (API-based, for complex reasoning) - # ============================================================================= - - - model_name: claude-sonnet-4 - litellm_params: - model: anthropic/claude-sonnet-4-20250514 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-sonnet-4.5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-5-sonnet - litellm_params: - model: anthropic/claude-3-5-sonnet-20241022 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-opus - litellm_params: - model: anthropic/claude-3-opus-20240229 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-haiku - litellm_params: - model: anthropic/claude-3-haiku-20240307 - api_key: os.environ/ANTHROPIC_API_KEY - - # ============================================================================= - # Self-Hosted Models (vLLM on GPU server via WireGuard VPN) - # ============================================================================= - - # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://10.8.0.2:8000/v1 - api_key: dummy # vLLM doesn't require auth - rpm: 1000 # Rate limit: requests per minute - tpm: 100000 # Rate limit: tokens per minute - - # Alternative models (uncomment and configure on GPU server as needed) - - # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning - # - model_name: qwen-2.5-14b - # litellm_params: - # model: openai/Qwen/Qwen2.5-14B-Instruct - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 800 - # tpm: 80000 - - # Mistral 7B Instruct - 
Very fast, lightweight - # - model_name: mistral-7b - # litellm_params: - # model: openai/mistralai/Mistral-7B-Instruct-v0.3 - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 1200 - # tpm: 120000 - - # DeepSeek Coder 6.7B - Code generation specialist - # - model_name: deepseek-coder-6.7b - # litellm_params: - # model: openai/deepseek-ai/deepseek-coder-6.7b-instruct - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 1000 - # tpm: 100000 - -# ============================================================================= -# Router Settings - Intelligent Model Selection -# ============================================================================= - -# Model aliases for easy switching in Open WebUI -model_name_map: - # Default model (self-hosted, fast) - gpt-3.5-turbo: llama-3.1-8b - - # Power users can use Claude for complex tasks - gpt-4: claude-sonnet-4.5 - gpt-4-turbo: claude-sonnet-4.5 - -# LiteLLM Settings -litellm_settings: - drop_params: true - set_verbose: false # Disable verbose logging for better performance - - # Enable caching with Redis for better performance - cache: true - cache_params: - type: redis - host: redis - port: 6379 - ttl: 3600 # Cache for 1 hour - - # Force strip specific parameters globally - allowed_fails: 0 - - # Modify params before sending to provider - modify_params: true - - # Enable success and failure logging but minimize overhead - success_callback: [] # Disable all success callbacks to reduce DB writes - failure_callback: [] # Disable all failure callbacks - -# Router Settings -router_settings: - allowed_fails: 0 - - # Routing strategy: Try self-hosted first, fallback to Claude on failure - routing_strategy: simple-shuffle - - # Cooldown for failed models - cooldown_time: 30 # seconds - -# Drop unsupported parameters -default_litellm_params: - drop_params: true - -# General Settings -general_settings: - disable_responses_id_security: true - - # Disable spend tracking to reduce database overhead - disable_spend_logs: false # Keep enabled to track API vs GPU costs - - # Disable tag tracking - disable_tag_tracking: true - - # Disable daily spend updates - disable_daily_spend_logs: false # Keep enabled for cost analysis - - # Master key for authentication (set via env var) - master_key: os.environ/LITELLM_MASTER_KEY - - # Database for logging (optional but recommended for cost tracking) - database_url: os.environ/DATABASE_URL - - # Enable OpenAPI docs - docs_url: /docs - -# ============================================================================= -# Usage Guidelines (for Open WebUI users) -# ============================================================================= -# -# Model Selection Guide: -# -# Use llama-3.1-8b for: -# - General chat and Q&A -# - Simple code generation -# - Data extraction -# - Summarization -# - Translation -# - Most routine tasks -# Cost: ~$0/month (self-hosted) -# Speed: ~50-80 tokens/second -# -# Use qwen-2.5-14b for: -# - Complex reasoning -# - Multi-step problems -# - Advanced code generation -# - Multilingual tasks -# Cost: ~$0/month (self-hosted) -# Speed: ~30-50 tokens/second -# -# Use claude-sonnet-4.5 for: -# - Very complex reasoning -# - Long documents (200K context) -# - Production-critical code -# - When quality matters most -# Cost: ~$3/million input tokens, ~$15/million output tokens -# Speed: ~30-40 tokens/second -# -# Use claude-3-haiku for: -# - API fallback (if self-hosted down) -# - Very fast responses needed -# Cost: ~$0.25/million input tokens, ~$1.25/million output tokens -# 
Speed: ~60-80 tokens/second -# -# ============================================================================= - -# Health Check Configuration -health_check: - # Check vLLM health endpoint - enabled: true - interval: 30 # seconds - timeout: 5 # seconds - -# Fallback Configuration -# If GPU server is down, automatically use Claude -fallback: - - ["llama-3.1-8b", "claude-3-haiku"] - - ["qwen-2.5-14b", "claude-sonnet-4.5"] diff --git a/ai/model-orchestrator/Dockerfile b/ai/model-orchestrator/Dockerfile deleted file mode 100644 index bcee1e9..0000000 --- a/ai/model-orchestrator/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements and install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY orchestrator.py . -COPY models.yaml . - -# Expose port -EXPOSE 9000 - -# Run the orchestrator -CMD ["python", "orchestrator.py"] diff --git a/ai/model-orchestrator/models.yaml b/ai/model-orchestrator/models.yaml deleted file mode 100644 index caf6a95..0000000 --- a/ai/model-orchestrator/models.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Model Registry for AI Orchestrator -# Add new models by appending to this file - -models: - # Text Generation Models - qwen-2.5-7b: - type: text - framework: vllm - docker_service: vllm-qwen - port: 8001 - vram_gb: 14 - startup_time_seconds: 120 - endpoint: /v1/chat/completions - description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required" - - # Image Generation Models - flux-schnell: - type: image - framework: openedai-images - docker_service: flux - port: 8002 - vram_gb: 14 - startup_time_seconds: 60 - endpoint: /v1/images/generations - description: "Flux.1 Schnell - Fast high-quality image generation (4-5 sec/image)" - - # Music Generation Models - musicgen-medium: - type: audio - framework: audiocraft - docker_service: musicgen - port: 8003 - vram_gb: 11 - startup_time_seconds: 45 - endpoint: /v1/audio/generations - description: "MusicGen Medium - Text-to-music generation (60-90 sec for 30s audio)" - -# Example: Add more models easily by uncommenting and customizing below - -# Future Text Models: -# llama-3.1-8b: -# type: text -# framework: vllm -# docker_service: vllm-llama -# port: 8004 -# vram_gb: 17 -# startup_time_seconds: 120 -# endpoint: /v1/chat/completions -# description: "Llama 3.1 8B Instruct - Meta's latest model" - -# Future Image Models: -# sdxl: -# type: image -# framework: openedai-images -# docker_service: sdxl -# port: 8005 -# vram_gb: 10 -# startup_time_seconds: 45 -# endpoint: /v1/images/generations -# description: "Stable Diffusion XL - High quality image generation" - -# Future Audio Models: -# whisper-large: -# type: audio -# framework: faster-whisper -# docker_service: whisper -# port: 8006 -# vram_gb: 3 -# startup_time_seconds: 30 -# endpoint: /v1/audio/transcriptions -# description: "Whisper Large v3 - Speech-to-text transcription" -# -# xtts-v2: -# type: audio -# framework: openedai-speech -# docker_service: tts -# port: 8007 -# vram_gb: 3 -# startup_time_seconds: 30 -# endpoint: /v1/audio/speech -# description: "XTTS v2 - High-quality text-to-speech with voice cloning" - -# Configuration -config: - gpu_memory_total_gb: 24 - allow_concurrent_loading: false # Sequential loading only - model_switch_timeout_seconds: 300 # 5 minutes max for model switching - 
health_check_interval_seconds: 10 - default_model: qwen-2.5-7b diff --git a/ai/model-orchestrator/orchestrator.py b/ai/model-orchestrator/orchestrator.py deleted file mode 100644 index 9091537..0000000 --- a/ai/model-orchestrator/orchestrator.py +++ /dev/null @@ -1,359 +0,0 @@ -#!/usr/bin/env python3 -""" -AI Model Orchestrator for RunPod RTX 4090 -Manages sequential loading of text, image, and music models on a single GPU - -Features: -- Automatic model switching based on request type -- OpenAI-compatible API endpoints -- Docker Compose service management -- GPU memory monitoring -- Simple YAML configuration for adding new models -""" - -import asyncio -import logging -import os -import time -from typing import Dict, Optional, Any - -import docker -import httpx -import yaml -from fastapi import FastAPI, Request, HTTPException -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="AI Model Orchestrator", version="1.0.0") - -# Docker client -docker_client = docker.from_env() - -# Global state -current_model: Optional[str] = None -model_registry: Dict[str, Dict[str, Any]] = {} -config: Dict[str, Any] = {} - - -def load_model_registry(): - """Load model registry from models.yaml""" - global model_registry, config - - config_path = os.getenv("MODELS_CONFIG", "/app/models.yaml") - logger.info(f"Loading model registry from {config_path}") - - with open(config_path, 'r') as f: - data = yaml.safe_load(f) - - model_registry = data.get('models', {}) - config = data.get('config', {}) - - logger.info(f"Loaded {len(model_registry)} models from registry") - for model_name, model_info in model_registry.items(): - logger.info(f" - {model_name}: {model_info['description']}") - - -def get_docker_service_name(service_name: str) -> str: - """Get full Docker service name with project prefix""" - project_name = os.getenv("COMPOSE_PROJECT_NAME", "ai") - return f"{project_name}_{service_name}_1" - - -async def stop_current_model(): - """Stop the currently running model service""" - global current_model - - if not current_model: - logger.info("No model currently running") - return - - model_info = model_registry.get(current_model) - if not model_info: - logger.warning(f"Model {current_model} not found in registry") - current_model = None - return - - service_name = get_docker_service_name(model_info['docker_service']) - logger.info(f"Stopping model: {current_model} (service: {service_name})") - - try: - container = docker_client.containers.get(service_name) - container.stop(timeout=30) - logger.info(f"Stopped {current_model}") - current_model = None - except docker.errors.NotFound: - logger.warning(f"Container {service_name} not found (already stopped?)") - current_model = None - except Exception as e: - logger.error(f"Error stopping {service_name}: {e}") - raise - - -async def start_model(model_name: str): - """Start a model service""" - global current_model - - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found in registry") - - model_info = model_registry[model_name] - service_name = get_docker_service_name(model_info['docker_service']) - - logger.info(f"Starting model: {model_name} (service: {service_name})") - logger.info(f" VRAM requirement: {model_info['vram_gb']} GB") - logger.info(f" Estimated 
startup time: {model_info['startup_time_seconds']}s") - - try: - # Start the container - container = docker_client.containers.get(service_name) - container.start() - - # Wait for service to be healthy - port = model_info['port'] - endpoint = model_info.get('endpoint', '/') - base_url = f"http://localhost:{port}" - - logger.info(f"Waiting for {model_name} to be ready at {base_url}...") - - max_wait = model_info['startup_time_seconds'] + 60 # Add buffer - start_time = time.time() - - async with httpx.AsyncClient() as client: - while time.time() - start_time < max_wait: - try: - # Try health check or root endpoint - health_url = f"{base_url}/health" - try: - response = await client.get(health_url, timeout=5.0) - if response.status_code == 200: - logger.info(f"{model_name} is ready!") - current_model = model_name - return - except: - # Try root endpoint if /health doesn't exist - response = await client.get(base_url, timeout=5.0) - if response.status_code == 200: - logger.info(f"{model_name} is ready!") - current_model = model_name - return - except Exception as e: - logger.debug(f"Waiting for {model_name}... ({e})") - - await asyncio.sleep(5) - - raise HTTPException( - status_code=503, - detail=f"Model {model_name} failed to start within {max_wait}s" - ) - - except docker.errors.NotFound: - raise HTTPException( - status_code=500, - detail=f"Docker service {service_name} not found. Is it defined in docker-compose?" - ) - except Exception as e: - logger.error(f"Error starting {model_name}: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -async def ensure_model_running(model_name: str): - """Ensure the specified model is running, switching if necessary""" - global current_model - - if current_model == model_name: - logger.info(f"Model {model_name} already running") - return - - logger.info(f"Switching model: {current_model} -> {model_name}") - - # Stop current model - await stop_current_model() - - # Start requested model - await start_model(model_name) - - logger.info(f"Model switch complete: {model_name} is now active") - - -async def proxy_request(model_name: str, request: Request): - """Proxy request to the active model service""" - model_info = model_registry[model_name] - port = model_info['port'] - - # Get request details - path = request.url.path - method = request.method - headers = dict(request.headers) - headers.pop('host', None) # Remove host header - - # Build target URL - target_url = f"http://localhost:{port}{path}" - - logger.info(f"Proxying {method} request to {target_url}") - - async with httpx.AsyncClient(timeout=300.0) as client: - # Handle different request types - if method == "GET": - response = await client.get(target_url, headers=headers) - elif method == "POST": - body = await request.body() - response = await client.post(target_url, content=body, headers=headers) - else: - raise HTTPException(status_code=405, detail=f"Method {method} not supported") - - # Return response - return JSONResponse( - content=response.json() if response.headers.get('content-type', '').startswith('application/json') else response.text, - status_code=response.status_code, - headers=dict(response.headers) - ) - - -@app.on_event("startup") -async def startup_event(): - """Load model registry on startup""" - load_model_registry() - logger.info("AI Model Orchestrator started successfully") - logger.info(f"GPU Memory: {config.get('gpu_memory_total_gb', 24)} GB") - logger.info(f"Default model: {config.get('default_model', 'qwen-2.5-7b')}") - - -@app.get("/") -async def root(): - 
"""Root endpoint""" - return { - "service": "AI Model Orchestrator", - "version": "1.0.0", - "current_model": current_model, - "available_models": list(model_registry.keys()) - } - - -@app.get("/health") -async def health(): - """Health check endpoint""" - return { - "status": "healthy", - "current_model": current_model, - "model_info": model_registry.get(current_model) if current_model else None, - "gpu_memory_total_gb": config.get('gpu_memory_total_gb', 24), - "models_available": len(model_registry) - } - - -@app.get("/models") -async def list_models(): - """List all available models""" - return { - "models": model_registry, - "current_model": current_model - } - - -@app.post("/v1/chat/completions") -async def chat_completions(request: Request): - """OpenAI-compatible chat completions endpoint (text models)""" - # Parse request to get model name - body = await request.json() - model_name = body.get('model', config.get('default_model', 'qwen-2.5-7b')) - - # Validate model type - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - if model_registry[model_name]['type'] != 'text': - raise HTTPException(status_code=400, detail=f"Model {model_name} is not a text model") - - # Ensure model is running - await ensure_model_running(model_name) - - # Proxy request to model - return await proxy_request(model_name, request) - - -@app.post("/v1/images/generations") -async def image_generations(request: Request): - """OpenAI-compatible image generation endpoint""" - # Parse request to get model name - body = await request.json() - model_name = body.get('model', 'flux-schnell') - - # Validate model type - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - if model_registry[model_name]['type'] != 'image': - raise HTTPException(status_code=400, detail=f"Model {model_name} is not an image model") - - # Ensure model is running - await ensure_model_running(model_name) - - # Proxy request to model - return await proxy_request(model_name, request) - - -@app.post("/v1/audio/generations") -async def audio_generations(request: Request): - """Custom audio generation endpoint (music/sound effects)""" - # Parse request to get model name - body = await request.json() - model_name = body.get('model', 'musicgen-medium') - - # Validate model type - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - if model_registry[model_name]['type'] != 'audio': - raise HTTPException(status_code=400, detail=f"Model {model_name} is not an audio model") - - # Ensure model is running - await ensure_model_running(model_name) - - # Proxy request to model - return await proxy_request(model_name, request) - - -@app.post("/switch") -async def switch_model(request: Request): - """Manually switch to a specific model""" - body = await request.json() - model_name = body.get('model') - - if not model_name: - raise HTTPException(status_code=400, detail="Model name required") - - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - await ensure_model_running(model_name) - - return { - "status": "success", - "model": model_name, - "message": f"Switched to {model_name}" - } - - -if __name__ == "__main__": - import uvicorn - - host = os.getenv("HOST", "0.0.0.0") - port = int(os.getenv("PORT", "9000")) - - logger.info(f"Starting AI Model Orchestrator on {host}:{port}") - - uvicorn.run( 
- app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/ai/model-orchestrator/requirements.txt b/ai/model-orchestrator/requirements.txt deleted file mode 100644 index 794b4af..0000000 --- a/ai/model-orchestrator/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -httpx==0.25.1 -docker==6.1.3 -pyyaml==6.0.1 -pydantic==2.5.0 diff --git a/ai/musicgen/Dockerfile b/ai/musicgen/Dockerfile deleted file mode 100644 index 5044496..0000000 --- a/ai/musicgen/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 - -WORKDIR /app - -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3-pip \ - ffmpeg \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Upgrade pip -RUN pip3 install --no-cache-dir --upgrade pip - -# Install PyTorch with CUDA support -RUN pip3 install --no-cache-dir torch==2.1.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 - -# Copy requirements and install dependencies -COPY requirements.txt . -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application code -COPY server.py . - -# Create directory for model cache -RUN mkdir -p /app/models - -# Environment variables -ENV HF_HOME=/app/models -ENV TORCH_HOME=/app/models -ENV MODEL_NAME=facebook/musicgen-medium - -# Expose port -EXPOSE 8000 - -# Run the server -CMD ["python3", "server.py"] diff --git a/ai/musicgen/requirements.txt b/ai/musicgen/requirements.txt deleted file mode 100644 index 37cf773..0000000 --- a/ai/musicgen/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -torch==2.1.0 -torchaudio==2.1.0 -audiocraft==1.3.0 -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 diff --git a/ai/musicgen/server.py b/ai/musicgen/server.py deleted file mode 100644 index 5ea6218..0000000 --- a/ai/musicgen/server.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python3 -""" -MusicGen API Server -OpenAI-compatible API for music generation using Meta's MusicGen - -Endpoints: -- POST /v1/audio/generations - Generate music from text prompt -- GET /health - Health check -- GET / - Service info -""" - -import base64 -import io -import logging -import os -import tempfile -from typing import Optional - -import torch -import torchaudio -from audiocraft.models import MusicGen -from fastapi import FastAPI, HTTPException -from fastapi.responses import JSONResponse -from pydantic import BaseModel, Field - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="MusicGen API Server", version="1.0.0") - -# Global model instance -model: Optional[MusicGen] = None -model_name: str = os.getenv("MODEL_NAME", "facebook/musicgen-medium") -device: str = "cuda" if torch.cuda.is_available() else "cpu" - - -class AudioGenerationRequest(BaseModel): - """Music generation request""" - model: str = Field(default="musicgen-medium", description="Model name") - prompt: str = Field(..., description="Text description of the music to generate") - duration: float = Field(default=30.0, ge=1.0, le=30.0, description="Duration in seconds") - temperature: float = Field(default=1.0, ge=0.1, le=2.0, description="Sampling temperature") - top_k: int = Field(default=250, ge=0, le=500, description="Top-k sampling") - top_p: float = Field(default=0.0, ge=0.0, le=1.0, description="Top-p (nucleus) sampling") - cfg_coef: float = 
Field(default=3.0, ge=1.0, le=15.0, description="Classifier-free guidance coefficient") - response_format: str = Field(default="wav", description="Audio format (wav or mp3)") - - -class AudioGenerationResponse(BaseModel): - """Music generation response""" - audio: str = Field(..., description="Base64-encoded audio data") - format: str = Field(..., description="Audio format (wav or mp3)") - duration: float = Field(..., description="Duration in seconds") - sample_rate: int = Field(..., description="Sample rate in Hz") - - -@app.on_event("startup") -async def startup_event(): - """Load MusicGen model on startup""" - global model - - logger.info(f"Loading MusicGen model: {model_name}") - logger.info(f"Device: {device}") - - # Load model - model = MusicGen.get_pretrained(model_name, device=device) - - logger.info(f"MusicGen model loaded successfully") - logger.info(f"Max duration: 30 seconds at 32kHz") - - -@app.get("/") -async def root(): - """Root endpoint""" - return { - "service": "MusicGen API Server", - "model": model_name, - "device": device, - "max_duration": 30.0, - "sample_rate": 32000 - } - - -@app.get("/health") -async def health(): - """Health check endpoint""" - return { - "status": "healthy" if model else "initializing", - "model": model_name, - "device": device, - "ready": model is not None, - "gpu_available": torch.cuda.is_available() - } - - -@app.post("/v1/audio/generations") -async def generate_audio(request: AudioGenerationRequest) -> AudioGenerationResponse: - """Generate music from text prompt""" - if not model: - raise HTTPException(status_code=503, detail="Model not initialized") - - logger.info(f"Generating music: {request.prompt[:100]}...") - logger.info(f"Duration: {request.duration}s, Temperature: {request.temperature}") - - try: - # Set generation parameters - model.set_generation_params( - duration=request.duration, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - cfg_coef=request.cfg_coef, - ) - - # Generate audio - descriptions = [request.prompt] - with torch.no_grad(): - wav = model.generate(descriptions) - - # wav shape: [batch_size, channels, samples] - # Extract first batch item - audio_data = wav[0].cpu() # [channels, samples] - - # Get sample rate - sample_rate = model.sample_rate - - # Save to temporary file - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: - temp_path = temp_file.name - torchaudio.save(temp_path, audio_data, sample_rate) - - # Read audio file and encode to base64 - with open(temp_path, 'rb') as f: - audio_bytes = f.read() - - # Clean up temporary file - os.unlink(temp_path) - - # Encode to base64 - audio_base64 = base64.b64encode(audio_bytes).decode('utf-8') - - logger.info(f"Generated {request.duration}s of audio") - - return AudioGenerationResponse( - audio=audio_base64, - format="wav", - duration=request.duration, - sample_rate=sample_rate - ) - - except Exception as e: - logger.error(f"Error generating audio: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/v1/models") -async def list_models(): - """List available models (OpenAI-compatible)""" - return { - "object": "list", - "data": [ - { - "id": "musicgen-medium", - "object": "model", - "created": 1234567890, - "owned_by": "meta", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - - -if __name__ == "__main__": - import uvicorn - - host = os.getenv("HOST", "0.0.0.0") - port = int(os.getenv("PORT", "8000")) - - logger.info(f"Starting MusicGen API server on {host}:{port}") - 
- uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/ai/simple_vllm_server.py b/ai/simple_vllm_server.py deleted file mode 100644 index 0075bd2..0000000 --- a/ai/simple_vllm_server.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple vLLM server using AsyncLLMEngine directly -Bypasses the multiprocessing issues we hit with the default vLLM API server -OpenAI-compatible endpoints: /v1/models and /v1/completions -""" - -import asyncio -import json -import logging -import os -from typing import AsyncIterator, Dict, List, Optional - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel, Field -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams -from vllm.utils import random_uuid - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="Simple vLLM Server", version="1.0.0") - -# Global engine instance -engine: Optional[AsyncLLMEngine] = None -model_name: str = "Qwen/Qwen2.5-7B-Instruct" - -# Request/Response models -class CompletionRequest(BaseModel): - """OpenAI-compatible completion request""" - model: str = Field(default="qwen-2.5-7b") - prompt: str | List[str] = Field(..., description="Text prompt(s)") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - -class ChatMessage(BaseModel): - """Chat message format""" - role: str = Field(..., description="Role: system, user, or assistant") - content: str = Field(..., description="Message content") - -class ChatCompletionRequest(BaseModel): - """OpenAI-compatible chat completion request""" - model: str = Field(default="qwen-2.5-7b") - messages: List[ChatMessage] = Field(..., description="Chat messages") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - -@app.on_event("startup") -async def startup_event(): - """Initialize vLLM engine on startup""" - global engine, model_name - - logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - - # Configure engine - engine_args = AsyncEngineArgs( - model=model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir="/workspace/huggingface_cache", # Large disk - trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs for better performance - ) - - # Create async engine - engine = AsyncLLMEngine.from_engine_args(engine_args) - - logger.info("vLLM AsyncLLMEngine initialized successfully") - -@app.get("/") -async def root(): - """Health check endpoint""" - return {"status": "ok", "model": model_name} - -@app.get("/health") -async def health(): - """Detailed health check""" - return { - "status": "healthy" if engine else 
"initializing", - "model": model_name, - "ready": engine is not None - } - -@app.get("/v1/models") -async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", - "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - -def messages_to_prompt(messages: List[ChatMessage]) -> str: - """Convert chat messages to a single prompt string""" - # Qwen 2.5 chat template format - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": final_output.outputs[0].finish_reason, - }) - - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Convert messages to prompt - prompt = messages_to_prompt(request.messages) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) - - request_id = 
random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { - "id": request_id, - "object": "chat.completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, - } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } - } - -if __name__ == "__main__": - import uvicorn - - # Get configuration from environment - host = os.getenv("VLLM_HOST", "0.0.0.0") - port = int(os.getenv("VLLM_PORT", "8000")) - - logger.info(f"Starting vLLM server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/ai/vllm/Dockerfile b/ai/vllm/Dockerfile deleted file mode 100644 index 7dde2d6..0000000 --- a/ai/vllm/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 - -WORKDIR /app - -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - python3.11 \ - python3-pip \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Upgrade pip -RUN pip3 install --no-cache-dir --upgrade pip - -# Install vLLM and dependencies -COPY requirements.txt . -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application code -COPY server.py . 
- -# Create directory for model cache -RUN mkdir -p /workspace/huggingface_cache - -# Environment variables -ENV HF_HOME=/workspace/huggingface_cache -ENV VLLM_HOST=0.0.0.0 -ENV VLLM_PORT=8000 - -# Expose port -EXPOSE 8000 - -# Run the server -CMD ["python3", "server.py"] diff --git a/ai/vllm/requirements.txt b/ai/vllm/requirements.txt deleted file mode 100644 index b702e45..0000000 --- a/ai/vllm/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -vllm==0.6.4.post1 -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 diff --git a/ai/vllm/server.py b/ai/vllm/server.py deleted file mode 100644 index 0075bd2..0000000 --- a/ai/vllm/server.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple vLLM server using AsyncLLMEngine directly -Bypasses the multiprocessing issues we hit with the default vLLM API server -OpenAI-compatible endpoints: /v1/models and /v1/completions -""" - -import asyncio -import json -import logging -import os -from typing import AsyncIterator, Dict, List, Optional - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel, Field -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams -from vllm.utils import random_uuid - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="Simple vLLM Server", version="1.0.0") - -# Global engine instance -engine: Optional[AsyncLLMEngine] = None -model_name: str = "Qwen/Qwen2.5-7B-Instruct" - -# Request/Response models -class CompletionRequest(BaseModel): - """OpenAI-compatible completion request""" - model: str = Field(default="qwen-2.5-7b") - prompt: str | List[str] = Field(..., description="Text prompt(s)") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - -class ChatMessage(BaseModel): - """Chat message format""" - role: str = Field(..., description="Role: system, user, or assistant") - content: str = Field(..., description="Message content") - -class ChatCompletionRequest(BaseModel): - """OpenAI-compatible chat completion request""" - model: str = Field(default="qwen-2.5-7b") - messages: List[ChatMessage] = Field(..., description="Chat messages") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - -@app.on_event("startup") -async def startup_event(): - """Initialize vLLM engine on startup""" - global engine, model_name - - logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - - # Configure engine - engine_args = AsyncEngineArgs( - model=model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir="/workspace/huggingface_cache", # Large disk - trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs 
for better performance - ) - - # Create async engine - engine = AsyncLLMEngine.from_engine_args(engine_args) - - logger.info("vLLM AsyncLLMEngine initialized successfully") - -@app.get("/") -async def root(): - """Health check endpoint""" - return {"status": "ok", "model": model_name} - -@app.get("/health") -async def health(): - """Detailed health check""" - return { - "status": "healthy" if engine else "initializing", - "model": model_name, - "ready": engine is not None - } - -@app.get("/v1/models") -async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", - "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - -def messages_to_prompt(messages: List[ChatMessage]) -> str: - """Convert chat messages to a single prompt string""" - # Qwen 2.5 chat template format - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": final_output.outputs[0].finish_reason, - }) - - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not engine: - return JSONResponse( - 
status_code=503, - content={"error": "Engine not initialized"} - ) - - # Convert messages to prompt - prompt = messages_to_prompt(request.messages) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) - - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { - "id": request_id, - "object": "chat.completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, - } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } - } - -if __name__ == "__main__": - import uvicorn - - # Get configuration from environment - host = os.getenv("VLLM_HOST", "0.0.0.0") - port = int(os.getenv("VLLM_PORT", "8000")) - - logger.info(f"Starting vLLM server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - )
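
The services deleted above exposed OpenAI-style HTTP APIs behind the model orchestrator (default port 9000, per orchestrator.py). A minimal client sketch against those endpoints follows for reference; it is illustrative only, not code from the patch. The base URL `http://gpu-host:9000`, the example prompts, and the output filename are placeholders, and the sketch assumes the removed stack is still deployed somewhere reachable (it now lives in the runpod repository).

```python
#!/usr/bin/env python3
"""Illustrative client sketch for the removed orchestrator API.

Assumptions (not taken from this repository): the orchestrator is reachable
at ORCH_URL, and the default models from models.yaml are still registered.
"""

import base64

import httpx  # httpx is the HTTP client the orchestrator itself used

ORCH_URL = "http://gpu-host:9000"  # placeholder host; 9000 is the orchestrator default port


def chat(prompt: str) -> str:
    """Call the OpenAI-compatible chat endpoint; the orchestrator starts vLLM on demand."""
    resp = httpx.post(
        f"{ORCH_URL}/v1/chat/completions",
        json={
            "model": "qwen-2.5-7b",  # default text model in models.yaml
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 256,
        },
        timeout=600.0,  # generous: a cold model switch can take minutes
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]


def make_music(prompt: str, out_path: str = "clip.wav") -> None:
    """Call the custom audio endpoint and decode the base64 WAV it returns."""
    resp = httpx.post(
        f"{ORCH_URL}/v1/audio/generations",
        json={"model": "musicgen-medium", "prompt": prompt, "duration": 15.0},
        timeout=600.0,
    )
    resp.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(resp.json()["audio"]))


if __name__ == "__main__":
    print(chat("Say hello in one sentence."))
    make_music("calm ambient piano")
```

Because the orchestrator loads models sequentially, a cold request may block for the full switch window (models.yaml allows up to 300 s), which is why the client timeout above is set well beyond typical response times.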