From d5e37dbd3f7ec2460b34b8e6c93414043685c21d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Fri, 21 Nov 2025 14:41:10 +0100 Subject: [PATCH] cleanup: remove GPU/RunPod files from docker-compose repository Removed GPU orchestration files migrated to dedicated runpod repository: - Model orchestrator, vLLM, Flux, MusicGen services - GPU Docker Compose files and configs - GPU deployment scripts and documentation Kept VPS AI services and facefusion: - compose.yaml (VPS AI + facefusion) - litellm-config.yaml (VPS LiteLLM) - postgres/ (VPS PostgreSQL init) - Dockerfile, entrypoint.sh, disable-nsfw-filter.patch (facefusion) - README.md (updated with runpod reference) GPU infrastructure now maintained at: ssh://git@dev.pivoine.art:2222/valknar/runpod.git --- ai/.env.example | 9 - ai/DOCKER_GPU_SETUP.md | 430 -------- ai/GPU_DEPLOYMENT_LOG.md | 421 -------- ai/GPU_EXPANSION_PLAN.md | 1306 ------------------------ ai/README_GPU_SETUP.md | 444 -------- ai/SETUP_GUIDE.md | 261 ----- ai/TAILSCALE_SETUP.md | 417 -------- ai/WIREGUARD_SETUP.md | 393 ------- ai/deploy-gpu-stack.sh | 229 ----- ai/docker-compose.gpu.yaml | 104 -- ai/flux/config/config.json | 13 - ai/gpu-server-compose.yaml | 237 ----- ai/litellm-config-gpu.yaml | 199 ---- ai/model-orchestrator/Dockerfile | 22 - ai/model-orchestrator/models.yaml | 89 -- ai/model-orchestrator/orchestrator.py | 359 ------- ai/model-orchestrator/requirements.txt | 6 - ai/musicgen/Dockerfile | 38 - ai/musicgen/requirements.txt | 6 - ai/musicgen/server.py | 194 ---- ai/simple_vllm_server.py | 302 ------ ai/vllm/Dockerfile | 34 - ai/vllm/requirements.txt | 4 - ai/vllm/server.py | 302 ------ 24 files changed, 5819 deletions(-) delete mode 100644 ai/.env.example delete mode 100644 ai/DOCKER_GPU_SETUP.md delete mode 100644 ai/GPU_DEPLOYMENT_LOG.md delete mode 100644 ai/GPU_EXPANSION_PLAN.md delete mode 100644 ai/README_GPU_SETUP.md delete mode 100644 ai/SETUP_GUIDE.md delete mode 100644 ai/TAILSCALE_SETUP.md delete mode 100644 ai/WIREGUARD_SETUP.md delete mode 100755 ai/deploy-gpu-stack.sh delete mode 100644 ai/docker-compose.gpu.yaml delete mode 100644 ai/flux/config/config.json delete mode 100644 ai/gpu-server-compose.yaml delete mode 100644 ai/litellm-config-gpu.yaml delete mode 100644 ai/model-orchestrator/Dockerfile delete mode 100644 ai/model-orchestrator/models.yaml delete mode 100644 ai/model-orchestrator/orchestrator.py delete mode 100644 ai/model-orchestrator/requirements.txt delete mode 100644 ai/musicgen/Dockerfile delete mode 100644 ai/musicgen/requirements.txt delete mode 100644 ai/musicgen/server.py delete mode 100644 ai/simple_vllm_server.py delete mode 100644 ai/vllm/Dockerfile delete mode 100644 ai/vllm/requirements.txt delete mode 100644 ai/vllm/server.py diff --git a/ai/.env.example b/ai/.env.example deleted file mode 100644 index 0c5c769..0000000 --- a/ai/.env.example +++ /dev/null @@ -1,9 +0,0 @@ -# Environment Variables for Multi-Modal AI Orchestration -# Copy this file to .env and fill in your values - -# Hugging Face Token (for downloading models) -# Get from: https://huggingface.co/settings/tokens -HF_TOKEN=hf_your_token_here - -# Tailscale IP of GPU Server (for VPS to connect) -GPU_TAILSCALE_IP=100.100.108.13 diff --git a/ai/DOCKER_GPU_SETUP.md b/ai/DOCKER_GPU_SETUP.md deleted file mode 100644 index e60d103..0000000 --- a/ai/DOCKER_GPU_SETUP.md +++ /dev/null @@ -1,430 +0,0 @@ -# Docker & NVIDIA Container Toolkit Setup - -## Day 5: Docker Configuration on GPU Server - -This guide sets up Docker with GPU support on 
your RunPod server. - ---- - -## Step 1: Install Docker - -### Quick Install (Recommended) - -```bash -# SSH into GPU server -ssh gpu-pivoine - -# Download and run Docker install script -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# Verify installation -docker --version -docker compose version -``` - -Expected output: -``` -Docker version 24.0.7, build afdd53b -Docker Compose version v2.23.0 -``` - -### Manual Install (Alternative) - -```bash -# Add Docker's official GPG key -apt-get update -apt-get install -y ca-certificates curl gnupg -install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg -chmod a+r /etc/apt/keyrings/docker.gpg - -# Add repository -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null - -# Install Docker -apt-get update -apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Start Docker -systemctl enable docker -systemctl start docker -``` - ---- - -## Step 2: Install NVIDIA Container Toolkit - -This enables Docker containers to use the GPU. - -```bash -# Add NVIDIA repository -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ - gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - -curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - -# Install toolkit -apt-get update -apt-get install -y nvidia-container-toolkit - -# Configure Docker to use NVIDIA runtime -nvidia-ctk runtime configure --runtime=docker - -# Restart Docker -systemctl restart docker -``` - ---- - -## Step 3: Test GPU Access in Docker - -### Test 1: Basic CUDA Container - -```bash -docker run --rm --runtime=nvidia --gpus all \ - nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -``` - -Expected output: Same as `nvidia-smi` output showing your RTX 4090. - -### Test 2: PyTorch Container - -```bash -docker run --rm --runtime=nvidia --gpus all \ - pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime \ - python -c "import torch; print('CUDA:', torch.cuda.is_available(), 'Device:', torch.cuda.get_device_name(0))" -``` - -Expected output: -``` -CUDA: True Device: NVIDIA GeForce RTX 4090 -``` - -### Test 3: Multi-GPU Query (if you have multiple GPUs) - -```bash -docker run --rm --runtime=nvidia --gpus all \ - nvidia/cuda:12.1.0-base-ubuntu22.04 \ - bash -c "echo 'GPU Count:' && nvidia-smi --list-gpus" -``` - ---- - -## Step 4: Configure Docker Compose with GPU Support - -Docker Compose needs to know about NVIDIA runtime. 
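Before creating the daemon config, it's worth checking what Docker already reports. A minimal sanity check (assuming a recent Docker where `docker info` exposes these fields) looks like this:

```bash
# Runtimes Docker currently knows about; "nvidia" should be listed after
# running `nvidia-ctk runtime configure --runtime=docker` in Step 2
docker info --format '{{range $name, $r := .Runtimes}}{{$name}} {{end}}'

# Runtime used when a container doesn't ask for one explicitly
# (expected: runc now, nvidia after the daemon.json change below)
docker info --format '{{.DefaultRuntime}}'
```

If `nvidia` is missing from the runtime list, repeat Step 2 before continuing.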
- -### Create daemon.json - -```bash -cat > /etc/docker/daemon.json << 'EOF' -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - }, - "default-runtime": "nvidia", - "log-driver": "json-file", - "log-opts": { - "max-size": "10m", - "max-file": "3" - } -} -EOF - -# Restart Docker -systemctl restart docker -``` - ---- - -## Step 5: Create GPU Project Structure - -```bash -cd /workspace - -# Create directory structure -mkdir -p gpu-stack/{vllm,comfyui,training,jupyter,monitoring} -cd gpu-stack - -# Create .env file -cat > .env << 'EOF' -# GPU Stack Environment Variables - -# Timezone -TIMEZONE=Europe/Berlin - -# VPN Network -VPS_IP=10.8.0.1 -GPU_IP=10.8.0.2 - -# Model Storage -MODELS_PATH=/workspace/models - -# Hugging Face (optional, for private models) -HF_TOKEN= - -# PostgreSQL (on VPS) -DB_HOST=10.8.0.1 -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui - -# Weights & Biases (optional, for training logging) -WANDB_API_KEY= -EOF - -chmod 600 .env -``` - ---- - -## Step 6: Test Full Stack (Quick Smoke Test) - -Let's deploy a minimal vLLM container to verify everything works: - -```bash -cd /workspace/gpu-stack - -# Create test compose file -cat > test-compose.yaml << 'EOF' -services: - test-vllm: - image: vllm/vllm-openai:latest - container_name: test_vllm - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - command: - - --model - - facebook/opt-125m # Tiny model for testing - - --host - - 0.0.0.0 - - --port - - 8000 - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -EOF - -# Start test -docker compose -f test-compose.yaml up -d - -# Wait 30 seconds for model download -sleep 30 - -# Check logs -docker compose -f test-compose.yaml logs - -# Test inference -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "facebook/opt-125m", - "prompt": "Hello, my name is", - "max_tokens": 10 - }' -``` - -Expected output (JSON response with generated text). 
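For a quicker check than eyeballing the raw JSON, the same request can be piped through `jq` (install it with `apt install -y jq` if needed); this assumes the response follows the usual OpenAI completions shape with a `choices` array:

```bash
# Print only the generated continuation of the prompt
curl -s http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "facebook/opt-125m",
    "prompt": "Hello, my name is",
    "max_tokens": 10
  }' | jq -r '.choices[0].text'
```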
- -**Clean up test:** -```bash -docker compose -f test-compose.yaml down -``` - ---- - -## Step 7: Install Additional Tools - -```bash -# Python tools -apt install -y python3-pip python3-venv - -# Monitoring tools -apt install -y htop nvtop iotop - -# Network tools -apt install -y iperf3 tcpdump - -# Development tools -apt install -y build-essential - -# Git LFS (for large model files) -apt install -y git-lfs -git lfs install -``` - ---- - -## Step 8: Configure Automatic Updates (Optional) - -```bash -# Install unattended-upgrades -apt install -y unattended-upgrades - -# Configure -dpkg-reconfigure -plow unattended-upgrades - -# Enable automatic security updates -cat > /etc/apt/apt.conf.d/50unattended-upgrades << 'EOF' -Unattended-Upgrade::Allowed-Origins { - "${distro_id}:${distro_codename}-security"; -}; -Unattended-Upgrade::Automatic-Reboot "false"; -Unattended-Upgrade::Remove-Unused-Dependencies "true"; -EOF -``` - ---- - -## Troubleshooting - -### Docker can't access GPU - -**Problem:** `docker: Error response from daemon: could not select device driver "" with capabilities: [[gpu]]` - -**Solution:** -```bash -# Verify NVIDIA runtime is configured -docker info | grep -i runtime - -# Should show nvidia in runtimes list -# If not, reinstall nvidia-container-toolkit - -# Check daemon.json -cat /etc/docker/daemon.json - -# Restart Docker -systemctl restart docker -``` - -### Permission denied on docker commands - -**Solution:** -```bash -# Add your user to docker group (if not root) -usermod -aG docker $USER - -# Or always use sudo -sudo docker ... -``` - -### Out of disk space - -**Check usage:** -```bash -df -h -du -sh /var/lib/docker -docker system df -``` - -**Clean up:** -```bash -# Remove unused images -docker image prune -a - -# Remove unused volumes -docker volume prune - -# Full cleanup -docker system prune -a --volumes -``` - ---- - -## Verification Checklist - -Before deploying the full stack: - -- [ ] Docker installed and running -- [ ] `docker --version` shows 24.x or newer -- [ ] `docker compose version` works -- [ ] NVIDIA Container Toolkit installed -- [ ] `docker run --gpus all nvidia/cuda:12.1.0-base nvidia-smi` works -- [ ] PyTorch container can see GPU -- [ ] Test vLLM deployment successful -- [ ] /workspace directory structure created -- [ ] .env file configured with VPN IPs -- [ ] Additional tools installed (nvtop, htop, etc.) - ---- - -## Performance Monitoring Commands - -**GPU Monitoring:** -```bash -# Real-time GPU stats -watch -n 1 nvidia-smi - -# Or with nvtop (prettier) -nvtop - -# GPU memory usage -nvidia-smi --query-gpu=memory.used,memory.total --format=csv -``` - -**Docker Stats:** -```bash -# Container resource usage -docker stats - -# Specific container -docker stats vllm --no-stream -``` - -**System Resources:** -```bash -# Overall system -htop - -# I/O stats -iotop - -# Network -iftop -``` - ---- - -## Next: Deploy Production Stack - -Now you're ready to deploy the full GPU stack with vLLM, ComfyUI, and training tools. 
- -**Proceed to:** Deploying the production docker-compose.yaml - -**Save your progress:** - -```bash -cat >> /workspace/SERVER_INFO.md << 'EOF' - -## Docker Configuration -- Docker Version: [docker --version] -- NVIDIA Runtime: Enabled -- GPU Access in Containers: ✓ -- Test vLLM Deployment: Successful -- Directory: /workspace/gpu-stack - -## Tools Installed -- nvtop: GPU monitoring -- htop: System monitoring -- Docker Compose: v2.x -- Git LFS: Large file support -EOF -``` diff --git a/ai/GPU_DEPLOYMENT_LOG.md b/ai/GPU_DEPLOYMENT_LOG.md deleted file mode 100644 index 206097b..0000000 --- a/ai/GPU_DEPLOYMENT_LOG.md +++ /dev/null @@ -1,421 +0,0 @@ -# GPU Server Deployment Log - -## Current Deployment (2025-11-21) - -### Infrastructure -- **Provider**: RunPod (Spot Instance) -- **GPU**: NVIDIA RTX 4090 24GB -- **Disk**: 50GB local SSD (expanded from 20GB) -- **Network Volume**: 922TB at `/workspace` -- **Region**: Europe -- **Cost**: ~$0.50/hour (~$360/month if running 24/7) - -### Network Configuration -- **VPN**: Tailscale (replaces WireGuard due to RunPod UDP restrictions) -- **GPU Server Tailscale IP**: 100.100.108.13 -- **VPS Tailscale IP**: (get with `tailscale ip -4` on VPS) - -### SSH Access -``` -Host gpu-pivoine - HostName 213.173.102.232 - Port 29695 - User root - IdentityFile ~/.ssh/id_ed25519 -``` - -**Note**: RunPod Spot instances can be terminated and restarted with new ports/IPs. Update SSH config accordingly. - -### Software Stack -- **Python**: 3.11.10 -- **vLLM**: 0.6.4.post1 (installed with pip) -- **PyTorch**: 2.5.1 with CUDA 12.4 -- **Tailscale**: Installed via official script - -### vLLM Deployment - -**Custom Server**: `ai/simple_vllm_server.py` -- Uses `AsyncLLMEngine` directly to bypass multiprocessing issues -- OpenAI-compatible API endpoints: - - `GET /v1/models` - List available models - - `POST /v1/completions` - Text completion - - `POST /v1/chat/completions` - Chat completion -- Default model: Qwen/Qwen2.5-7B-Instruct -- Cache directory: `/workspace/huggingface_cache` - -**Deployment Command**: -```bash -# Copy server script to GPU server -scp ai/simple_vllm_server.py gpu-pivoine:/workspace/ - -# Start server -ssh gpu-pivoine "cd /workspace && nohup python3 simple_vllm_server.py > vllm.log 2>&1 &" - -# Check status -ssh gpu-pivoine "curl http://localhost:8000/v1/models" -``` - -**Server Configuration** (environment variables): -- `VLLM_HOST`: 0.0.0.0 (default) -- `VLLM_PORT`: 8000 (default) - -### Model Configuration -- **Model**: Qwen/Qwen2.5-7B-Instruct (no auth required) -- **Context Length**: 4096 tokens -- **GPU Memory**: 85% utilization -- **Tensor Parallel**: 1 (single GPU) - -### Known Issues & Solutions - -#### Issue 1: vLLM Multiprocessing Errors -**Problem**: Default vLLM v1 engine fails with ZMQ/CUDA multiprocessing errors on RunPod. -**Solution**: Custom `AsyncLLMEngine` FastAPI server bypasses multiprocessing layer entirely. - -#### Issue 2: Disk Space (Solved) -**Problem**: Original 20GB disk filled up with Hugging Face cache. -**Solution**: Expanded to 50GB and use `/workspace` for model cache. - -#### Issue 3: Gated Models -**Problem**: Llama models require Hugging Face authentication. -**Solution**: Use Qwen 2.5 7B Instruct (no auth required) or set `HF_TOKEN` environment variable. - -#### Issue 4: Spot Instance Volatility -**Problem**: RunPod Spot instances can be terminated anytime. -**Solution**: Accept as trade-off for cost savings. Document SSH details for quick reconnection. 
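A small helper script can make that reconnection faster by rewriting the `gpu-pivoine` entry in `~/.ssh/config` when the pod comes back with a new address. This is only a sketch: it assumes GNU `sed`, an SSH config block shaped like the one above, and that the new IP and port are copied manually from the RunPod dashboard.

```bash
#!/usr/bin/env bash
# update-gpu-ssh.sh NEW_IP NEW_PORT — repoint the gpu-pivoine alias after a pod restart
set -euo pipefail

NEW_IP="$1"    # public IP shown in the RunPod dashboard
NEW_PORT="$2"  # exposed SSH port shown in the RunPod dashboard

# Only touch lines between "Host gpu-pivoine" and the next "Host" entry
sed -i \
  -e "/^Host gpu-pivoine$/,/^Host /s/^\( *HostName \).*/\1${NEW_IP}/" \
  -e "/^Host gpu-pivoine$/,/^Host /s/^\( *Port \).*/\1${NEW_PORT}/" \
  ~/.ssh/config

# Confirm the alias works and the GPU is visible again
ssh gpu-pivoine "hostname && nvidia-smi --query-gpu=name --format=csv,noheader"
```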
- -### Monitoring - -**Check vLLM logs**: -```bash -ssh gpu-pivoine "tail -f /workspace/vllm.log" -``` - -**Check GPU usage**: -```bash -ssh gpu-pivoine "nvidia-smi" -``` - -**Check Tailscale status**: -```bash -ssh gpu-pivoine "tailscale status" -``` - -**Test API locally (on GPU server)**: -```bash -ssh gpu-pivoine "curl http://localhost:8000/v1/models" -``` - -**Test API via Tailscale (from VPS)**: -```bash -curl http://100.100.108.13:8000/v1/models -``` - -### LiteLLM Integration - -Update VPS LiteLLM config at `ai/litellm-config-gpu.yaml`: - -```yaml -# Replace old WireGuard IP (10.8.0.2) with Tailscale IP -- model_name: qwen-2.5-7b - litellm_params: - model: openai/qwen-2.5-7b - api_base: http://100.100.108.13:8000/v1 # Tailscale IP - api_key: dummy - rpm: 1000 - tpm: 100000 -``` - -Restart LiteLLM: -```bash -arty restart litellm -``` - -### Troubleshooting - -**Server not responding**: -1. Check if process is running: `pgrep -f simple_vllm_server` -2. Check logs: `tail -100 /workspace/vllm.log` -3. Check GPU availability: `nvidia-smi` -4. Restart server: `pkill -f simple_vllm_server && python3 /workspace/simple_vllm_server.py &` - -**Tailscale not connected**: -1. Check status: `tailscale status` -2. Check daemon: `ps aux | grep tailscaled` -3. Restart: `tailscale down && tailscale up` - -**Model download failing**: -1. Check disk space: `df -h` -2. Check cache directory: `ls -lah /workspace/huggingface_cache` -3. Clear cache if needed: `rm -rf /workspace/huggingface_cache/*` - -### Deployment Status ✅ COMPLETE - -**Deployment Date**: 2025-11-21 - -1. ✅ Deploy vLLM with Qwen 2.5 7B - COMPLETE -2. ✅ Test API endpoints locally and via Tailscale - COMPLETE -3. ✅ Update VPS LiteLLM configuration - COMPLETE -4. ✅ Test end-to-end: Open WebUI → LiteLLM → vLLM - COMPLETE -5. ⏳ Monitor performance and costs - ONGOING - -**Model Available**: `qwen-2.5-7b` visible in Open WebUI at https://ai.pivoine.art - -### Next Steps (2025-11-21 Original) -6. ✅ Consider adding more models → COMPLETE (added Flux.1 Schnell + MusicGen Medium) -7. ⏹️ Set up auto-stop for idle periods to save costs - ---- - -## Multi-Modal Architecture (2025-11-21 Update) - -### Overview - -Expanded GPU deployment to support **text, image, and music generation** with intelligent model orchestration. All models run sequentially on a single RTX 4090 GPU with automatic switching based on request type. - -### Architecture Components - -#### 1. **Orchestrator Service** (Port 9000 - Always Running) -- **Location**: `ai/model-orchestrator/` -- **Purpose**: Central service managing model lifecycle -- **Features**: - - Detects request type (text/image/audio) - - Automatically unloads current model - - Loads requested model - - Proxies requests to active model - - Tracks GPU memory usage -- **Technology**: FastAPI + Docker SDK Python -- **Endpoints**: - - `POST /v1/chat/completions` → Routes to text models - - `POST /v1/images/generations` → Routes to image models - - `POST /v1/audio/generations` → Routes to music models - - `GET /health` → Shows active model and status - - `GET /models` → Lists all available models - - `POST /switch` → Manually switch models - -#### 2. **Text Generation** (vLLM + Qwen 2.5 7B) -- **Service**: `vllm-qwen` (Port 8001) -- **Location**: `ai/vllm/` -- **Model**: Qwen/Qwen2.5-7B-Instruct -- **VRAM**: 14GB (85% GPU utilization) -- **Speed**: ~50 tokens/second -- **Startup**: 120 seconds -- **Status**: ✅ Working (same as original deployment) - -#### 3. 
**Image Generation** (Flux.1 Schnell) -- **Service**: `flux` (Port 8002) -- **Location**: `ai/flux/` -- **Model**: black-forest-labs/FLUX.1-schnell -- **VRAM**: 14GB with CPU offloading -- **Speed**: 4-5 seconds per image -- **Startup**: 60 seconds -- **Features**: OpenAI DALL-E compatible API -- **Image**: `ghcr.io/matatonic/openedai-images-flux:latest` - -#### 4. **Music Generation** (MusicGen Medium) -- **Service**: `musicgen` (Port 8003) -- **Location**: `ai/musicgen/` -- **Model**: facebook/musicgen-medium -- **VRAM**: 11GB -- **Speed**: 60-90 seconds for 30 seconds of audio -- **Startup**: 45 seconds -- **Features**: Text-to-music generation with sampling controls -- **Technology**: Meta's AudioCraft + custom FastAPI wrapper - -### Model Registry (`models.yaml`) - -Simple configuration file for managing all models: - -```yaml -models: - qwen-2.5-7b: - type: text - framework: vllm - docker_service: vllm-qwen - port: 8001 - vram_gb: 14 - startup_time_seconds: 120 - endpoint: /v1/chat/completions - - flux-schnell: - type: image - framework: openedai-images - docker_service: flux - port: 8002 - vram_gb: 14 - startup_time_seconds: 60 - endpoint: /v1/images/generations - - musicgen-medium: - type: audio - framework: audiocraft - docker_service: musicgen - port: 8003 - vram_gb: 11 - startup_time_seconds: 45 - endpoint: /v1/audio/generations -``` - -**Adding new models**: Just add a new entry to this file and define the Docker service. - -### Deployment Changes - -#### Docker Compose Structure -- **File**: `docker-compose.gpu.yaml` -- **Services**: 4 total (1 orchestrator + 3 models) -- **Profiles**: `text`, `image`, `audio` (orchestrator manages activation) -- **Restart Policy**: `no` for models (orchestrator controls lifecycle) -- **Volumes**: All model caches on `/workspace` (922TB network volume) - -#### LiteLLM Integration -Updated `litellm-config.yaml` to route all self-hosted models through orchestrator: - -```yaml -# Text -- model_name: qwen-2.5-7b - api_base: http://100.100.108.13:9000/v1 # Orchestrator - -# Image -- model_name: flux-schnell - api_base: http://100.100.108.13:9000/v1 # Orchestrator - -# Music -- model_name: musicgen-medium - api_base: http://100.100.108.13:9000/v1 # Orchestrator -``` - -All models now available via Open WebUI at https://ai.pivoine.art - -### Usage Examples - -**Text Generation**: -```bash -curl http://100.100.108.13:9000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "qwen-2.5-7b", "messages": [{"role": "user", "content": "Hello"}]}' -``` - -**Image Generation**: -```bash -curl http://100.100.108.13:9000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell", "prompt": "a cute cat", "size": "1024x1024"}' -``` - -**Music Generation**: -```bash -curl http://100.100.108.13:9000/v1/audio/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "musicgen-medium", "prompt": "upbeat electronic", "duration": 30}' -``` - -### Deployment Commands - -```bash -# Copy all files to RunPod -scp -r ai/* gpu-pivoine:/workspace/ai/ - -# SSH to GPU server -ssh gpu-pivoine -cd /workspace/ai/ - -# Start orchestrator (manages everything) -docker compose -f docker-compose.gpu.yaml up -d orchestrator - -# Check status -curl http://100.100.108.13:9000/health - -# View logs -docker logs -f ai_orchestrator - -# Manually switch models (optional) -curl -X POST http://100.100.108.13:9000/switch \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell"}' -``` - -### Performance 
Characteristics - -| Model | VRAM | Startup Time | Generation Time | Notes | -|-------|------|--------------|-----------------|-------| -| Qwen 2.5 7B | 14GB | 120s | ~50 tok/sec | Fast text generation | -| Flux.1 Schnell | 14GB | 60s | 4-5s/image | High-quality images | -| MusicGen Medium | 11GB | 45s | 60-90s for 30s audio | Text-to-music | - -**Model Switching Overhead**: 30-120 seconds (unload + load) - -### Cost Analysis - -**Current (Single GPU Sequential)**: -- Cost: ~$0.50/hour -- Monthly: ~$360 (24/7) or ~$120 (8hr/day) -- Trade-off: 30-120s switching time - -**Alternative (Multi-GPU Concurrent)**: -- Cost: ~$0.75/hour (+50%) -- Monthly: ~$540 (24/7) or ~$180 (8hr/day) -- Benefit: No switching time, all models always available - -**Decision**: Stick with single GPU for cost optimization. Switching time is acceptable for most use cases. - -### Known Limitations - -1. **Sequential Only**: Only one model active at a time -2. **Switching Latency**: 30-120 seconds to change models -3. **MusicGen License**: Pre-trained weights are CC-BY-NC (non-commercial) -4. **Spot Instance Volatility**: Pod can be terminated anytime - -### Monitoring - -**Check active model**: -```bash -curl http://100.100.108.13:9000/health | jq '{model: .current_model, vram: .model_info.vram_gb}' -``` - -**View orchestrator logs**: -```bash -docker logs -f ai_orchestrator -``` - -**GPU usage**: -```bash -ssh gpu-pivoine "nvidia-smi" -``` - -### Deployment Status ✅ COMPLETE (Multi-Modal) - -**Deployment Date**: 2025-11-21 - -1. ✅ Create model orchestrator service - COMPLETE -2. ✅ Deploy vLLM text generation (Qwen 2.5 7B) - COMPLETE -3. ✅ Deploy Flux.1 Schnell image generation - COMPLETE -4. ✅ Deploy MusicGen Medium music generation - COMPLETE -5. ✅ Update LiteLLM configuration - COMPLETE -6. ✅ Test all three model types via orchestrator - READY FOR TESTING -7. ⏳ Monitor performance and costs - ONGOING - -**Models Available**: `qwen-2.5-7b`, `flux-schnell`, `musicgen-medium` via Open WebUI - -### Future Model Additions - -**Easy to add** (just edit `models.yaml`): -- Llama 3.1 8B Instruct (text, gated model) -- Whisper Large v3 (speech-to-text) -- XTTS v2 (text-to-speech) -- Stable Diffusion XL (alternative image generation) - -See `README.md` for detailed instructions on adding new models. - -### Cost Optimization Ideas -1. **Auto-stop**: Configure RunPod to auto-stop after 30 minutes idle -2. **Spot Instances**: Already using Spot for 50% cost reduction -3. **Scheduled Operation**: Run only during business hours (8 hours/day = $120/month) -4. **Smaller Models**: Use Mistral 7B or quantized models for lighter workloads -5. **Pay-as-you-go**: Manually start/stop pod as needed - -### Performance Benchmarks -*To be measured after deployment* - -Expected (based on RTX 4090): -- Qwen 2.5 7B: 50-80 tokens/second -- Context processing: ~2-3 seconds for 1000 tokens -- First token latency: ~200-300ms diff --git a/ai/GPU_EXPANSION_PLAN.md b/ai/GPU_EXPANSION_PLAN.md deleted file mode 100644 index d34ea01..0000000 --- a/ai/GPU_EXPANSION_PLAN.md +++ /dev/null @@ -1,1306 +0,0 @@ -# GPU-Enhanced AI Stack Expansion Plan - -## Executive Summary - -This document outlines a comprehensive plan to extend the current AI stack (LiteLLM, Open WebUI, Crawl4AI) with dedicated GPU hosting capabilities for: -- **LLM Model Hosting**: Self-hosted models (Llama, Mistral, Qwen, etc.) 
-- **Model Training**: Fine-tuning and training workflows -- **Image Generation**: Stable Diffusion, FLUX via ComfyUI -- **Video Generation**: AnimateDiff, CogVideo, etc. - -**Current Architecture**: CPU-based stack on pivoine.art VPS → Claude API via LiteLLM -**Target Architecture**: Hybrid stack with GPU server(s) for self-hosted models + API-based models - ---- - -## Phase 1: Current Stack Analysis - -### Existing Components - -1. **ai_postgres** (pgvector/pgvector:pg16) - - PostgreSQL with pgvector for RAG - - Stores: conversations, embeddings, LiteLLM logs - -2. **webui** (Open WebUI) - - User-facing ChatGPT-like interface - - URL: https://ai.pivoine.art - - Features: RAG, web search, document upload - - Connected to LiteLLM proxy - -3. **litellm** (LiteLLM proxy) - - Currently proxies Anthropic Claude API - - OpenAI-compatible endpoint at http://litellm:4000 - - Supports multiple providers via config - -4. **crawl4ai** - - Internal web scraping for LLM content prep - - Port 11235 (internal only) - -5. **facefusion** (CPU-only) - - Face swapping/enhancement - - Currently CPU-based (slow) - - Protected by Authelia SSO - -### Current Limitations - -- ❌ No self-hosted LLMs (relies on expensive API calls) -- ❌ No GPU acceleration for facefusion -- ❌ No image generation capabilities -- ❌ No model training/fine-tuning capabilities -- ❌ No video generation -- ❌ High operational costs for API usage - ---- - -## Phase 2: GPU Provider Comparison - -### Provider Options - -#### 1. **RunPod** ⭐ RECOMMENDED -**Pros:** -- Pay-per-second GPU billing -- Wide GPU selection (RTX 4090, A100, H100) -- Docker-first platform -- Global locations -- Easy HTTP/SSH tunneling -- Volume persistence - -**Pricing (Approximate):** -- RTX 4090 (24GB): ~$0.50/hour ($360/month 24/7) -- RTX 3090 (24GB): ~$0.35/hour ($250/month) -- A6000 (48GB): ~$0.80/hour ($576/month) -- A100 (40GB): ~$1.50/hour ($1,080/month) - -**Best for:** On-demand workloads, experimentation, cost-conscious hosting - ---- - -#### 2. **Lambda Labs** -**Pros:** -- Flat monthly pricing -- High-end GPUs (A100, H100) -- Jupyter notebooks included -- Fast network - -**Pricing:** -- 1x A100 (40GB): $1.10/hour ($792/month) -- 8x A100 (40GB): $8.00/hour (~$5,760/month) - -**Best for:** Research, high-utilization workloads - ---- - -#### 3. **Vast.ai** -**Pros:** -- Marketplace model (cheapest) -- Many GPU options -- Spot pricing available - -**Cons:** -- Variable reliability -- Setup complexity -- Community-hosted machines - -**Pricing:** -- RTX 4090: ~$0.25-0.40/hour -- A100: ~$0.80-1.20/hour - -**Best for:** Budget-conscious, experimental workloads - ---- - -#### 4. **Google Cloud Platform (GCP)** -**Pros:** -- Enterprise reliability -- Auto-scaling -- Integration with Google services -- Preemptible instances available - -**Pricing:** -- T4 (16GB): ~$0.35/hour -- V100 (16GB): ~$2.48/hour -- A100 (40GB): ~$2.93/hour -- TPU options available - -**Best for:** Enterprise workloads, auto-scaling needs - ---- - -#### 5. **AWS** -**Pros:** -- Global infrastructure -- Broad GPU selection -- Spot instances for cost savings -- Enterprise support - -**Pricing:** -- g4dn.xlarge (T4 16GB): ~$0.526/hour -- p3.2xlarge (V100 16GB): ~$3.06/hour -- p4d.24xlarge (8x A100 40GB): ~$32.77/hour - -**Best for:** Enterprise, existing AWS infrastructure - ---- - -#### 6. 
**Hugging Face Spaces / Inference Endpoints** -**Pros:** -- Managed model hosting -- Auto-scaling -- Simple deployment -- Community models - -**Pricing:** -- CPU: $0.03/hour -- T4: $0.60/hour -- A10G: $1.00/hour -- A100: $4.00/hour - -**Best for:** Quick model deployment, serverless inference - ---- - -### Recommendation: **RunPod** for Primary GPU Server - -**Rationale:** -1. **Cost-effective**: Pay-per-second billing, ~$0.50/hour for RTX 4090 -2. **Docker-native**: Easy integration with existing compose stack -3. **Flexibility**: Start/stop as needed, scale up for training -4. **Community**: Large user base, good documentation -5. **Network**: Built-in HTTP/SSH tunneling - -**Supplementary**: Use Hugging Face for specific model hosting if needed - ---- - -## Phase 3: Architecture Design - -### Network Topology - -``` -┌─────────────────────────────────────────────────────────────┐ -│ pivoine.art VPS (CPU-based) │ -├─────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ -│ │ Open │─────▶│ LiteLLM │◀────▶│ ai_ │ │ -│ │ WebUI │ │ Proxy │ │ postgres │ │ -│ └──────────┘ └──────────┘ └──────────┘ │ -│ │ │ │ -│ │ │ │ -└───────┼──────────────────┼──────────────────────────────────┘ - │ │ - │ ▼ - │ ┌─────────────────┐ - │ │ Anthropic API │ - │ │ (Claude) │ - │ └─────────────────┘ - │ - ▼ -┌────────────────────────────────────────────────────────────┐ -│ GPU Server (RunPod) │ -├────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ -│ │ vLLM │ │ ComfyUI │ │ Model │ │ JupyterLab│ │ -│ │ (LLMs) │ │ (SD/FLUX)│ │ Training │ │ │ │ -│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ -│ │ │ │ │ │ -│ └──────────────┴─────────────┴──────────────┘ │ -│ │ │ -│ ┌───────────────┐ │ -│ │ Model Storage │ │ -│ │ (Persistent) │ │ -│ └───────────────┘ │ -│ │ -└────────────────────────────────────────────────────────────┘ - │ - ▼ (Tunneled via WireGuard or Tailscale) -┌────────────────────────────────────────────────────────────┐ -│ Integration Options: │ -├────────────────────────────────────────────────────────────┤ -│ 1. LiteLLM adds vLLM endpoint (http://gpu.internal:8000) │ -│ 2. ComfyUI exposed via subdomain (comfy.ai.pivoine.art) │ -│ 3. Model storage synced via rclone/restic │ -└────────────────────────────────────────────────────────────┘ -``` - -### Connection Methods - -#### Option A: WireGuard VPN (RECOMMENDED) -- Create WireGuard tunnel between VPS and GPU server -- GPU services accessible via private IPs -- Secure, low overhead, easy to manage -- Already have wg-easy in your stack - -**Setup:** -1. Deploy WireGuard on GPU server -2. Add GPU server as VPN peer -3. Configure LiteLLM to use VPN IPs - -#### Option B: SSH Tunnel -- SSH reverse tunnel from GPU to VPS -- Simple, no additional software -- Higher latency - -#### Option C: Tailscale -- Zero-config VPN mesh -- Easy setup, good UX -- Proprietary (but free tier available) - ---- - -## Phase 4: Service Implementation Plans - -### 4.1 LLM Hosting with vLLM - -**vLLM** is the industry-standard for high-performance LLM inference. 
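Because it exposes the OpenAI wire format, existing clients only need a new base URL. As a rough sketch of the kind of request LiteLLM will forward once the service below is running (model name taken from the compose command, host assumed to be the GPU server itself):

```bash
# Chat completion against the self-hosted vLLM endpoint (run on the GPU server)
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "Explain PagedAttention in one sentence."}],
    "max_tokens": 64
  }'
```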
- -#### Features: -- PagedAttention for efficient KV cache -- Continuous batching -- OpenAI-compatible API -- Tensor parallelism for multi-GPU -- Quantization support (AWQ, GPTQ) - -#### Docker Compose Configuration: - -```yaml -services: - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: 0 - volumes: - - vllm_models:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct # or any model - - --host - - 0.0.0.0 - - --port - - 8000 - - --tensor-parallel-size - - '1' - - --gpu-memory-utilization - - '0.9' - - --max-model-len - - '8192' - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Recommended Models for RTX 4090 (24GB): - -**Text Generation:** -- Llama 3.1 8B Instruct (8GB VRAM, fast) -- Qwen2.5 14B Instruct (14GB VRAM, multilingual) -- Mistral 7B Instruct v0.3 (7GB VRAM) -- Nous Hermes 2 Mixtral 8x7B (with quantization, 16GB) - -**Code:** -- DeepSeek Coder 6.7B (7GB VRAM) -- CodeLlama 13B (13GB VRAM) -- Qwen2.5-Coder 14B (14GB VRAM) - -#### Integration with LiteLLM: - -Add to `ai/litellm-config.yaml`: - -```yaml -model_list: - # Existing Anthropic - - model_name: claude-sonnet-4-5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - # New vLLM models - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://gpu.internal:8000/v1 - api_key: dummy - - - model_name: qwen-2.5-14b - litellm_params: - model: openai/Qwen/Qwen2.5-14B-Instruct - api_base: http://gpu.internal:8000/v1 - api_key: dummy -``` - ---- - -### 4.2 ComfyUI for Image/Video Generation - -**ComfyUI** is a node-based UI for Stable Diffusion with advanced workflows. - -#### Features: -- Node-based workflow editor -- Support for SD 1.5, SDXL, SD3, FLUX -- ControlNet, LoRA, embeddings -- Video generation (AnimateDiff, SVD) -- API for automation - -#### Docker Compose Configuration: - -```yaml -services: - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - # Custom nodes auto-install - COMFYUI_FLAGS: --listen 0.0.0.0 --port 8188 - volumes: - - comfyui_data:/data - - comfyui_models:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - ports: - - "8188:8188" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Model Downloads (via ComfyUI Manager): - -**Stable Diffusion Models:** -- FLUX.1-dev (12GB, newest, best quality) -- FLUX.1-schnell (12GB, fast) -- SDXL Base 1.0 (6.9GB) -- SD 1.5 (4GB, fast, wide LoRA support) - -**ControlNet Models:** -- controlnet-canny-sdxl -- controlnet-depth-sdxl -- controlnet-openpose-sdxl - -**LoRA Models** (download from Civitai): -- Style LoRAs (anime, realistic, etc.) 
-- Character LoRAs -- Concept LoRAs - -#### Traefik Integration: - -Add subdomain routing for ComfyUI: - -```yaml -labels: - - 'traefik.enable=true' - - 'traefik.http.routers.comfyui-web-secure.rule=Host(`comfy.ai.pivoine.art`)' - - 'traefik.http.routers.comfyui-web-secure.tls.certresolver=resolver' - - 'traefik.http.routers.comfyui-web-secure.entrypoints=web-secure' - - 'traefik.http.routers.comfyui-web-secure.middlewares=net-authelia,security-headers@file' - - 'traefik.http.services.comfyui.loadbalancer.server.port=8188' -``` - -#### Open WebUI Integration: - -ComfyUI has a REST API that can be called from Open WebUI using function calling. - -Example workflow API call: -```python -import requests - -def generate_image(prompt: str, negative_prompt: str = ""): - workflow = { - # ComfyUI workflow JSON - } - response = requests.post( - "http://comfyui:8188/prompt", - json={"prompt": workflow} - ) - return response.json() -``` - ---- - -### 4.3 Model Training Infrastructure - -For fine-tuning LLMs and training custom models. - -#### Option A: Axolotl (Recommended) - -**Axolotl** is a user-friendly fine-tuning framework supporting: -- LoRA, QLoRA -- Full fine-tuning -- RLHF/DPO -- Multi-GPU training - -```yaml -services: - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - WANDB_API_KEY: ${WANDB_API_KEY:-} # Optional: Weights & Biases logging - command: | - bash -c " - accelerate launch -m axolotl.cli.train /workspace/configs/config.yaml - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Training Workflow: -1. Prepare dataset (JSONL format) -2. Create Axolotl config (LoRA, batch size, epochs) -3. Start training container -4. Monitor via Weights & Biases or TensorBoard -5. Export LoRA adapters -6. 
Merge with base model or use in vLLM - -#### Example Config: -```yaml -# training/configs/lora-llama3.yaml -base_model: meta-llama/Meta-Llama-3.1-8B-Instruct -model_type: AutoModelForCausalLM -tokenizer_type: AutoTokenizer - -load_in_8bit: false -load_in_4bit: true -strict: false - -datasets: - - path: /workspace/data/train.jsonl - type: completion - field: text - -output_dir: /workspace/output/llama3-lora - -adapter: lora -lora_r: 16 -lora_alpha: 32 -lora_dropout: 0.05 -lora_target_modules: - - q_proj - - v_proj - - k_proj - - o_proj - -gradient_accumulation_steps: 4 -micro_batch_size: 2 -num_epochs: 3 -learning_rate: 0.0002 - -optimizer: adamw_bnb_8bit -lr_scheduler: cosine -warmup_steps: 100 -``` - -#### Option B: JupyterLab for Custom Training - -For research and custom training scripts: - -```yaml -services: - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - runtime: nvidia - volumes: - - ./notebooks:/workspace - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - command: | - bash -c " - pip install jupyterlab transformers datasets accelerate bitsandbytes peft && - jupyter lab --ip=0.0.0.0 --allow-root --no-browser --NotebookApp.token='' - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - ---- - -### 4.4 Model Storage Strategy - -#### Storage Requirements: - -**Per Model Type:** -- LLM 7B: ~14GB (FP16) -- LLM 13B: ~26GB -- SDXL: ~7GB -- FLUX: ~12GB -- ControlNet: ~2.5GB each -- LoRA: ~100-500MB each - -**Total Estimated:** -- 3-4 LLMs: ~80GB -- SD models + LoRAs: ~50GB -- Training checkpoints: ~100GB -- **Total: 250-300GB minimum** - -#### RunPod Storage Options: - -1. **Network Volume** (Recommended) - - Persistent across pod restarts - - Shared between multiple pods - - ~$0.10/GB/month - - 500GB = $50/month - -2. **Container Disk** - - Included with pod - - Lost when pod stops - - Good for temporary storage - -3. 
**External Storage (rclone)** - - Sync to/from VPS or cloud storage - - Backup models to Backblaze B2 or Wasabi - - Good for disaster recovery - -#### Model Management: - -Use **Hugging Face Hub** as model cache: - -```bash -# Download models on first run -huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct \ - --local-dir /models/llama-3.1-8b - -# Or let vLLM/ComfyUI auto-download -``` - -**Model Sync Script:** -```bash -#!/bin/bash -# sync-models.sh - Sync models from VPS to GPU server - -rclone sync \ - /mnt/hidrive/AI/models \ - gpu:/workspace/models \ - --progress \ - --transfers 4 -``` - ---- - -## Phase 5: Implementation Roadmap - -### Week 1: Infrastructure Setup - -**Day 1-2: RunPod Account & GPU Server** -- [ ] Create RunPod account -- [ ] Deploy RTX 4090 pod with Ubuntu 22.04 + PyTorch template -- [ ] Configure persistent network volume (500GB) -- [ ] Set up SSH access - -**Day 3-4: Network Configuration** -- [ ] Deploy WireGuard on GPU server -- [ ] Add GPU server as peer to existing VPN (vpn/compose.yaml) -- [ ] Test connectivity between VPS and GPU server -- [ ] Configure firewall rules - -**Day 5: Docker Setup on GPU Server** -- [ ] Install Docker + NVIDIA Container Toolkit -- [ ] Create docker-compose.yaml for GPU services -- [ ] Test GPU access in containers - ---- - -### Week 2: LLM Hosting - -**Day 1-2: vLLM Deployment** -- [ ] Deploy vLLM container -- [ ] Download Llama 3.1 8B Instruct -- [ ] Test inference locally -- [ ] Benchmark performance (tokens/sec) - -**Day 3-4: LiteLLM Integration** -- [ ] Update litellm-config.yaml with vLLM endpoint -- [ ] Test via Open WebUI -- [ ] Configure model routing (cheap models → vLLM, complex → Claude) -- [ ] Set up usage monitoring - -**Day 5: Model Expansion** -- [ ] Download Qwen 2.5 14B -- [ ] Download Mistral 7B Instruct -- [ ] Test model switching in Open WebUI -- [ ] Document performance characteristics - ---- - -### Week 3: Image Generation - -**Day 1-2: ComfyUI Setup** -- [ ] Deploy ComfyUI container -- [ ] Download FLUX.1-schnell -- [ ] Download SDXL -- [ ] Install ComfyUI Manager - -**Day 3-4: Model Downloads** -- [ ] Download ControlNet models -- [ ] Download VAE models -- [ ] Download popular LoRAs from Civitai -- [ ] Organize model directory - -**Day 5: Integration & Workflows** -- [ ] Create basic text-to-image workflow -- [ ] Create ControlNet workflow -- [ ] Test API access -- [ ] Add Traefik subdomain (comfy.ai.pivoine.art) - ---- - -### Week 4: Training Infrastructure - -**Day 1-2: Axolotl Setup** -- [ ] Deploy Axolotl container -- [ ] Create sample dataset -- [ ] Test LoRA fine-tuning with tiny model -- [ ] Verify GPU utilization - -**Day 3-4: JupyterLab Setup** -- [ ] Deploy JupyterLab container -- [ ] Install ML libraries -- [ ] Create example notebooks -- [ ] Test custom training scripts - -**Day 5: Documentation & Testing** -- [ ] Write training guides -- [ ] Test end-to-end workflows -- [ ] Benchmark training speeds -- [ ] Document best practices - ---- - -### Ongoing: Optimization & Expansion - -**Month 2:** -- Monitor costs and optimize GPU utilization -- Implement model caching strategies -- Add more models based on usage patterns -- Set up automated model updates -- Implement usage quotas per user - -**Month 3+:** -- Consider multi-GPU setup for larger models -- Implement model quantization (AWQ/GPTQ) -- Add video generation (AnimateDiff, CogVideo) -- Explore voice synthesis (XTTS, Bark) -- Custom model training for specific use cases - ---- - -## Phase 6: Cost Analysis - -### Scenario A: 
Single RTX 4090 (24/7) - -**GPU Server (RunPod):** -- RTX 4090 pod: $0.50/hour × 720 hours = $360/month -- 500GB network volume: $50/month -- **Subtotal: $410/month** - -**VPS (Existing):** -- No change in cost - -**Total: ~$410/month** - -**Savings:** -- Claude API costs reduced by ~80% (self-hosted for routine tasks) -- Break-even if currently spending >$500/month on API calls - ---- - -### Scenario B: Pay-as-you-go (8 hours/day) - -**GPU Server (RunPod):** -- RTX 4090: $0.50/hour × 8 hours × 30 days = $120/month -- Storage: $50/month -- **Subtotal: $170/month** - -**Best for:** -- Development/experimentation -- Burst workloads -- Image generation on-demand - ---- - -### Scenario C: Dual GPU (Training + Inference) - -**GPU Server 1 (Inference):** -- RTX 4090 24/7: $360/month - -**GPU Server 2 (Training - On-demand):** -- A100 40GB: $1.50/hour × 40 hours/month = $60/month -- Used only for fine-tuning sessions - -**Storage:** -- 1TB network volume: $100/month - -**Total: ~$520/month** - ---- - -### Cost Optimization Tips - -1. **Auto-stop idle pods**: RunPod can auto-stop after X minutes idle -2. **Use spot instances**: ~50% cheaper but can be interrupted -3. **Quantized models**: 4-bit models use 4x less VRAM → cheaper GPUs -4. **Batch processing**: Queue image gen jobs to maximize GPU usage -5. **Model sharing**: One vLLM instance can serve multiple models via adapters -6. **Monitoring**: Track per-model costs to optimize routing - ---- - -## Phase 7: Monitoring & Operations - -### Metrics to Track - -**GPU Utilization:** -- nvidia-smi metrics (utilization %, memory usage) -- Temperature and power draw -- Per-process GPU usage - -**Model Performance:** -- Tokens per second (LLM inference) -- Images per second (SD/FLUX) -- Training time per epoch - -**Costs:** -- GPU hours consumed -- Storage usage -- API vs self-hosted breakdown - -### Monitoring Stack - -**Option A: Netdata (Already deployed)** - -Add GPU monitoring to existing Netdata: - -```yaml -# On GPU server -services: - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - command: | - bash -c " - # Enable nvidia_smi plugin - /usr/libexec/netdata/plugins.d/charts.d.plugin nvidia_smi - " -``` - -**Option B: Prometheus + Grafana** - -For detailed metrics: - -```yaml -services: - prometheus: - image: prom/prometheus:latest - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus_data:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - dcgm-exporter: - image: nvidia/dcgm-exporter:latest - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - - grafana: - image: grafana/grafana:latest - ports: - - "3000:3000" - volumes: - - grafana_data:/var/lib/grafana -``` - -Import Grafana dashboard #12219 for GPU metrics. - ---- - -## Phase 8: Backup & Disaster Recovery - -### What to Backup - -1. **Models** (250-300GB) - - Base models can be re-downloaded - - Custom fine-tuned models: CRITICAL - - LoRAs: CRITICAL - -2. **Training Data** (~10-50GB) - - Datasets - - Preprocessing scripts - -3. 
**Configurations** (<1GB) - - Docker compose files - - Training configs - - Workflow JSONs - -### Backup Strategy - -**Tier 1: Critical (Daily)** -- Fine-tuned models -- Training checkpoints -- Custom datasets - -**Backup to:** -- Restic → HiDrive (already configured) -- Backblaze B2 (~$6/TB/month) - -```bash -# Add to core/compose.yaml backrest config -- gpu_models:/volumes/gpu_models:ro -- gpu_checkpoints:/volumes/gpu_checkpoints:ro -``` - -**Tier 2: Nice-to-have (Weekly)** -- Base models (can re-download) -- ComfyUI outputs - -**Tier 3: Ephemeral (No backup)** -- Inference cache -- Temporary generations - ---- - -## Phase 9: Security Considerations - -### GPU Server Security - -1. **Firewall:** - - Only allow WireGuard port (51820) - - All services accessed via VPN - - No public exposure - -2. **SSH:** - - Key-based auth only - - Disable password auth - - Change default port - -3. **Docker:** - - Rootless Docker (optional but recommended) - - Limited container capabilities - - No privileged containers except for nvidia-runtime - -4. **Secrets:** - - Store API keys in .env - - Use Docker secrets for sensitive data - - Rotate keys periodically - -### Access Control - -- **ComfyUI**: Protected by Authelia SSO (already configured) -- **vLLM**: Internal only, accessed via LiteLLM proxy -- **JupyterLab**: Password-protected or Authelia -- **Training**: No public access, VPN only - ---- - -## Phase 10: Advanced Features (Future) - -### Multi-GPU Scaling - -**Tensor Parallelism** (vLLM): -- Split large models across multiple GPUs -- Example: 70B model on 2x A100s - -```yaml -command: - - --model - - meta-llama/Meta-Llama-3.1-70B-Instruct - - --tensor-parallel-size - - '2' # Use 2 GPUs -``` - -**Pipeline Parallelism** (training): -- Split model layers across GPUs -- Useful for very large models - -### Model Serving Optimization - -**vLLM Features:** -- Speculative decoding (faster generation) -- Prefix caching (faster for repeated prompts) -- Multi-LoRA serving (multiple adapters, one base model) - -**Example multi-LoRA:** -```yaml -command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct - - --enable-lora - - --max-loras - - '4' - - --lora-modules - - customer-support=/models/loras/support-lora - - creative-writing=/models/loras/writing-lora -``` - -### Video Generation - -**AnimateDiff in ComfyUI:** -- Generate short videos from text prompts -- Animate static images -- ~8GB VRAM for 512x512 16-frame videos - -**CogVideo:** -- High-quality video generation -- Requires A100 or H100 -- 5-second clips at 720p - -### Voice Synthesis - -**XTTS v2:** -- High-quality voice cloning -- Multi-language support -- ~6GB VRAM - -**Bark:** -- Text-to-speech with emotions -- Sound effects -- ~10GB VRAM - ---- - -## Appendix A: Quick Start Commands - -### Initial GPU Server Setup - -```bash -# SSH into RunPod instance -ssh root@gpu.runpod.io -p 12345 - -# Install Docker -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# Install NVIDIA Container Toolkit -distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg -curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list -apt-get update -apt-get install -y nvidia-container-toolkit -systemctl restart docker - -# Test GPU access -docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -``` - -### Deploy vLLM (Quick Test) - -```bash -# Create directory -mkdir -p /workspace/vllm -cd /workspace/vllm - -# Run vLLM -docker run -d \ - --name vllm \ - --runtime=nvidia \ - --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - vllm/vllm-openai:latest \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ - --dtype auto \ - --max-model-len 8192 - -# Test inference -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": "Once upon a time", - "max_tokens": 50 - }' -``` - -### Deploy ComfyUI (Quick Test) - -```bash -docker run -d \ - --name comfyui \ - --runtime=nvidia \ - --gpus all \ - -v /workspace/comfyui:/data \ - -p 8188:8188 \ - ghcr.io/ai-dock/comfyui:latest - -# Access at http://gpu-ip:8188 -``` - ---- - -## Appendix B: Sample Docker Compose (Full GPU Stack) - -```yaml -# gpu-server/compose.yaml -version: '3.8' - -services: - # vLLM for LLM inference - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: 0 - volumes: - - vllm_models:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct - - --host - - 0.0.0.0 - - --port - - 8000 - - --gpu-memory-utilization - - '0.9' - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # ComfyUI for image generation - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - comfyui_data:/data - - comfyui_models:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - ports: - - "8188:8188" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # Axolotl for model training - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - # Only start when training - profiles: - - training - - # JupyterLab for research - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - restart: unless-stopped - runtime: nvidia - volumes: - - ./notebooks:/workspace - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - command: | - bash -c " - pip install jupyterlab transformers datasets accelerate bitsandbytes peft && - 
jupyter lab --ip=0.0.0.0 --allow-root --no-browser - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # Netdata monitoring - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - ports: - - "19999:19999" - cap_add: - - SYS_PTRACE - - SYS_ADMIN - security_opt: - - apparmor:unconfined - -volumes: - vllm_models: - comfyui_data: - comfyui_models: - comfyui_output: - training_cache: - jupyter_cache: -``` - ---- - -## Appendix C: Cost Calculator - -**Monthly GPU Costs:** - -| GPU Model | VRAM | $/hour | 24/7 Month | 8hr/day | Use Case | -|-----------|------|--------|------------|---------|----------| -| RTX 3090 | 24GB | $0.35 | $252 | $84 | Development, small models | -| RTX 4090 | 24GB | $0.50 | $360 | $120 | Production inference, SD | -| A6000 | 48GB | $0.80 | $576 | $192 | Large models, training | -| A100 40GB | 40GB | $1.50 | $1,080 | $360 | Enterprise, training | -| A100 80GB | 80GB | $2.50 | $1,800 | $600 | Massive models, research | - -**Storage Costs:** -- Network volume: $0.10/GB/month -- 500GB = $50/month -- 1TB = $100/month - -**Total Estimated Monthly:** -- RTX 4090 + 500GB storage = $410/month (24/7) -- RTX 4090 + 500GB storage = $170/month (8hr/day) - -**Break-even Analysis:** -- If spending >$500/month on API calls → GPU server saves money -- If spending <$200/month → stick with APIs - ---- - -## Appendix D: Model Recommendations by Use Case - -### General Chat (24/7 Inference) -**Best:** Qwen 2.5 14B Instruct -- Excellent multilingual support -- Fast inference -- Good reasoning - -**Alternative:** Mistral 7B Instruct v0.3 -- Fastest inference -- Lower VRAM - -### Code Generation -**Best:** Qwen 2.5 Coder 14B -- SOTA coding performance -- Multi-language support - -**Alternative:** DeepSeek Coder 6.7B -- Faster, lighter - -### Creative Writing -**Best:** Nous Hermes 2 Mixtral 8x7B (quantized) -- Creative, engaging -- Follows instructions well - -### Image Generation (Realistic) -**Best:** FLUX.1-dev -- Highest quality -- Best prompt following - -**Alternative:** SDXL + RealVisXL LoRA -- Faster generation -- Good quality - -### Image Generation (Anime) -**Best:** SDXL + AnimagineXL LoRA -- Anime-specific training -- Vibrant colors - -### Video Generation -**Best:** AnimateDiff + SDXL -- 16-frame clips -- Good quality - -**Needs:** A100 40GB or better - ---- - -## Next Steps - -1. **Review this plan** and provide feedback -2. **Set budget** for GPU infrastructure -3. **Choose provider** (recommend RunPod) -4. **Define priority services** (LLM hosting first? Image gen first?) -5. **Schedule implementation** (4-week timeline above) - -Would you like me to: -- Create the detailed Docker Compose configurations? -- Set up a cost estimation spreadsheet? -- Research specific models for your use cases? -- Begin implementation with Phase 1? - -Let me know how you'd like to proceed! 🚀 diff --git a/ai/README_GPU_SETUP.md b/ai/README_GPU_SETUP.md deleted file mode 100644 index 34974f0..0000000 --- a/ai/README_GPU_SETUP.md +++ /dev/null @@ -1,444 +0,0 @@ -# GPU-Enhanced AI Stack - Implementation Guide - -Welcome to your GPU expansion setup! This directory contains everything you need to deploy a production-ready GPU server for LLM hosting, image generation, and model training. 
- -## 📚 Documentation Files - -### Planning & Architecture -- **`GPU_EXPANSION_PLAN.md`** - Complete 70-page plan with provider comparison, architecture, and roadmap -- **`README_GPU_SETUP.md`** - This file - -### Step-by-Step Setup Guides -1. **`SETUP_GUIDE.md`** - Day 1-2: RunPod account & GPU server deployment -2. **`WIREGUARD_SETUP.md`** - Day 3-4: VPN connection between VPS and GPU server -3. **`DOCKER_GPU_SETUP.md`** - Day 5: Docker + NVIDIA Container Toolkit configuration - -### Configuration Files -- **`gpu-server-compose.yaml`** - Production Docker Compose for GPU server -- **`litellm-config-gpu.yaml`** - Updated LiteLLM config with self-hosted models -- **`deploy-gpu-stack.sh`** - Automated deployment script - ---- - -## 🚀 Quick Start (Week 1 Checklist) - -### Day 1-2: RunPod & GPU Server ✓ -- [ ] Create RunPod account at https://www.runpod.io/ -- [ ] Add billing method ($50 initial credit recommended) -- [ ] Deploy RTX 4090 pod with PyTorch template -- [ ] Configure 500GB network volume -- [ ] Verify SSH access -- [ ] Test GPU with `nvidia-smi` -- [ ] **Guide:** `SETUP_GUIDE.md` - -### Day 3-4: Network Configuration ✓ -- [ ] Install Tailscale on VPS -- [ ] Install Tailscale on GPU server -- [ ] Authenticate both devices -- [ ] Test VPN connectivity -- [ ] Configure firewall rules -- [ ] Verify VPS can reach GPU server -- [ ] **Guide:** `TAILSCALE_SETUP.md` - -### Day 5: Docker & GPU Setup ✓ -- [ ] Install Docker on GPU server -- [ ] Install NVIDIA Container Toolkit -- [ ] Test GPU access in containers -- [ ] Create /workspace/gpu-stack directory -- [ ] Copy configuration files -- [ ] **Guide:** `DOCKER_GPU_SETUP.md` - -### Day 6-7: Deploy Services ✓ -- [ ] Copy `gpu-server-compose.yaml` to GPU server -- [ ] Edit `.env` with your settings -- [ ] Run `./deploy-gpu-stack.sh` -- [ ] Wait for vLLM to load model (~5 minutes) -- [ ] Test vLLM: `curl http://localhost:8000/v1/models` -- [ ] Access ComfyUI: `http://[tailscale-ip]:8188` -- [ ] **Script:** `deploy-gpu-stack.sh` - ---- - -## 📦 Services Included - -### vLLM (http://[tailscale-ip]:8000) -**Purpose:** High-performance LLM inference -**Default Model:** Llama 3.1 8B Instruct -**Performance:** 50-80 tokens/second on RTX 4090 -**Use for:** General chat, Q&A, code generation, summarization - -**Switch models:** -Edit `gpu-server-compose.yaml`, change `--model` parameter, restart: -```bash -docker compose restart vllm -``` - -### ComfyUI (http://[tailscale-ip]:8188) -**Purpose:** Advanced Stable Diffusion interface -**Features:** FLUX, SDXL, ControlNet, LoRA -**Use for:** Image generation, img2img, inpainting - -**Download models:** -Access web UI → ComfyUI Manager → Install Models - -### JupyterLab (http://[tailscale-ip]:8888) -**Purpose:** Interactive development environment -**Token:** `pivoine-ai-2025` (change in `.env`) -**Use for:** Research, experimentation, custom training scripts - -### Axolotl (Training - on-demand) -**Purpose:** LLM fine-tuning framework -**Start:** `docker compose --profile training up -d axolotl` -**Use for:** LoRA training, full fine-tuning, RLHF - -### Netdata (http://[tailscale-ip]:19999) -**Purpose:** System & GPU monitoring -**Features:** Real-time metrics, GPU utilization, memory usage -**Use for:** Performance monitoring, troubleshooting - ---- - -## 🔧 Configuration - -### Environment Variables (.env) - -```bash -# VPN Network (Tailscale) -VPS_IP=100.x.x.x # Your VPS Tailscale IP (get with: tailscale ip -4) -GPU_IP=100.x.x.x # GPU server Tailscale IP (get with: tailscale ip -4) - -# Model Storage 
-MODELS_PATH=/workspace/models - -# Hugging Face Token (for gated models like Llama) -HF_TOKEN=hf_xxxxxxxxxxxxx - -# Weights & Biases (for training logging) -WANDB_API_KEY= - -# JupyterLab Access -JUPYTER_TOKEN=pivoine-ai-2025 - -# PostgreSQL (on VPS) -DB_HOST=100.x.x.x # Your VPS Tailscale IP -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui -``` - -### Updating LiteLLM on VPS - -After GPU server is running, update your VPS LiteLLM config: - -```bash -# On VPS -cd ~/Projects/docker-compose/ai - -# Backup current config -cp litellm-config.yaml litellm-config.yaml.backup - -# Copy new config with GPU models -cp litellm-config-gpu.yaml litellm-config.yaml - -# Restart LiteLLM -arty restart litellm -``` - -Now Open WebUI will have access to both Claude (API) and Llama (self-hosted)! - ---- - -## 💰 Cost Management - -### Current Costs (24/7 Operation) -- **GPU Server:** RTX 4090 @ $0.50/hour = $360/month -- **Storage:** 500GB network volume = $50/month -- **Total:** **$410/month** - -### Cost-Saving Options - -**1. Pay-as-you-go (8 hours/day)** -- GPU: $0.50 × 8 × 30 = $120/month -- Storage: $50/month -- **Total: $170/month** - -**2. Auto-stop idle pods** -RunPod can auto-stop after X minutes idle: -- Dashboard → Pod Settings → Auto-stop after 30 minutes - -**3. Use smaller models** -- Mistral 7B instead of Llama 8B: Faster, cheaper GPU -- Quantized models: 4-bit = 1/4 the VRAM - -**4. Batch image generation** -- Generate multiple images at once -- Use scheduled jobs (cron) during off-peak hours - -### Cost Tracking - -**Check GPU usage:** -```bash -# On RunPod dashboard -Billing → Usage History - -# See hourly costs, total spent -``` - -**Check API vs GPU savings:** -```bash -# On VPS, check LiteLLM logs -docker logs ai_litellm | grep "model=" - -# Count requests to llama-3.1-8b vs claude-* -``` - -**Expected savings:** -- 80% of requests → self-hosted = $0 cost -- 20% of requests → Claude = API cost -- Break-even if currently spending >$500/month on APIs - ---- - -## 🔍 Monitoring & Troubleshooting - -### Check Service Status - -```bash -# On GPU server -cd /workspace/gpu-stack - -# View all services -docker compose ps - -# Check specific service logs -docker compose logs -f vllm -docker compose logs -f comfyui -docker compose logs -f jupyter - -# Check GPU usage -nvidia-smi -# or prettier: -nvtop -``` - -### Common Issues - -**vLLM not loading model:** -```bash -# Check logs -docker compose logs vllm - -# Common causes: -# - Model download in progress (wait 5-10 minutes) -# - Out of VRAM (try smaller model) -# - Missing HF_TOKEN (for gated models like Llama) -``` - -**ComfyUI slow/crashing:** -```bash -# Check GPU memory -nvidia-smi - -# If VRAM full: -# - Close vLLM temporarily -# - Use smaller models -# - Reduce batch size in ComfyUI -``` - -**Can't access from VPS:** -```bash -# Test VPN -ping [tailscale-ip] - -# If fails: -# - Check Tailscale status: tailscale status -# - Restart Tailscale: tailscale down && tailscale up -# - Check firewall: ufw status -``` - -**Docker can't see GPU:** -```bash -# Test GPU access -docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base nvidia-smi - -# If fails: -# - Check NVIDIA driver: nvidia-smi -# - Check nvidia-docker: nvidia-ctk --version -# - Restart Docker: systemctl restart docker -``` - ---- - -## 📊 Performance Benchmarks - -### Expected Performance (RTX 4090) - -**LLM Inference (vLLM):** -- Llama 3.1 8B: 50-80 tokens/second -- Qwen 2.5 14B: 30-50 tokens/second -- Batch size 32: ~1500 tokens/second - -**Image 
Generation (ComfyUI):** -- SDXL (1024×1024): ~4-6 seconds -- FLUX (1024×1024): ~8-12 seconds -- SD 1.5 (512×512): ~1-2 seconds - -**Training (Axolotl):** -- LoRA fine-tuning (8B model): ~3-5 hours for 3 epochs -- Full fine-tuning: Not recommended on 24GB VRAM - ---- - -## 🔐 Security Best Practices - -### Network Security -✅ All services behind Tailscale VPN (end-to-end encrypted) -✅ No public exposure (except RunPod's SSH) -✅ Firewall configured (no additional ports needed) - -### Access Control -✅ JupyterLab password-protected -✅ ComfyUI accessible via VPN only -✅ vLLM internal API (no auth needed) - -### SSH Security -```bash -# On GPU server, harden SSH -nano /etc/ssh/sshd_config - -# Set: -PermitRootLogin prohibit-password -PasswordAuthentication no -PubkeyAuthentication yes - -systemctl restart sshd -``` - -### Regular Updates -```bash -# Weekly updates -apt update && apt upgrade -y - -# Update Docker images -docker compose pull -docker compose up -d -``` - ---- - -## 📈 Scaling Up - -### When to Add More GPUs - -**Current limitations (1× RTX 4090):** -- Can run ONE of these at a time: - - 8B LLM at full speed - - 14B LLM at moderate speed - - SDXL image generation - - Training job - -**Add 2nd GPU if:** -- You want LLM + image gen simultaneously -- Training + inference at same time -- Multiple users with high demand - -**Multi-GPU options:** -- 2× RTX 4090: Run vLLM + ComfyUI separately ($720/month) -- 1× A100 40GB: Larger models (70B with quantization) ($1,080/month) -- Mix: RTX 4090 (inference) + A100 (training) (~$1,300/month) - -### Deploying Larger Models - -**70B models (need 2× A100 or 4× RTX 4090):** -```yaml -# In gpu-server-compose.yaml -vllm: - command: - - --model - - meta-llama/Meta-Llama-3.1-70B-Instruct - - --tensor-parallel-size - - "2" # Split across 2 GPUs - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 2 # Use 2 GPUs - capabilities: [gpu] -``` - ---- - -## 🎯 Next Steps (Week 2+) - -### Week 2: LLM Production Deployment -- [ ] Test Llama 3.1 8B performance -- [ ] Download additional models (Qwen, Mistral) -- [ ] Configure model routing in LiteLLM -- [ ] Set up usage monitoring -- [ ] Benchmark tokens/second for each model - -### Week 3: Image Generation -- [ ] Download FLUX and SDXL models -- [ ] Install ComfyUI Manager -- [ ] Download ControlNet models -- [ ] Create sample workflows -- [ ] Test API integration with Open WebUI - -### Week 4: Training Infrastructure -- [ ] Prepare a sample dataset -- [ ] Test LoRA fine-tuning with Axolotl -- [ ] Set up Weights & Biases logging -- [ ] Create training documentation -- [ ] Benchmark training speed - ---- - -## 🆘 Getting Help - -### Resources -- **RunPod Docs:** https://docs.runpod.io/ -- **vLLM Docs:** https://docs.vllm.ai/ -- **ComfyUI Wiki:** https://github.com/comfyanonymous/ComfyUI/wiki -- **Axolotl Docs:** https://github.com/OpenAccess-AI-Collective/axolotl - -### Community -- **RunPod Discord:** https://discord.gg/runpod -- **vLLM Discord:** https://discord.gg/vllm -- **r/LocalLLaMA:** https://reddit.com/r/LocalLLaMA - -### Support -If you encounter issues: -1. Check logs: `docker compose logs -f [service]` -2. Check GPU: `nvidia-smi` -3. Check VPN: `wg show` -4. Restart service: `docker compose restart [service]` -5. 
Full restart: `docker compose down && docker compose up -d` - ---- - -## ✅ Success Criteria - -You're ready to proceed when: -- [ ] GPU server responds to `ping [tailscale-ip]` from VPS -- [ ] vLLM returns models: `curl http://[tailscale-ip]:8000/v1/models` -- [ ] ComfyUI web interface loads: `http://[tailscale-ip]:8188` -- [ ] JupyterLab accessible with token -- [ ] Netdata shows GPU metrics -- [ ] Open WebUI shows both Claude and Llama models - -**Total setup time:** 4-6 hours (if following guides sequentially) - ---- - -## 🎉 You're All Set! - -Your GPU-enhanced AI stack is ready. You now have: -- ✅ Self-hosted LLM inference (saves $$$) -- ✅ Advanced image generation (FLUX, SDXL) -- ✅ Model training capabilities (LoRA, fine-tuning) -- ✅ Secure VPN connection -- ✅ Full monitoring and logging - -Enjoy building with your new AI infrastructure! 🚀 diff --git a/ai/SETUP_GUIDE.md b/ai/SETUP_GUIDE.md deleted file mode 100644 index 1d14145..0000000 --- a/ai/SETUP_GUIDE.md +++ /dev/null @@ -1,261 +0,0 @@ -# GPU Server Setup Guide - Week 1 - -## Day 1-2: RunPod Account & GPU Server - -### Step 1: Create RunPod Account - -1. **Go to RunPod**: https://www.runpod.io/ -2. **Sign up** with email or GitHub -3. **Add billing method**: - - Credit card required - - No charges until you deploy a pod - - Recommended: Add $50 initial credit - -4. **Verify email** and complete account setup - -### Step 2: Deploy Your First GPU Pod - -#### 2.1 Navigate to Pods - -1. Click **"Deploy"** in top menu -2. Select **"GPU Pods"** - -#### 2.2 Choose GPU Type - -**Recommended: RTX 4090** -- 24GB VRAM -- ~$0.50/hour -- Perfect for LLMs up to 14B params -- Great for SDXL/FLUX - -**Filter options:** -- GPU Type: RTX 4090 -- GPU Count: 1 -- Sort by: Price (lowest first) -- Region: Europe (lower latency to Germany) - -#### 2.3 Select Template - -Choose: **"RunPod PyTorch"** template -- Includes: CUDA, PyTorch, Python -- Pre-configured for GPU workloads -- Docker pre-installed - -**Alternative**: "Ubuntu 22.04 with CUDA 12.1" (more control) - -#### 2.4 Configure Pod - -**Container Settings:** -- **Container Disk**: 50GB (temporary, auto-included) -- **Expose Ports**: - - Add: 22 (SSH) - - Add: 8000 (vLLM) - - Add: 8188 (ComfyUI) - - Add: 8888 (JupyterLab) - -**Volume Settings:** -- Click **"+ Network Volume"** -- **Name**: `gpu-models-storage` -- **Size**: 500GB -- **Region**: Same as pod -- **Cost**: ~$50/month - -**Environment Variables:** -- Add later (not needed for initial setup) - -#### 2.5 Deploy Pod - -1. Review configuration -2. Click **"Deploy On-Demand"** (not Spot for reliability) -3. Wait 2-3 minutes for deployment - -**Expected cost:** -- GPU: $0.50/hour = $360/month (24/7) -- Storage: $50/month -- **Total: $410/month** - -### Step 3: Access Your GPU Server - -#### 3.1 Get Connection Info - -Once deployed, you'll see: -- **Pod ID**: e.g., `abc123def456` -- **SSH Command**: `ssh root@.runpod.io -p 12345` -- **Public IP**: May not be directly accessible (use SSH) - -#### 3.2 SSH Access - -RunPod automatically generates SSH keys for you: - -```bash -# Copy the SSH command from RunPod dashboard -ssh root@abc123def456.runpod.io -p 12345 - -# First time: Accept fingerprint -# You should now be in the GPU server! 
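# Optional sanity checks after connecting (paths assume the RunPod defaults used in this guide)
hostname           # confirm you are on the pod, not your local machine
df -h /workspace   # the 500GB network volume should be mounted here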
-``` - -**Verify GPU:** -```bash -nvidia-smi -``` - -Expected output: -``` -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 535.xx Driver Version: 535.xx CUDA Version: 12.1 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -|===============================+======================+======================| -| 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | -| 30% 45C P0 50W / 450W | 0MiB / 24564MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -``` - -### Step 4: Initial Server Configuration - -#### 4.1 Update System - -```bash -# Update package lists -apt update - -# Upgrade existing packages -apt upgrade -y - -# Install essential tools -apt install -y \ - vim \ - htop \ - tmux \ - curl \ - wget \ - git \ - net-tools \ - iptables-persistent -``` - -#### 4.2 Set Timezone - -```bash -timedatectl set-timezone Europe/Berlin -date # Verify -``` - -#### 4.3 Create Working Directory - -```bash -# Create workspace -mkdir -p /workspace/{models,configs,data,scripts} - -# Check network volume mount -ls -la /workspace -# Should show your 500GB volume -``` - -#### 4.4 Configure SSH (Optional but Recommended) - -**Generate your own SSH key on your local machine:** - -```bash -# On your local machine (not GPU server) -ssh-keygen -t ed25519 -C "gpu-server-pivoine" -f ~/.ssh/gpu_pivoine - -# Copy public key to GPU server -ssh-copy-id -i ~/.ssh/gpu_pivoine.pub root@abc123def456.runpod.io -p 12345 -``` - -**Add to your local ~/.ssh/config:** - -```bash -Host gpu-pivoine - HostName abc123def456.runpod.io - Port 12345 - User root - IdentityFile ~/.ssh/gpu_pivoine -``` - -Now you can connect with: `ssh gpu-pivoine` - -### Step 5: Verify GPU Access - -Run this test: - -```bash -# Test CUDA -python3 -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('GPU count:', torch.cuda.device_count())" -``` - -Expected output: -``` -CUDA available: True -GPU count: 1 -``` - -### Troubleshooting - -**Problem: Can't connect via SSH** -- Check pod is running (not stopped) -- Verify port number in SSH command -- Try web terminal in RunPod dashboard - -**Problem: GPU not detected** -- Run `nvidia-smi` -- Check RunPod selected correct GPU type -- Restart pod if needed - -**Problem: Network volume not mounted** -- Check RunPod dashboard → Volume tab -- Verify volume is attached to pod -- Try: `df -h` to see mounts - -### Next Steps - -Once SSH access works and GPU is verified: -✅ Proceed to **Day 3-4: Network Configuration (Tailscale VPN)** - -### Save Important Info - -Create a file to track your setup: - -```bash -# On GPU server -cat > /workspace/SERVER_INFO.md << 'EOF' -# GPU Server Information - -## Connection -- SSH: ssh root@abc123def456.runpod.io -p 12345 -- Pod ID: abc123def456 -- Region: [YOUR_REGION] - -## Hardware -- GPU: RTX 4090 24GB -- CPU: [Check with: lscpu] -- RAM: [Check with: free -h] -- Storage: 500GB network volume at /workspace - -## Costs -- GPU: $0.50/hour -- Storage: $50/month -- Total: ~$410/month (24/7) - -## Deployed: [DATE] -EOF -``` - ---- - -## Checkpoint ✓ - -Before moving to Day 3, verify: -- [ ] RunPod account created and billing added -- [ ] RTX 4090 pod deployed successfully -- [ ] 500GB network volume attached -- [ ] SSH access working -- [ ] `nvidia-smi` shows GPU -- [ ] `torch.cuda.is_available()` 
returns True -- [ ] Timezone set to Europe/Berlin -- [ ] Essential tools installed - -**Ready for Tailscale setup? Let's go!** diff --git a/ai/TAILSCALE_SETUP.md b/ai/TAILSCALE_SETUP.md deleted file mode 100644 index 9950469..0000000 --- a/ai/TAILSCALE_SETUP.md +++ /dev/null @@ -1,417 +0,0 @@ -# Tailscale VPN Setup - Better Alternative to WireGuard - -## Why Tailscale? - -RunPod doesn't support UDP ports, which blocks WireGuard. Tailscale solves this by: -- ✅ Works over HTTPS (TCP) - no UDP needed -- ✅ Zero configuration - automatic setup -- ✅ Free for personal use -- ✅ Built on WireGuard (same security) -- ✅ Automatic NAT traversal -- ✅ Peer-to-peer when possible (low latency) - ---- - -## Step 1: Create Tailscale Account - -1. Go to: https://tailscale.com/ -2. Click **"Get Started"** -3. Sign up with **GitHub** or **Google** (easiest) -4. You'll be redirected to the Tailscale admin console - -**No credit card required!** Free tier is perfect for our use case. - ---- - -## Step 2: Install Tailscale on VPS - -**SSH into your VPS:** - -```bash -ssh root@vps -``` - -**Install Tailscale:** - -```bash -# Download and run install script -curl -fsSL https://tailscale.com/install.sh | sh - -# Start Tailscale -tailscale up - -# You'll see a URL like: -# https://login.tailscale.com/a/xxxxxxxxxx -``` - -**Authenticate:** -1. Copy the URL and open in browser -2. Click **"Connect"** to authorize the device -3. Name it: `pivoine-vps` - -**Check status:** -```bash -tailscale status -``` - -You should see your VPS listed with an IP like `100.x.x.x` - -**Save your VPS Tailscale IP:** -```bash -tailscale ip -4 -# Example output: 100.101.102.103 -``` - -**Write this down - you'll need it!** - ---- - -## Step 3: Install Tailscale on GPU Server - -**SSH into your RunPod GPU server:** - -```bash -ssh root@abc123def456-12345678.runpod.io -p 12345 -``` - -**Install Tailscale:** - -```bash -# Download and run install script -curl -fsSL https://tailscale.com/install.sh | sh - -# Start Tailscale -tailscale up --advertise-tags=tag:gpu - -# You'll see another URL -``` - -**Authenticate:** -1. Copy the URL and open in browser -2. Click **"Connect"** -3. Name it: `gpu-runpod` - -**Check status:** -```bash -tailscale status -``` - -You should now see BOTH devices: -- `pivoine-vps` - 100.x.x.x -- `gpu-runpod` - 100.x.x.x - -**Save your GPU server Tailscale IP:** -```bash -tailscale ip -4 -# Example output: 100.104.105.106 -``` - ---- - -## Step 4: Test Connectivity - -**From VPS, ping GPU server:** - -```bash -# SSH into VPS -ssh root@vps - -# Ping GPU server (use its Tailscale IP) -ping 100.104.105.106 -c 4 -``` - -Expected output: -``` -PING 100.104.105.106 (100.104.105.106) 56(84) bytes of data. -64 bytes from 100.104.105.106: icmp_seq=1 ttl=64 time=15.3 ms -64 bytes from 100.104.105.106: icmp_seq=2 ttl=64 time=14.8 ms -... -``` - -**From GPU server, ping VPS:** - -```bash -# SSH into GPU server -ssh root@abc123def456-12345678.runpod.io -p 12345 - -# Ping VPS (use its Tailscale IP) -ping 100.101.102.103 -c 4 -``` - -**Both should work!** ✅ - ---- - -## Step 5: Update Configuration Files - -Now update the IP addresses in your configs to use Tailscale IPs. 
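
The subsections below show the manual edits. If you prefer to script the substitution, here is a minimal sketch, assuming the stack lives in `/workspace/gpu-stack` on the GPU server, Tailscale is already up on both machines, and you can SSH to the VPS (the `root@vps` alias is an assumption):

```bash
# Run on the GPU server
GPU_IP=$(tailscale ip -4)                  # this machine's Tailscale IP
VPS_IP=$(ssh root@vps tailscale ip -4)     # VPS Tailscale IP (SSH alias is an assumption)

cd /workspace/gpu-stack
cp .env .env.backup                        # keep a backup before editing

sed -i "s/^VPS_IP=.*/VPS_IP=${VPS_IP}/"   .env
sed -i "s/^GPU_IP=.*/GPU_IP=${GPU_IP}/"   .env
sed -i "s/^DB_HOST=.*/DB_HOST=${VPS_IP}/" .env

grep -E '^(VPS_IP|GPU_IP|DB_HOST)=' .env   # verify the result
```

Either way, double-check the values afterwards; the manual steps below cover the same changes plus the LiteLLM config on the VPS.
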
- -### On GPU Server (.env file) - -**Edit your .env file:** - -```bash -# On GPU server -cd /workspace/gpu-stack - -nano .env -``` - -**Update these lines:** -```bash -# VPN Network (use your actual Tailscale IPs) -VPS_IP=100.101.102.103 # Your VPS Tailscale IP -GPU_IP=100.104.105.106 # Your GPU Tailscale IP - -# PostgreSQL (on VPS) -DB_HOST=100.101.102.103 # Your VPS Tailscale IP -DB_PORT=5432 -``` - -Save and exit (Ctrl+X, Y, Enter) - -### On VPS (LiteLLM config) - -**Edit your LiteLLM config:** - -```bash -# On VPS -ssh root@vps -cd ~/Projects/docker-compose/ai - -nano litellm-config-gpu.yaml -``` - -**Update the GPU server IP:** - -```yaml -# Find this section and update IP: - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://100.104.105.106:8000/v1 # Use GPU Tailscale IP - api_key: dummy -``` - -Save and exit. - ---- - -## Step 6: Verify PostgreSQL Access - -**From GPU server, test database connection:** - -```bash -# Install PostgreSQL client -apt install -y postgresql-client - -# Test connection (use your VPS Tailscale IP) -psql -h 100.101.102.103 -U valknar -d openwebui -c "SELECT 1;" -``` - -**If this fails, allow Tailscale network on VPS PostgreSQL:** - -```bash -# On VPS -ssh root@vps - -# Check if postgres allows Tailscale network -docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100 - -# If not present, add it: -docker exec -it core_postgres bash - -# Inside container: -echo "host all all 100.0.0.0/8 scram-sha-256" >> /var/lib/postgresql/data/pg_hba.conf - -# Restart postgres -exit -docker restart core_postgres -``` - -Try connecting again - should work now! - ---- - -## Tailscale Management - -### View Connected Devices - -**Web dashboard:** -https://login.tailscale.com/admin/machines - -You'll see all your devices with their Tailscale IPs. - -**Command line:** -```bash -tailscale status -``` - -### Disconnect/Reconnect - -```bash -# Stop Tailscale -tailscale down - -# Start Tailscale -tailscale up -``` - -### Remove Device - -From web dashboard: -1. Click on device -2. Click "..." menu -3. Select "Disable" or "Delete" - ---- - -## Advantages Over WireGuard - -✅ **Works anywhere** - No UDP ports needed -✅ **Auto-reconnect** - Survives network changes -✅ **Multiple devices** - Easy to add laptop, phone, etc. 
-✅ **NAT traversal** - Direct peer-to-peer when possible -✅ **Access Control** - Manage from web dashboard -✅ **Monitoring** - See connection status in real-time - ---- - -## Security Notes - -🔒 **Tailscale is secure:** -- End-to-end encrypted (WireGuard) -- Zero-trust architecture -- No Tailscale servers can see your traffic -- Only authenticated devices can connect - -🔒 **Access control:** -- Only devices you authorize can join -- Revoke access anytime from dashboard -- Set ACLs for fine-grained control - ---- - -## Network Reference (Updated) - -**Old (WireGuard):** -- VPS: `10.8.0.1` -- GPU: `10.8.0.2` - -**New (Tailscale):** -- VPS: `100.101.102.103` (example - use your actual IP) -- GPU: `100.104.105.106` (example - use your actual IP) - -**All services now accessible via Tailscale:** - -**From VPS to GPU:** -- vLLM: `http://100.104.105.106:8000` -- ComfyUI: `http://100.104.105.106:8188` -- JupyterLab: `http://100.104.105.106:8888` -- Netdata: `http://100.104.105.106:19999` - -**From GPU to VPS:** -- PostgreSQL: `100.101.102.103:5432` -- Redis: `100.101.102.103:6379` -- LiteLLM: `http://100.101.102.103:4000` - ---- - -## Troubleshooting - -### Can't ping between devices - -**Check Tailscale status:** -```bash -tailscale status -``` - -Both devices should show "active" or "online". - -**Check connectivity:** -```bash -tailscale ping 100.104.105.106 -``` - -**Restart Tailscale:** -```bash -tailscale down && tailscale up -``` - -### PostgreSQL connection refused - -**Check if postgres is listening on all interfaces:** -```bash -# On VPS -docker exec core_postgres cat /var/lib/postgresql/data/postgresql.conf | grep listen_addresses -``` - -Should show: `listen_addresses = '*'` - -**Check pg_hba.conf allows Tailscale network:** -```bash -docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100 -``` - -Should have line: -``` -host all all 100.0.0.0/8 scram-sha-256 -``` - -### Device not showing in network - -**Re-authenticate:** -```bash -tailscale logout -tailscale up -# Click the new URL to re-authenticate -``` - ---- - -## Verification Checklist - -Before proceeding: -- [ ] Tailscale account created -- [ ] Tailscale installed on VPS -- [ ] Tailscale installed on GPU server -- [ ] Both devices visible in `tailscale status` -- [ ] VPS can ping GPU server (via Tailscale IP) -- [ ] GPU server can ping VPS (via Tailscale IP) -- [ ] PostgreSQL accessible from GPU server -- [ ] .env file updated with Tailscale IPs -- [ ] LiteLLM config updated with GPU Tailscale IP - ---- - -## Next Steps - -✅ **Network configured!** Proceed to Docker & GPU setup: - -```bash -cat /home/valknar/Projects/docker-compose/ai/DOCKER_GPU_SETUP.md -``` - -**Your Tailscale IPs (save these!):** -- VPS: `__________________` (from `tailscale ip -4` on VPS) -- GPU: `__________________` (from `tailscale ip -4` on GPU server) - ---- - -## Bonus: Add Your Local Machine - -Want to access GPU server from your laptop? - -```bash -# On your local machine -curl -fsSL https://tailscale.com/install.sh | sh -tailscale up - -# Now you can SSH directly via Tailscale: -ssh root@100.104.105.106 - -# Or access ComfyUI in browser: -# http://100.104.105.106:8188 -``` - -No more port forwarding needed! 
🎉 diff --git a/ai/WIREGUARD_SETUP.md b/ai/WIREGUARD_SETUP.md deleted file mode 100644 index 0f274fa..0000000 --- a/ai/WIREGUARD_SETUP.md +++ /dev/null @@ -1,393 +0,0 @@ -# WireGuard VPN Setup - Connecting GPU Server to VPS - -## Day 3-4: Network Configuration - -This guide connects your RunPod GPU server to your VPS via WireGuard VPN, enabling secure, low-latency communication. - -### Architecture - -``` -┌─────────────────────────────┐ ┌──────────────────────────────┐ -│ VPS (pivoine.art) │ │ GPU Server (RunPod) │ -│ 10.8.0.1 (WireGuard) │◄───────►│ 10.8.0.2 (WireGuard) │ -├─────────────────────────────┤ ├──────────────────────────────┤ -│ - LiteLLM Proxy │ │ - vLLM (10.8.0.2:8000) │ -│ - Open WebUI │ │ - ComfyUI (10.8.0.2:8188) │ -│ - PostgreSQL │ │ - Training │ -└─────────────────────────────┘ └──────────────────────────────┘ -``` - -### Prerequisites - -- ✅ VPS with root access -- ✅ GPU server with root access -- ✅ Both servers have public IPs - ---- - -## Method 1: Using Existing wg-easy (Recommended) - -You already have `wg-easy` running on your VPS. Let's use it! - -### Step 1: Access wg-easy Dashboard - -**On your local machine:** - -1. Open browser: https://vpn.pivoine.art (or whatever your wg-easy URL is) -2. Login with admin password - -**Don't have wg-easy set up? Skip to Method 2.** - -### Step 2: Create GPU Server Client - -1. In wg-easy dashboard, click **"+ New Client"** -2. **Name**: `gpu-server-runpod` -3. Click **"Create"** -4. **Download** configuration file (or copy QR code data) - -You'll get a file like: `gpu-server-runpod.conf` - -### Step 3: Install WireGuard on GPU Server - -**SSH into GPU server:** - -```bash -ssh gpu-pivoine # or your SSH command - -# Install WireGuard -apt update -apt install -y wireguard wireguard-tools -``` - -### Step 4: Configure WireGuard on GPU Server - -**Upload the config file:** - -```bash -# On your local machine, copy the config to GPU server -scp gpu-server-runpod.conf gpu-pivoine:/etc/wireguard/wg0.conf - -# Or manually create it on GPU server: -nano /etc/wireguard/wg0.conf -# Paste the configuration from wg-easy -``` - -**Example config (yours will be different):** -```ini -[Interface] -PrivateKey = -Address = 10.8.0.2/24 -DNS = 10.8.0.1 - -[Peer] -PublicKey = -PresharedKey = -AllowedIPs = 10.8.0.0/24 -Endpoint = :51820 -PersistentKeepalive = 25 -``` - -### Step 5: Start WireGuard - -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Set permissions -chmod 600 /etc/wireguard/wg0.conf - -# Start WireGuard -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 - -# Check status -systemctl status wg-quick@wg0 -wg show -``` - -Expected output: -``` -interface: wg0 - public key: - private key: (hidden) - listening port: 51820 - -peer: - endpoint: :51820 - allowed ips: 10.8.0.0/24 - latest handshake: 1 second ago - transfer: 1.2 KiB received, 892 B sent - persistent keepalive: every 25 seconds -``` - -### Step 6: Test Connectivity - -**From GPU server, ping VPS:** - -```bash -ping 10.8.0.1 -c 4 -``` - -Expected output: -``` -PING 10.8.0.1 (10.8.0.1) 56(84) bytes of data. -64 bytes from 10.8.0.1: icmp_seq=1 ttl=64 time=25.3 ms -64 bytes from 10.8.0.1: icmp_seq=2 ttl=64 time=24.8 ms -... 
-``` - -**From VPS, ping GPU server:** - -```bash -ssh root@vps -ping 10.8.0.2 -c 4 -``` - -**Test PostgreSQL access from GPU server:** - -```bash -# On GPU server -apt install -y postgresql-client - -# Try connecting to VPS postgres -psql -h 10.8.0.1 -U valknar -d openwebui -c "SELECT 1;" -# Should work if postgres allows 10.8.0.0/24 -``` - ---- - -## Method 2: Manual WireGuard Setup (If no wg-easy) - -### Step 1: Install WireGuard on Both Servers - -**On VPS:** -```bash -ssh root@vps -apt update -apt install -y wireguard wireguard-tools -``` - -**On GPU Server:** -```bash -ssh gpu-pivoine -apt update -apt install -y wireguard wireguard-tools -``` - -### Step 2: Generate Keys - -**On VPS:** -```bash -cd /etc/wireguard -umask 077 -wg genkey | tee vps-private.key | wg pubkey > vps-public.key -``` - -**On GPU Server:** -```bash -cd /etc/wireguard -umask 077 -wg genkey | tee gpu-private.key | wg pubkey > gpu-public.key -``` - -### Step 3: Create Config on VPS - -**On VPS (`/etc/wireguard/wg0.conf`):** - -```bash -cat > /etc/wireguard/wg0.conf << 'EOF' -[Interface] -PrivateKey = -Address = 10.8.0.1/24 -ListenPort = 51820 -SaveConfig = false - -# GPU Server Peer -[Peer] -PublicKey = -AllowedIPs = 10.8.0.2/32 -PersistentKeepalive = 25 -EOF -``` - -Replace `` with contents of `vps-private.key` -Replace `` with contents from GPU server's `gpu-public.key` - -### Step 4: Create Config on GPU Server - -**On GPU Server (`/etc/wireguard/wg0.conf`):** - -```bash -cat > /etc/wireguard/wg0.conf << 'EOF' -[Interface] -PrivateKey = -Address = 10.8.0.2/24 - -[Peer] -PublicKey = -AllowedIPs = 10.8.0.0/24 -Endpoint = :51820 -PersistentKeepalive = 25 -EOF -``` - -Replace: -- `` with contents of `gpu-private.key` -- `` with contents from VPS's `vps-public.key` -- `` with your VPS's public IP address - -### Step 5: Start WireGuard on Both - -**On VPS:** -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Start WireGuard -chmod 600 /etc/wireguard/wg0.conf -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 -``` - -**On GPU Server:** -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Start WireGuard -chmod 600 /etc/wireguard/wg0.conf -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 -``` - -### Step 6: Configure Firewall - -**On VPS:** -```bash -# Allow WireGuard port -ufw allow 51820/udp -ufw reload - -# Or with iptables -iptables -A INPUT -p udp --dport 51820 -j ACCEPT -iptables-save > /etc/iptables/rules.v4 -``` - -**On GPU Server (RunPod):** -```bash -# Allow WireGuard -ufw allow 51820/udp -ufw reload -``` - -### Step 7: Test Connection - -Same as Method 1 Step 6. - ---- - -## Troubleshooting - -### No handshake - -**Check:** -```bash -wg show -``` - -If "latest handshake" shows "never": -1. Verify public keys are correct (easy to swap them!) -2. Check firewall allows UDP 51820 -3. Verify endpoint IP is correct -4. 
Check `systemctl status wg-quick@wg0` for errors - -### Can ping but can't access services - -**On VPS, check PostgreSQL allows 10.8.0.0/24:** - -```bash -# Edit postgresql.conf -nano /var/lib/postgresql/data/postgresql.conf -# Add or modify: -listen_addresses = '*' - -# Edit pg_hba.conf -nano /var/lib/postgresql/data/pg_hba.conf -# Add: -host all all 10.8.0.0/24 scram-sha-256 - -# Restart -docker restart core_postgres -``` - -### WireGuard won't start - -```bash -# Check logs -journalctl -u wg-quick@wg0 -n 50 - -# Common issues: -# - Wrong permissions: chmod 600 /etc/wireguard/wg0.conf -# - Invalid keys: regenerate with wg genkey -# - Port already in use: lsof -i :51820 -``` - ---- - -## Verification Checklist - -Before proceeding to Day 5: - -- [ ] WireGuard installed on both VPS and GPU server -- [ ] VPN tunnel established (wg show shows handshake) -- [ ] GPU server can ping VPS (10.8.0.1) -- [ ] VPS can ping GPU server (10.8.0.2) -- [ ] Firewall allows WireGuard (UDP 51820) -- [ ] PostgreSQL accessible from GPU server -- [ ] WireGuard starts on boot (systemctl enable) - ---- - -## Network Reference - -**VPN IPs:** -- VPS: `10.8.0.1` -- GPU Server: `10.8.0.2` - -**Service Access from GPU Server:** -- PostgreSQL: `postgresql://valknar:password@10.8.0.1:5432/dbname` -- Redis: `10.8.0.1:6379` -- LiteLLM: `http://10.8.0.1:4000` -- Mailpit: `10.8.0.1:1025` - -**Service Access from VPS:** -- vLLM: `http://10.8.0.2:8000` -- ComfyUI: `http://10.8.0.2:8188` -- JupyterLab: `http://10.8.0.2:8888` - ---- - -## Next: Docker & GPU Setup - -Once VPN is working, proceed to **Day 5: Docker & NVIDIA Container Toolkit Setup**. - -**Save connection info:** - -```bash -# On GPU server -cat >> /workspace/SERVER_INFO.md << 'EOF' - -## VPN Configuration -- VPN IP: 10.8.0.2 -- VPS VPN IP: 10.8.0.1 -- WireGuard Status: Active -- Latest Handshake: [Check with: wg show] - -## Network Access -- Can reach VPS services: ✓ -- VPS can reach GPU services: ✓ -EOF -``` diff --git a/ai/deploy-gpu-stack.sh b/ai/deploy-gpu-stack.sh deleted file mode 100755 index f770946..0000000 --- a/ai/deploy-gpu-stack.sh +++ /dev/null @@ -1,229 +0,0 @@ -#!/bin/bash -# GPU Stack Deployment Script -# Run this on the GPU server after SSH access is established - -set -e # Exit on error - -echo "==================================" -echo "GPU Stack Deployment Script" -echo "==================================" -echo "" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# Functions -print_success() { - echo -e "${GREEN}✓ $1${NC}" -} - -print_error() { - echo -e "${RED}✗ $1${NC}" -} - -print_info() { - echo -e "${YELLOW}→ $1${NC}" -} - -# Check if running as root -if [[ $EUID -ne 0 ]]; then - print_error "This script must be run as root (use sudo)" - exit 1 -fi - -# Step 1: Check prerequisites -print_info "Checking prerequisites..." - -if ! command -v docker &> /dev/null; then - print_error "Docker is not installed. Please run DOCKER_GPU_SETUP.md first." - exit 1 -fi -print_success "Docker installed" - -if ! command -v nvidia-smi &> /dev/null; then - print_error "nvidia-smi not found. Is this a GPU server?" - exit 1 -fi -print_success "NVIDIA GPU detected" - -if ! docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then - print_error "Docker cannot access GPU. Please configure NVIDIA Container Toolkit." 
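    # Hint (assumption): if this check fails, `nvidia-ctk runtime configure --runtime=docker`
    # followed by `systemctl restart docker` usually restores GPU access; then re-run this script.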
- exit 1 -fi -print_success "Docker GPU access working" - -# Step 2: Create directory structure -print_info "Creating directory structure..." - -mkdir -p /workspace/gpu-stack/{vllm,comfyui,training/{configs,data,output},notebooks,monitoring} -cd /workspace/gpu-stack - -print_success "Directory structure created" - -# Step 3: Create .env file -if [ ! -f .env ]; then - print_info "Creating .env file..." - - cat > .env << 'EOF' -# GPU Stack Environment Variables - -# Timezone -TIMEZONE=Europe/Berlin - -# VPN Network -VPS_IP=10.8.0.1 -GPU_IP=10.8.0.2 - -# Model Storage (network volume) -MODELS_PATH=/workspace/models - -# Hugging Face Token (optional, for gated models like Llama) -# Get from: https://huggingface.co/settings/tokens -HF_TOKEN= - -# Weights & Biases (optional, for training logging) -# Get from: https://wandb.ai/authorize -WANDB_API_KEY= - -# JupyterLab Access Token -JUPYTER_TOKEN=pivoine-ai-2025 - -# PostgreSQL (on VPS) -DB_HOST=10.8.0.1 -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui -EOF - - chmod 600 .env - print_success ".env file created (please edit with your tokens)" -else - print_success ".env file already exists" -fi - -# Step 4: Download docker-compose.yaml -print_info "Downloading docker-compose.yaml..." - -# In production, this would be copied from the repo -# For now, assume it's already in the current directory -if [ ! -f docker-compose.yaml ]; then - print_error "docker-compose.yaml not found. Please copy gpu-server-compose.yaml to docker-compose.yaml" - exit 1 -fi - -print_success "docker-compose.yaml found" - -# Step 5: Pre-download models (optional but recommended) -print_info "Do you want to pre-download models? (y/n)" -read -r response - -if [[ "$response" =~ ^[Yy]$ ]]; then - print_info "Downloading Llama 3.1 8B Instruct (this will take a while)..." - - mkdir -p /workspace/models - - # Use huggingface-cli to download - pip install -q huggingface-hub - - huggingface-cli download \ - meta-llama/Meta-Llama-3.1-8B-Instruct \ - --local-dir /workspace/models/Meta-Llama-3.1-8B-Instruct \ - --local-dir-use-symlinks False || print_error "Model download failed (may need HF_TOKEN)" - - print_success "Model downloaded to /workspace/models" -fi - -# Step 6: Start services -print_info "Starting GPU stack services..." - -docker compose up -d vllm comfyui jupyter netdata - -print_success "Services starting (this may take a few minutes)..." - -# Step 7: Wait for services -print_info "Waiting for services to be ready..." - -sleep 10 - -# Check service health -print_info "Checking service status..." - -if docker ps | grep -q gpu_vllm; then - print_success "vLLM container running" -else - print_error "vLLM container not running" -fi - -if docker ps | grep -q gpu_comfyui; then - print_success "ComfyUI container running" -else - print_error "ComfyUI container not running" -fi - -if docker ps | grep -q gpu_jupyter; then - print_success "JupyterLab container running" -else - print_error "JupyterLab container not running" -fi - -if docker ps | grep -q gpu_netdata; then - print_success "Netdata container running" -else - print_error "Netdata container not running" -fi - -# Step 8: Display access information -echo "" -echo "==================================" -echo "Deployment Complete!" 
-echo "==================================" -echo "" -echo "Services accessible via VPN (from VPS):" -echo " - vLLM API: http://10.8.0.2:8000" -echo " - ComfyUI: http://10.8.0.2:8188" -echo " - JupyterLab: http://10.8.0.2:8888 (token: pivoine-ai-2025)" -echo " - Netdata: http://10.8.0.2:19999" -echo "" -echo "Local access (from GPU server):" -echo " - vLLM API: http://localhost:8000" -echo " - ComfyUI: http://localhost:8188" -echo " - JupyterLab: http://localhost:8888" -echo " - Netdata: http://localhost:19999" -echo "" -echo "Useful commands:" -echo " - View logs: docker compose logs -f" -echo " - Check status: docker compose ps" -echo " - Stop all: docker compose down" -echo " - Restart service: docker compose restart vllm" -echo " - Start training: docker compose --profile training up -d axolotl" -echo "" -echo "Next steps:" -echo " 1. Wait for vLLM to load model (check logs: docker compose logs -f vllm)" -echo " 2. Test vLLM: curl http://localhost:8000/v1/models" -echo " 3. Configure LiteLLM on VPS to use http://10.8.0.2:8000" -echo " 4. Download ComfyUI models via web interface" -echo "" - -# Step 9: Create helpful aliases -print_info "Creating helpful aliases..." - -cat >> ~/.bashrc << 'EOF' - -# GPU Stack Aliases -alias gpu-logs='cd /workspace/gpu-stack && docker compose logs -f' -alias gpu-ps='cd /workspace/gpu-stack && docker compose ps' -alias gpu-restart='cd /workspace/gpu-stack && docker compose restart' -alias gpu-down='cd /workspace/gpu-stack && docker compose down' -alias gpu-up='cd /workspace/gpu-stack && docker compose up -d' -alias gpu-stats='watch -n 1 nvidia-smi' -alias gpu-top='nvtop' -EOF - -print_success "Aliases added to ~/.bashrc (reload with: source ~/.bashrc)" - -echo "" -print_success "All done! 🚀" diff --git a/ai/docker-compose.gpu.yaml b/ai/docker-compose.gpu.yaml deleted file mode 100644 index 9ddfe84..0000000 --- a/ai/docker-compose.gpu.yaml +++ /dev/null @@ -1,104 +0,0 @@ -version: '3.8' - -# Multi-Modal AI Orchestration for RunPod RTX 4090 -# Manages text, image, and music generation with sequential model loading - -services: - # ============================================================================ - # ORCHESTRATOR (Always Running) - # ============================================================================ - orchestrator: - build: ./model-orchestrator - container_name: ai_orchestrator - ports: - - "9000:9000" - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./model-orchestrator/models.yaml:/app/models.yaml:ro - environment: - - MODELS_CONFIG=/app/models.yaml - - COMPOSE_PROJECT_NAME=ai - - GPU_MEMORY_GB=24 - restart: unless-stopped - network_mode: host - - # ============================================================================ - # TEXT GENERATION (vLLM + Qwen 2.5 7B) - # ============================================================================ - vllm-qwen: - build: ./vllm - container_name: ai_vllm-qwen_1 - ports: - - "8001:8000" - volumes: - - /workspace/huggingface_cache:/workspace/huggingface_cache - environment: - - HF_TOKEN=${HF_TOKEN} - - VLLM_HOST=0.0.0.0 - - VLLM_PORT=8000 - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["text"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - - # ============================================================================ - # IMAGE GENERATION (Flux.1 Schnell) - # ============================================================================ - flux: - image: 
ghcr.io/matatonic/openedai-images-flux:latest - container_name: ai_flux_1 - ports: - - "8002:5005" - volumes: - - /workspace/flux/models:/app/models - - ./flux/config:/app/config:ro - environment: - - HF_TOKEN=${HF_TOKEN} - - CONFIG_PATH=/app/config/config.json - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["image"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - - # ============================================================================ - # MUSIC GENERATION (MusicGen Medium) - # ============================================================================ - musicgen: - build: ./musicgen - container_name: ai_musicgen_1 - ports: - - "8003:8000" - volumes: - - /workspace/musicgen/models:/app/models - environment: - - HF_TOKEN=${HF_TOKEN} - - MODEL_NAME=facebook/musicgen-medium - - HOST=0.0.0.0 - - PORT=8000 - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["audio"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - -# ============================================================================ -# VOLUMES -# ============================================================================ -# Model caches are stored on RunPod's /workspace directory (922TB network volume) -# This persists across pod restarts and reduces model download times - -# No named volumes - using host paths on RunPod /workspace diff --git a/ai/flux/config/config.json b/ai/flux/config/config.json deleted file mode 100644 index 50d9669..0000000 --- a/ai/flux/config/config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "model": "flux-schnell", - "offload": true, - "sequential_cpu_offload": false, - "vae_tiling": true, - "enable_model_cpu_offload": true, - "low_vram_mode": false, - "torch_compile": false, - "safety_checker": false, - "watermark": false, - "flux_device": "cuda", - "compile": false -} diff --git a/ai/gpu-server-compose.yaml b/ai/gpu-server-compose.yaml deleted file mode 100644 index 9cb2f70..0000000 --- a/ai/gpu-server-compose.yaml +++ /dev/null @@ -1,237 +0,0 @@ -# GPU Server Docker Compose Configuration -# Deploy on RunPod GPU server (10.8.0.2) -# Services accessible from VPS (10.8.0.1) via WireGuard VPN - -version: '3.8' - -services: - # ============================================================================= - # vLLM - High-performance LLM Inference Server - # ============================================================================= - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: "0" - HF_TOKEN: ${HF_TOKEN:-} - volumes: - - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct # Change model here - - --host - - 0.0.0.0 - - --port - - 8000 - - --tensor-parallel-size - - "1" - - --gpu-memory-utilization - - "0.85" # Leave 15% for other tasks - - --max-model-len - - "8192" - - --dtype - - auto - - --trust-remote-code - ports: - - "8000:8000" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 120s # Model loading takes time - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=vllm" - - "stack=gpu-ai" - - # 
============================================================================= - # ComfyUI - Advanced Stable Diffusion Interface - # ============================================================================= - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - TZ: ${TIMEZONE:-Europe/Berlin} - # ComfyUI auto-installs custom nodes on first run - COMFYUI_FLAGS: "--listen 0.0.0.0 --port 8188" - volumes: - - comfyui_data:/data - - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - - comfyui_input:/opt/ComfyUI/input - ports: - - "8188:8188" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8188/"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 60s - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=comfyui" - - "stack=gpu-ai" - - # ============================================================================= - # Axolotl - LLM Fine-tuning Framework - # ============================================================================= - # Note: This service uses "profiles" - only starts when explicitly requested - # Start with: docker compose --profile training up -d axolotl - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - ${MODELS_PATH:-/workspace/models}:/workspace/models - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - WANDB_API_KEY: ${WANDB_API_KEY:-} - HF_TOKEN: ${HF_TOKEN:-} - working_dir: /workspace - # Default command - override when running specific training - command: sleep infinity - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: - - training - labels: - - "service=axolotl" - - "stack=gpu-ai" - - # ============================================================================= - # JupyterLab - Interactive Development Environment - # ============================================================================= - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - restart: unless-stopped - runtime: nvidia - volumes: - - ./notebooks:/workspace/notebooks - - ${MODELS_PATH:-/workspace/models}:/workspace/models - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025} - HF_TOKEN: ${HF_TOKEN:-} - command: | - bash -c " - pip install --quiet jupyterlab transformers datasets accelerate bitsandbytes peft trl sentencepiece protobuf && - jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser --NotebookApp.token='${JUPYTER_TOKEN:-pivoine-ai-2025}' - " - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8888/"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 60s - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=jupyter" - - "stack=gpu-ai" - - # ============================================================================= - # Netdata - System & GPU Monitoring - # ============================================================================= - netdata: - image: netdata/netdata:latest - 
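    # Note (assumption): runtime: nvidia plus NVIDIA_VISIBLE_DEVICES below is what lets Netdata's
    # nvidia_smi collector read GPU utilization and memory for the dashboard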
container_name: gpu_netdata - restart: unless-stopped - runtime: nvidia - hostname: gpu-runpod - cap_add: - - SYS_PTRACE - - SYS_ADMIN - security_opt: - - apparmor:unconfined - environment: - NVIDIA_VISIBLE_DEVICES: all - TZ: ${TIMEZONE:-Europe/Berlin} - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - - /etc/os-release:/host/etc/os-release:ro - - netdata_config:/etc/netdata - - netdata_cache:/var/cache/netdata - - netdata_lib:/var/lib/netdata - ports: - - "19999:19999" - labels: - - "service=netdata" - - "stack=gpu-ai" - -# ============================================================================= -# Volumes -# ============================================================================= -volumes: - # ComfyUI data - comfyui_data: - driver: local - comfyui_output: - driver: local - comfyui_input: - driver: local - - # Training data - training_cache: - driver: local - - # Jupyter data - jupyter_cache: - driver: local - - # Netdata data - netdata_config: - driver: local - netdata_cache: - driver: local - netdata_lib: - driver: local - -# ============================================================================= -# Networks -# ============================================================================= -networks: - default: - driver: bridge - ipam: - config: - - subnet: 172.25.0.0/24 diff --git a/ai/litellm-config-gpu.yaml b/ai/litellm-config-gpu.yaml deleted file mode 100644 index 5313d64..0000000 --- a/ai/litellm-config-gpu.yaml +++ /dev/null @@ -1,199 +0,0 @@ -# LiteLLM Configuration with GPU Server Integration -# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server) - -model_list: - # ============================================================================= - # Anthropic Claude Models (API-based, for complex reasoning) - # ============================================================================= - - - model_name: claude-sonnet-4 - litellm_params: - model: anthropic/claude-sonnet-4-20250514 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-sonnet-4.5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-5-sonnet - litellm_params: - model: anthropic/claude-3-5-sonnet-20241022 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-opus - litellm_params: - model: anthropic/claude-3-opus-20240229 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-haiku - litellm_params: - model: anthropic/claude-3-haiku-20240307 - api_key: os.environ/ANTHROPIC_API_KEY - - # ============================================================================= - # Self-Hosted Models (vLLM on GPU server via WireGuard VPN) - # ============================================================================= - - # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://10.8.0.2:8000/v1 - api_key: dummy # vLLM doesn't require auth - rpm: 1000 # Rate limit: requests per minute - tpm: 100000 # Rate limit: tokens per minute - - # Alternative models (uncomment and configure on GPU server as needed) - - # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning - # - model_name: qwen-2.5-14b - # litellm_params: - # model: openai/Qwen/Qwen2.5-14B-Instruct - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 800 - # tpm: 80000 - - # Mistral 7B Instruct - 
Very fast, lightweight - # - model_name: mistral-7b - # litellm_params: - # model: openai/mistralai/Mistral-7B-Instruct-v0.3 - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 1200 - # tpm: 120000 - - # DeepSeek Coder 6.7B - Code generation specialist - # - model_name: deepseek-coder-6.7b - # litellm_params: - # model: openai/deepseek-ai/deepseek-coder-6.7b-instruct - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 1000 - # tpm: 100000 - -# ============================================================================= -# Router Settings - Intelligent Model Selection -# ============================================================================= - -# Model aliases for easy switching in Open WebUI -model_name_map: - # Default model (self-hosted, fast) - gpt-3.5-turbo: llama-3.1-8b - - # Power users can use Claude for complex tasks - gpt-4: claude-sonnet-4.5 - gpt-4-turbo: claude-sonnet-4.5 - -# LiteLLM Settings -litellm_settings: - drop_params: true - set_verbose: false # Disable verbose logging for better performance - - # Enable caching with Redis for better performance - cache: true - cache_params: - type: redis - host: redis - port: 6379 - ttl: 3600 # Cache for 1 hour - - # Force strip specific parameters globally - allowed_fails: 0 - - # Modify params before sending to provider - modify_params: true - - # Enable success and failure logging but minimize overhead - success_callback: [] # Disable all success callbacks to reduce DB writes - failure_callback: [] # Disable all failure callbacks - -# Router Settings -router_settings: - allowed_fails: 0 - - # Routing strategy: Try self-hosted first, fallback to Claude on failure - routing_strategy: simple-shuffle - - # Cooldown for failed models - cooldown_time: 30 # seconds - -# Drop unsupported parameters -default_litellm_params: - drop_params: true - -# General Settings -general_settings: - disable_responses_id_security: true - - # Disable spend tracking to reduce database overhead - disable_spend_logs: false # Keep enabled to track API vs GPU costs - - # Disable tag tracking - disable_tag_tracking: true - - # Disable daily spend updates - disable_daily_spend_logs: false # Keep enabled for cost analysis - - # Master key for authentication (set via env var) - master_key: os.environ/LITELLM_MASTER_KEY - - # Database for logging (optional but recommended for cost tracking) - database_url: os.environ/DATABASE_URL - - # Enable OpenAPI docs - docs_url: /docs - -# ============================================================================= -# Usage Guidelines (for Open WebUI users) -# ============================================================================= -# -# Model Selection Guide: -# -# Use llama-3.1-8b for: -# - General chat and Q&A -# - Simple code generation -# - Data extraction -# - Summarization -# - Translation -# - Most routine tasks -# Cost: ~$0/month (self-hosted) -# Speed: ~50-80 tokens/second -# -# Use qwen-2.5-14b for: -# - Complex reasoning -# - Multi-step problems -# - Advanced code generation -# - Multilingual tasks -# Cost: ~$0/month (self-hosted) -# Speed: ~30-50 tokens/second -# -# Use claude-sonnet-4.5 for: -# - Very complex reasoning -# - Long documents (200K context) -# - Production-critical code -# - When quality matters most -# Cost: ~$3/million input tokens, ~$15/million output tokens -# Speed: ~30-40 tokens/second -# -# Use claude-3-haiku for: -# - API fallback (if self-hosted down) -# - Very fast responses needed -# Cost: ~$0.25/million input tokens, ~$1.25/million output tokens -# 
Speed: ~60-80 tokens/second -# -# ============================================================================= - -# Health Check Configuration -health_check: - # Check vLLM health endpoint - enabled: true - interval: 30 # seconds - timeout: 5 # seconds - -# Fallback Configuration -# If GPU server is down, automatically use Claude -fallback: - - ["llama-3.1-8b", "claude-3-haiku"] - - ["qwen-2.5-14b", "claude-sonnet-4.5"] diff --git a/ai/model-orchestrator/Dockerfile b/ai/model-orchestrator/Dockerfile deleted file mode 100644 index bcee1e9..0000000 --- a/ai/model-orchestrator/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements and install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY orchestrator.py . -COPY models.yaml . - -# Expose port -EXPOSE 9000 - -# Run the orchestrator -CMD ["python", "orchestrator.py"] diff --git a/ai/model-orchestrator/models.yaml b/ai/model-orchestrator/models.yaml deleted file mode 100644 index caf6a95..0000000 --- a/ai/model-orchestrator/models.yaml +++ /dev/null @@ -1,89 +0,0 @@ -# Model Registry for AI Orchestrator -# Add new models by appending to this file - -models: - # Text Generation Models - qwen-2.5-7b: - type: text - framework: vllm - docker_service: vllm-qwen - port: 8001 - vram_gb: 14 - startup_time_seconds: 120 - endpoint: /v1/chat/completions - description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required" - - # Image Generation Models - flux-schnell: - type: image - framework: openedai-images - docker_service: flux - port: 8002 - vram_gb: 14 - startup_time_seconds: 60 - endpoint: /v1/images/generations - description: "Flux.1 Schnell - Fast high-quality image generation (4-5 sec/image)" - - # Music Generation Models - musicgen-medium: - type: audio - framework: audiocraft - docker_service: musicgen - port: 8003 - vram_gb: 11 - startup_time_seconds: 45 - endpoint: /v1/audio/generations - description: "MusicGen Medium - Text-to-music generation (60-90 sec for 30s audio)" - -# Example: Add more models easily by uncommenting and customizing below - -# Future Text Models: -# llama-3.1-8b: -# type: text -# framework: vllm -# docker_service: vllm-llama -# port: 8004 -# vram_gb: 17 -# startup_time_seconds: 120 -# endpoint: /v1/chat/completions -# description: "Llama 3.1 8B Instruct - Meta's latest model" - -# Future Image Models: -# sdxl: -# type: image -# framework: openedai-images -# docker_service: sdxl -# port: 8005 -# vram_gb: 10 -# startup_time_seconds: 45 -# endpoint: /v1/images/generations -# description: "Stable Diffusion XL - High quality image generation" - -# Future Audio Models: -# whisper-large: -# type: audio -# framework: faster-whisper -# docker_service: whisper -# port: 8006 -# vram_gb: 3 -# startup_time_seconds: 30 -# endpoint: /v1/audio/transcriptions -# description: "Whisper Large v3 - Speech-to-text transcription" -# -# xtts-v2: -# type: audio -# framework: openedai-speech -# docker_service: tts -# port: 8007 -# vram_gb: 3 -# startup_time_seconds: 30 -# endpoint: /v1/audio/speech -# description: "XTTS v2 - High-quality text-to-speech with voice cloning" - -# Configuration -config: - gpu_memory_total_gb: 24 - allow_concurrent_loading: false # Sequential loading only - model_switch_timeout_seconds: 300 # 5 minutes max for model switching - 
health_check_interval_seconds: 10 - default_model: qwen-2.5-7b diff --git a/ai/model-orchestrator/orchestrator.py b/ai/model-orchestrator/orchestrator.py deleted file mode 100644 index 9091537..0000000 --- a/ai/model-orchestrator/orchestrator.py +++ /dev/null @@ -1,359 +0,0 @@ -#!/usr/bin/env python3 -""" -AI Model Orchestrator for RunPod RTX 4090 -Manages sequential loading of text, image, and music models on a single GPU - -Features: -- Automatic model switching based on request type -- OpenAI-compatible API endpoints -- Docker Compose service management -- GPU memory monitoring -- Simple YAML configuration for adding new models -""" - -import asyncio -import logging -import os -import time -from typing import Dict, Optional, Any - -import docker -import httpx -import yaml -from fastapi import FastAPI, Request, HTTPException -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="AI Model Orchestrator", version="1.0.0") - -# Docker client -docker_client = docker.from_env() - -# Global state -current_model: Optional[str] = None -model_registry: Dict[str, Dict[str, Any]] = {} -config: Dict[str, Any] = {} - - -def load_model_registry(): - """Load model registry from models.yaml""" - global model_registry, config - - config_path = os.getenv("MODELS_CONFIG", "/app/models.yaml") - logger.info(f"Loading model registry from {config_path}") - - with open(config_path, 'r') as f: - data = yaml.safe_load(f) - - model_registry = data.get('models', {}) - config = data.get('config', {}) - - logger.info(f"Loaded {len(model_registry)} models from registry") - for model_name, model_info in model_registry.items(): - logger.info(f" - {model_name}: {model_info['description']}") - - -def get_docker_service_name(service_name: str) -> str: - """Get full Docker service name with project prefix""" - project_name = os.getenv("COMPOSE_PROJECT_NAME", "ai") - return f"{project_name}_{service_name}_1" - - -async def stop_current_model(): - """Stop the currently running model service""" - global current_model - - if not current_model: - logger.info("No model currently running") - return - - model_info = model_registry.get(current_model) - if not model_info: - logger.warning(f"Model {current_model} not found in registry") - current_model = None - return - - service_name = get_docker_service_name(model_info['docker_service']) - logger.info(f"Stopping model: {current_model} (service: {service_name})") - - try: - container = docker_client.containers.get(service_name) - container.stop(timeout=30) - logger.info(f"Stopped {current_model}") - current_model = None - except docker.errors.NotFound: - logger.warning(f"Container {service_name} not found (already stopped?)") - current_model = None - except Exception as e: - logger.error(f"Error stopping {service_name}: {e}") - raise - - -async def start_model(model_name: str): - """Start a model service""" - global current_model - - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found in registry") - - model_info = model_registry[model_name] - service_name = get_docker_service_name(model_info['docker_service']) - - logger.info(f"Starting model: {model_name} (service: {service_name})") - logger.info(f" VRAM requirement: {model_info['vram_gb']} GB") - logger.info(f" Estimated 
startup time: {model_info['startup_time_seconds']}s") - - try: - # Start the container - container = docker_client.containers.get(service_name) - container.start() - - # Wait for service to be healthy - port = model_info['port'] - endpoint = model_info.get('endpoint', '/') - base_url = f"http://localhost:{port}" - - logger.info(f"Waiting for {model_name} to be ready at {base_url}...") - - max_wait = model_info['startup_time_seconds'] + 60 # Add buffer - start_time = time.time() - - async with httpx.AsyncClient() as client: - while time.time() - start_time < max_wait: - try: - # Try health check or root endpoint - health_url = f"{base_url}/health" - try: - response = await client.get(health_url, timeout=5.0) - if response.status_code == 200: - logger.info(f"{model_name} is ready!") - current_model = model_name - return - except: - # Try root endpoint if /health doesn't exist - response = await client.get(base_url, timeout=5.0) - if response.status_code == 200: - logger.info(f"{model_name} is ready!") - current_model = model_name - return - except Exception as e: - logger.debug(f"Waiting for {model_name}... ({e})") - - await asyncio.sleep(5) - - raise HTTPException( - status_code=503, - detail=f"Model {model_name} failed to start within {max_wait}s" - ) - - except docker.errors.NotFound: - raise HTTPException( - status_code=500, - detail=f"Docker service {service_name} not found. Is it defined in docker-compose?" - ) - except Exception as e: - logger.error(f"Error starting {model_name}: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -async def ensure_model_running(model_name: str): - """Ensure the specified model is running, switching if necessary""" - global current_model - - if current_model == model_name: - logger.info(f"Model {model_name} already running") - return - - logger.info(f"Switching model: {current_model} -> {model_name}") - - # Stop current model - await stop_current_model() - - # Start requested model - await start_model(model_name) - - logger.info(f"Model switch complete: {model_name} is now active") - - -async def proxy_request(model_name: str, request: Request): - """Proxy request to the active model service""" - model_info = model_registry[model_name] - port = model_info['port'] - - # Get request details - path = request.url.path - method = request.method - headers = dict(request.headers) - headers.pop('host', None) # Remove host header - - # Build target URL - target_url = f"http://localhost:{port}{path}" - - logger.info(f"Proxying {method} request to {target_url}") - - async with httpx.AsyncClient(timeout=300.0) as client: - # Handle different request types - if method == "GET": - response = await client.get(target_url, headers=headers) - elif method == "POST": - body = await request.body() - response = await client.post(target_url, content=body, headers=headers) - else: - raise HTTPException(status_code=405, detail=f"Method {method} not supported") - - # Return response - return JSONResponse( - content=response.json() if response.headers.get('content-type', '').startswith('application/json') else response.text, - status_code=response.status_code, - headers=dict(response.headers) - ) - - -@app.on_event("startup") -async def startup_event(): - """Load model registry on startup""" - load_model_registry() - logger.info("AI Model Orchestrator started successfully") - logger.info(f"GPU Memory: {config.get('gpu_memory_total_gb', 24)} GB") - logger.info(f"Default model: {config.get('default_model', 'qwen-2.5-7b')}") - - -@app.get("/") -async def root(): - 
"""Root endpoint""" - return { - "service": "AI Model Orchestrator", - "version": "1.0.0", - "current_model": current_model, - "available_models": list(model_registry.keys()) - } - - -@app.get("/health") -async def health(): - """Health check endpoint""" - return { - "status": "healthy", - "current_model": current_model, - "model_info": model_registry.get(current_model) if current_model else None, - "gpu_memory_total_gb": config.get('gpu_memory_total_gb', 24), - "models_available": len(model_registry) - } - - -@app.get("/models") -async def list_models(): - """List all available models""" - return { - "models": model_registry, - "current_model": current_model - } - - -@app.post("/v1/chat/completions") -async def chat_completions(request: Request): - """OpenAI-compatible chat completions endpoint (text models)""" - # Parse request to get model name - body = await request.json() - model_name = body.get('model', config.get('default_model', 'qwen-2.5-7b')) - - # Validate model type - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - if model_registry[model_name]['type'] != 'text': - raise HTTPException(status_code=400, detail=f"Model {model_name} is not a text model") - - # Ensure model is running - await ensure_model_running(model_name) - - # Proxy request to model - return await proxy_request(model_name, request) - - -@app.post("/v1/images/generations") -async def image_generations(request: Request): - """OpenAI-compatible image generation endpoint""" - # Parse request to get model name - body = await request.json() - model_name = body.get('model', 'flux-schnell') - - # Validate model type - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - if model_registry[model_name]['type'] != 'image': - raise HTTPException(status_code=400, detail=f"Model {model_name} is not an image model") - - # Ensure model is running - await ensure_model_running(model_name) - - # Proxy request to model - return await proxy_request(model_name, request) - - -@app.post("/v1/audio/generations") -async def audio_generations(request: Request): - """Custom audio generation endpoint (music/sound effects)""" - # Parse request to get model name - body = await request.json() - model_name = body.get('model', 'musicgen-medium') - - # Validate model type - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - if model_registry[model_name]['type'] != 'audio': - raise HTTPException(status_code=400, detail=f"Model {model_name} is not an audio model") - - # Ensure model is running - await ensure_model_running(model_name) - - # Proxy request to model - return await proxy_request(model_name, request) - - -@app.post("/switch") -async def switch_model(request: Request): - """Manually switch to a specific model""" - body = await request.json() - model_name = body.get('model') - - if not model_name: - raise HTTPException(status_code=400, detail="Model name required") - - if model_name not in model_registry: - raise HTTPException(status_code=404, detail=f"Model {model_name} not found") - - await ensure_model_running(model_name) - - return { - "status": "success", - "model": model_name, - "message": f"Switched to {model_name}" - } - - -if __name__ == "__main__": - import uvicorn - - host = os.getenv("HOST", "0.0.0.0") - port = int(os.getenv("PORT", "9000")) - - logger.info(f"Starting AI Model Orchestrator on {host}:{port}") - - uvicorn.run( 
- app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/ai/model-orchestrator/requirements.txt b/ai/model-orchestrator/requirements.txt deleted file mode 100644 index 794b4af..0000000 --- a/ai/model-orchestrator/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -httpx==0.25.1 -docker==6.1.3 -pyyaml==6.0.1 -pydantic==2.5.0 diff --git a/ai/musicgen/Dockerfile b/ai/musicgen/Dockerfile deleted file mode 100644 index 5044496..0000000 --- a/ai/musicgen/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 - -WORKDIR /app - -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3-pip \ - ffmpeg \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Upgrade pip -RUN pip3 install --no-cache-dir --upgrade pip - -# Install PyTorch with CUDA support -RUN pip3 install --no-cache-dir torch==2.1.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 - -# Copy requirements and install dependencies -COPY requirements.txt . -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application code -COPY server.py . - -# Create directory for model cache -RUN mkdir -p /app/models - -# Environment variables -ENV HF_HOME=/app/models -ENV TORCH_HOME=/app/models -ENV MODEL_NAME=facebook/musicgen-medium - -# Expose port -EXPOSE 8000 - -# Run the server -CMD ["python3", "server.py"] diff --git a/ai/musicgen/requirements.txt b/ai/musicgen/requirements.txt deleted file mode 100644 index 37cf773..0000000 --- a/ai/musicgen/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -torch==2.1.0 -torchaudio==2.1.0 -audiocraft==1.3.0 -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 diff --git a/ai/musicgen/server.py b/ai/musicgen/server.py deleted file mode 100644 index 5ea6218..0000000 --- a/ai/musicgen/server.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python3 -""" -MusicGen API Server -OpenAI-compatible API for music generation using Meta's MusicGen - -Endpoints: -- POST /v1/audio/generations - Generate music from text prompt -- GET /health - Health check -- GET / - Service info -""" - -import base64 -import io -import logging -import os -import tempfile -from typing import Optional - -import torch -import torchaudio -from audiocraft.models import MusicGen -from fastapi import FastAPI, HTTPException -from fastapi.responses import JSONResponse -from pydantic import BaseModel, Field - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="MusicGen API Server", version="1.0.0") - -# Global model instance -model: Optional[MusicGen] = None -model_name: str = os.getenv("MODEL_NAME", "facebook/musicgen-medium") -device: str = "cuda" if torch.cuda.is_available() else "cpu" - - -class AudioGenerationRequest(BaseModel): - """Music generation request""" - model: str = Field(default="musicgen-medium", description="Model name") - prompt: str = Field(..., description="Text description of the music to generate") - duration: float = Field(default=30.0, ge=1.0, le=30.0, description="Duration in seconds") - temperature: float = Field(default=1.0, ge=0.1, le=2.0, description="Sampling temperature") - top_k: int = Field(default=250, ge=0, le=500, description="Top-k sampling") - top_p: float = Field(default=0.0, ge=0.0, le=1.0, description="Top-p (nucleus) sampling") - cfg_coef: float = 
Field(default=3.0, ge=1.0, le=15.0, description="Classifier-free guidance coefficient") - response_format: str = Field(default="wav", description="Audio format (wav or mp3)") - - -class AudioGenerationResponse(BaseModel): - """Music generation response""" - audio: str = Field(..., description="Base64-encoded audio data") - format: str = Field(..., description="Audio format (wav or mp3)") - duration: float = Field(..., description="Duration in seconds") - sample_rate: int = Field(..., description="Sample rate in Hz") - - -@app.on_event("startup") -async def startup_event(): - """Load MusicGen model on startup""" - global model - - logger.info(f"Loading MusicGen model: {model_name}") - logger.info(f"Device: {device}") - - # Load model - model = MusicGen.get_pretrained(model_name, device=device) - - logger.info(f"MusicGen model loaded successfully") - logger.info(f"Max duration: 30 seconds at 32kHz") - - -@app.get("/") -async def root(): - """Root endpoint""" - return { - "service": "MusicGen API Server", - "model": model_name, - "device": device, - "max_duration": 30.0, - "sample_rate": 32000 - } - - -@app.get("/health") -async def health(): - """Health check endpoint""" - return { - "status": "healthy" if model else "initializing", - "model": model_name, - "device": device, - "ready": model is not None, - "gpu_available": torch.cuda.is_available() - } - - -@app.post("/v1/audio/generations") -async def generate_audio(request: AudioGenerationRequest) -> AudioGenerationResponse: - """Generate music from text prompt""" - if not model: - raise HTTPException(status_code=503, detail="Model not initialized") - - logger.info(f"Generating music: {request.prompt[:100]}...") - logger.info(f"Duration: {request.duration}s, Temperature: {request.temperature}") - - try: - # Set generation parameters - model.set_generation_params( - duration=request.duration, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - cfg_coef=request.cfg_coef, - ) - - # Generate audio - descriptions = [request.prompt] - with torch.no_grad(): - wav = model.generate(descriptions) - - # wav shape: [batch_size, channels, samples] - # Extract first batch item - audio_data = wav[0].cpu() # [channels, samples] - - # Get sample rate - sample_rate = model.sample_rate - - # Save to temporary file - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: - temp_path = temp_file.name - torchaudio.save(temp_path, audio_data, sample_rate) - - # Read audio file and encode to base64 - with open(temp_path, 'rb') as f: - audio_bytes = f.read() - - # Clean up temporary file - os.unlink(temp_path) - - # Encode to base64 - audio_base64 = base64.b64encode(audio_bytes).decode('utf-8') - - logger.info(f"Generated {request.duration}s of audio") - - return AudioGenerationResponse( - audio=audio_base64, - format="wav", - duration=request.duration, - sample_rate=sample_rate - ) - - except Exception as e: - logger.error(f"Error generating audio: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/v1/models") -async def list_models(): - """List available models (OpenAI-compatible)""" - return { - "object": "list", - "data": [ - { - "id": "musicgen-medium", - "object": "model", - "created": 1234567890, - "owned_by": "meta", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - - -if __name__ == "__main__": - import uvicorn - - host = os.getenv("HOST", "0.0.0.0") - port = int(os.getenv("PORT", "8000")) - - logger.info(f"Starting MusicGen API server on {host}:{port}") - 
- uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/ai/simple_vllm_server.py b/ai/simple_vllm_server.py deleted file mode 100644 index 0075bd2..0000000 --- a/ai/simple_vllm_server.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple vLLM server using AsyncLLMEngine directly -Bypasses the multiprocessing issues we hit with the default vLLM API server -OpenAI-compatible endpoints: /v1/models and /v1/completions -""" - -import asyncio -import json -import logging -import os -from typing import AsyncIterator, Dict, List, Optional - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel, Field -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams -from vllm.utils import random_uuid - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="Simple vLLM Server", version="1.0.0") - -# Global engine instance -engine: Optional[AsyncLLMEngine] = None -model_name: str = "Qwen/Qwen2.5-7B-Instruct" - -# Request/Response models -class CompletionRequest(BaseModel): - """OpenAI-compatible completion request""" - model: str = Field(default="qwen-2.5-7b") - prompt: str | List[str] = Field(..., description="Text prompt(s)") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - -class ChatMessage(BaseModel): - """Chat message format""" - role: str = Field(..., description="Role: system, user, or assistant") - content: str = Field(..., description="Message content") - -class ChatCompletionRequest(BaseModel): - """OpenAI-compatible chat completion request""" - model: str = Field(default="qwen-2.5-7b") - messages: List[ChatMessage] = Field(..., description="Chat messages") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - -@app.on_event("startup") -async def startup_event(): - """Initialize vLLM engine on startup""" - global engine, model_name - - logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - - # Configure engine - engine_args = AsyncEngineArgs( - model=model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir="/workspace/huggingface_cache", # Large disk - trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs for better performance - ) - - # Create async engine - engine = AsyncLLMEngine.from_engine_args(engine_args) - - logger.info("vLLM AsyncLLMEngine initialized successfully") - -@app.get("/") -async def root(): - """Health check endpoint""" - return {"status": "ok", "model": model_name} - -@app.get("/health") -async def health(): - """Detailed health check""" - return { - "status": "healthy" if engine else 
"initializing", - "model": model_name, - "ready": engine is not None - } - -@app.get("/v1/models") -async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", - "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - -def messages_to_prompt(messages: List[ChatMessage]) -> str: - """Convert chat messages to a single prompt string""" - # Qwen 2.5 chat template format - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": final_output.outputs[0].finish_reason, - }) - - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Convert messages to prompt - prompt = messages_to_prompt(request.messages) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) - - request_id = 
random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { - "id": request_id, - "object": "chat.completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, - } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } - } - -if __name__ == "__main__": - import uvicorn - - # Get configuration from environment - host = os.getenv("VLLM_HOST", "0.0.0.0") - port = int(os.getenv("VLLM_PORT", "8000")) - - logger.info(f"Starting vLLM server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/ai/vllm/Dockerfile b/ai/vllm/Dockerfile deleted file mode 100644 index 7dde2d6..0000000 --- a/ai/vllm/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 - -WORKDIR /app - -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - python3.11 \ - python3-pip \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Upgrade pip -RUN pip3 install --no-cache-dir --upgrade pip - -# Install vLLM and dependencies -COPY requirements.txt . -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application code -COPY server.py . 
- -# Create directory for model cache -RUN mkdir -p /workspace/huggingface_cache - -# Environment variables -ENV HF_HOME=/workspace/huggingface_cache -ENV VLLM_HOST=0.0.0.0 -ENV VLLM_PORT=8000 - -# Expose port -EXPOSE 8000 - -# Run the server -CMD ["python3", "server.py"] diff --git a/ai/vllm/requirements.txt b/ai/vllm/requirements.txt deleted file mode 100644 index b702e45..0000000 --- a/ai/vllm/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -vllm==0.6.4.post1 -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 diff --git a/ai/vllm/server.py b/ai/vllm/server.py deleted file mode 100644 index 0075bd2..0000000 --- a/ai/vllm/server.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple vLLM server using AsyncLLMEngine directly -Bypasses the multiprocessing issues we hit with the default vLLM API server -OpenAI-compatible endpoints: /v1/models and /v1/completions -""" - -import asyncio -import json -import logging -import os -from typing import AsyncIterator, Dict, List, Optional - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel, Field -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams -from vllm.utils import random_uuid - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="Simple vLLM Server", version="1.0.0") - -# Global engine instance -engine: Optional[AsyncLLMEngine] = None -model_name: str = "Qwen/Qwen2.5-7B-Instruct" - -# Request/Response models -class CompletionRequest(BaseModel): - """OpenAI-compatible completion request""" - model: str = Field(default="qwen-2.5-7b") - prompt: str | List[str] = Field(..., description="Text prompt(s)") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - -class ChatMessage(BaseModel): - """Chat message format""" - role: str = Field(..., description="Role: system, user, or assistant") - content: str = Field(..., description="Message content") - -class ChatCompletionRequest(BaseModel): - """OpenAI-compatible chat completion request""" - model: str = Field(default="qwen-2.5-7b") - messages: List[ChatMessage] = Field(..., description="Chat messages") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - -@app.on_event("startup") -async def startup_event(): - """Initialize vLLM engine on startup""" - global engine, model_name - - logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - - # Configure engine - engine_args = AsyncEngineArgs( - model=model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir="/workspace/huggingface_cache", # Large disk - trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs 
for better performance - ) - - # Create async engine - engine = AsyncLLMEngine.from_engine_args(engine_args) - - logger.info("vLLM AsyncLLMEngine initialized successfully") - -@app.get("/") -async def root(): - """Health check endpoint""" - return {"status": "ok", "model": model_name} - -@app.get("/health") -async def health(): - """Detailed health check""" - return { - "status": "healthy" if engine else "initializing", - "model": model_name, - "ready": engine is not None - } - -@app.get("/v1/models") -async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", - "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - -def messages_to_prompt(messages: List[ChatMessage]) -> str: - """Convert chat messages to a single prompt string""" - # Qwen 2.5 chat template format - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": final_output.outputs[0].finish_reason, - }) - - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not engine: - return JSONResponse( - 
status_code=503, - content={"error": "Engine not initialized"} - ) - - # Convert messages to prompt - prompt = messages_to_prompt(request.messages) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) - - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { - "id": request_id, - "object": "chat.completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, - } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } - } - -if __name__ == "__main__": - import uvicorn - - # Get configuration from environment - host = os.getenv("VLLM_HOST", "0.0.0.0") - port = int(os.getenv("VLLM_PORT", "8000")) - - logger.info(f"Starting vLLM server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - )
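
The services deleted above exposed OpenAI-style HTTP APIs behind the model orchestrator (default port 9000, per orchestrator.py). A minimal client sketch against those endpoints follows for reference; it is illustrative only, not code from the patch. The base URL `http://gpu-host:9000`, the example prompts, and the output filename are placeholders, and the sketch assumes the removed stack is still deployed somewhere reachable (it now lives in the runpod repository).

```python
#!/usr/bin/env python3
"""Illustrative client sketch for the removed orchestrator API.

Assumptions (not taken from this repository): the orchestrator is reachable
at ORCH_URL, and the default models from models.yaml are still registered.
"""

import base64

import httpx  # httpx is the HTTP client the orchestrator itself used

ORCH_URL = "http://gpu-host:9000"  # placeholder host; 9000 is the orchestrator default port


def chat(prompt: str) -> str:
    """Call the OpenAI-compatible chat endpoint; the orchestrator starts vLLM on demand."""
    resp = httpx.post(
        f"{ORCH_URL}/v1/chat/completions",
        json={
            "model": "qwen-2.5-7b",  # default text model in models.yaml
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 256,
        },
        timeout=600.0,  # generous: a cold model switch can take minutes
    )
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]


def make_music(prompt: str, out_path: str = "clip.wav") -> None:
    """Call the custom audio endpoint and decode the base64 WAV it returns."""
    resp = httpx.post(
        f"{ORCH_URL}/v1/audio/generations",
        json={"model": "musicgen-medium", "prompt": prompt, "duration": 15.0},
        timeout=600.0,
    )
    resp.raise_for_status()
    with open(out_path, "wb") as f:
        f.write(base64.b64decode(resp.json()["audio"]))


if __name__ == "__main__":
    print(chat("Say hello in one sentence."))
    make_music("calm ambient piano")
```

Because the orchestrator loads models sequentially, a cold request may block for the full switch window (models.yaml allows up to 300 s), which is why the client timeout above is set well beyond typical response times.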