From cafa0a114760455f7dec4120c467f59d1776ca7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Fri, 21 Nov 2025 14:45:49 +0100 Subject: [PATCH] refactor: clean up runpod repository structure Removed facefusion and VPS-related files: - compose.yaml, postgres/, litellm-config.yaml (VPS services) - Dockerfile, entrypoint.sh, disable-nsfw-filter.patch (facefusion) Removed outdated documentation: - DOCKER_GPU_SETUP.md, README_GPU_SETUP.md, SETUP_GUIDE.md - TAILSCALE_SETUP.md, WIREGUARD_SETUP.md (covered in DEPLOYMENT.md) - GPU_EXPANSION_PLAN.md (historical planning doc) - gpu-server-compose.yaml, litellm-config-gpu.yaml (old versions) - deploy-gpu-stack.sh, simple_vllm_server.py (old scripts) Organized documentation: - Created docs/ directory - Moved DEPLOYMENT.md, RUNPOD_TEMPLATE.md, GPU_DEPLOYMENT_LOG.md to docs/ - Updated all documentation links in README.md Final structure: - Clean root directory with only GPU-specific files - Organized documentation in docs/ - Model services in dedicated directories (model-orchestrator/, vllm/, flux/, musicgen/) - Automation scripts in scripts/ --- DOCKER_GPU_SETUP.md | 430 ------ Dockerfile | 16 - GPU_EXPANSION_PLAN.md | 1306 ----------------- README.md | 23 +- README_GPU_SETUP.md | 444 ------ SETUP_GUIDE.md | 261 ---- TAILSCALE_SETUP.md | 417 ------ WIREGUARD_SETUP.md | 393 ----- compose.yaml | 206 --- deploy-gpu-stack.sh | 229 --- disable-nsfw-filter.patch | 12 - DEPLOYMENT.md => docs/DEPLOYMENT.md | 0 .../GPU_DEPLOYMENT_LOG.md | 0 RUNPOD_TEMPLATE.md => docs/RUNPOD_TEMPLATE.md | 0 entrypoint.sh | 16 - gpu-server-compose.yaml | 237 --- litellm-config-gpu.yaml | 199 --- litellm-config.yaml | 91 -- postgres/init/01-init-databases.sh | 38 - simple_vllm_server.py | 302 ---- 20 files changed, 8 insertions(+), 4612 deletions(-) delete mode 100644 DOCKER_GPU_SETUP.md delete mode 100644 Dockerfile delete mode 100644 GPU_EXPANSION_PLAN.md delete mode 100644 README_GPU_SETUP.md delete mode 100644 SETUP_GUIDE.md delete mode 100644 TAILSCALE_SETUP.md delete mode 100644 WIREGUARD_SETUP.md delete mode 100644 compose.yaml delete mode 100755 deploy-gpu-stack.sh delete mode 100644 disable-nsfw-filter.patch rename DEPLOYMENT.md => docs/DEPLOYMENT.md (100%) rename GPU_DEPLOYMENT_LOG.md => docs/GPU_DEPLOYMENT_LOG.md (100%) rename RUNPOD_TEMPLATE.md => docs/RUNPOD_TEMPLATE.md (100%) delete mode 100755 entrypoint.sh delete mode 100644 gpu-server-compose.yaml delete mode 100644 litellm-config-gpu.yaml delete mode 100644 litellm-config.yaml delete mode 100755 postgres/init/01-init-databases.sh delete mode 100644 simple_vllm_server.py diff --git a/DOCKER_GPU_SETUP.md b/DOCKER_GPU_SETUP.md deleted file mode 100644 index e60d103..0000000 --- a/DOCKER_GPU_SETUP.md +++ /dev/null @@ -1,430 +0,0 @@ -# Docker & NVIDIA Container Toolkit Setup - -## Day 5: Docker Configuration on GPU Server - -This guide sets up Docker with GPU support on your RunPod server. 
- ---- - -## Step 1: Install Docker - -### Quick Install (Recommended) - -```bash -# SSH into GPU server -ssh gpu-pivoine - -# Download and run Docker install script -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# Verify installation -docker --version -docker compose version -``` - -Expected output: -``` -Docker version 24.0.7, build afdd53b -Docker Compose version v2.23.0 -``` - -### Manual Install (Alternative) - -```bash -# Add Docker's official GPG key -apt-get update -apt-get install -y ca-certificates curl gnupg -install -m 0755 -d /etc/apt/keyrings -curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg -chmod a+r /etc/apt/keyrings/docker.gpg - -# Add repository -echo \ - "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \ - $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ - tee /etc/apt/sources.list.d/docker.list > /dev/null - -# Install Docker -apt-get update -apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Start Docker -systemctl enable docker -systemctl start docker -``` - ---- - -## Step 2: Install NVIDIA Container Toolkit - -This enables Docker containers to use the GPU. - -```bash -# Add NVIDIA repository -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \ - gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg - -curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - -# Install toolkit -apt-get update -apt-get install -y nvidia-container-toolkit - -# Configure Docker to use NVIDIA runtime -nvidia-ctk runtime configure --runtime=docker - -# Restart Docker -systemctl restart docker -``` - ---- - -## Step 3: Test GPU Access in Docker - -### Test 1: Basic CUDA Container - -```bash -docker run --rm --runtime=nvidia --gpus all \ - nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -``` - -Expected output: Same as `nvidia-smi` output showing your RTX 4090. - -### Test 2: PyTorch Container - -```bash -docker run --rm --runtime=nvidia --gpus all \ - pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime \ - python -c "import torch; print('CUDA:', torch.cuda.is_available(), 'Device:', torch.cuda.get_device_name(0))" -``` - -Expected output: -``` -CUDA: True Device: NVIDIA GeForce RTX 4090 -``` - -### Test 3: Multi-GPU Query (if you have multiple GPUs) - -```bash -docker run --rm --runtime=nvidia --gpus all \ - nvidia/cuda:12.1.0-base-ubuntu22.04 \ - bash -c "echo 'GPU Count:' && nvidia-smi --list-gpus" -``` - ---- - -## Step 4: Configure Docker Compose with GPU Support - -Docker Compose needs to know about NVIDIA runtime. 
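-
-The daemon-level config below registers the NVIDIA runtime with Docker itself; each Compose service then still has to request the GPU explicitly. As a point of reference, a minimal per-service stanza looks like the sketch below (service name and image are placeholders) — it is the same pattern the smoke test in Step 6 uses:
-
-```yaml
-services:
-  my-gpu-service:                                # placeholder name
-    image: nvidia/cuda:12.1.0-base-ubuntu22.04   # any CUDA-enabled image
-    runtime: nvidia
-    environment:
-      NVIDIA_VISIBLE_DEVICES: all
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
-```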
- -### Create daemon.json - -```bash -cat > /etc/docker/daemon.json << 'EOF' -{ - "runtimes": { - "nvidia": { - "path": "nvidia-container-runtime", - "runtimeArgs": [] - } - }, - "default-runtime": "nvidia", - "log-driver": "json-file", - "log-opts": { - "max-size": "10m", - "max-file": "3" - } -} -EOF - -# Restart Docker -systemctl restart docker -``` - ---- - -## Step 5: Create GPU Project Structure - -```bash -cd /workspace - -# Create directory structure -mkdir -p gpu-stack/{vllm,comfyui,training,jupyter,monitoring} -cd gpu-stack - -# Create .env file -cat > .env << 'EOF' -# GPU Stack Environment Variables - -# Timezone -TIMEZONE=Europe/Berlin - -# VPN Network -VPS_IP=10.8.0.1 -GPU_IP=10.8.0.2 - -# Model Storage -MODELS_PATH=/workspace/models - -# Hugging Face (optional, for private models) -HF_TOKEN= - -# PostgreSQL (on VPS) -DB_HOST=10.8.0.1 -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui - -# Weights & Biases (optional, for training logging) -WANDB_API_KEY= -EOF - -chmod 600 .env -``` - ---- - -## Step 6: Test Full Stack (Quick Smoke Test) - -Let's deploy a minimal vLLM container to verify everything works: - -```bash -cd /workspace/gpu-stack - -# Create test compose file -cat > test-compose.yaml << 'EOF' -services: - test-vllm: - image: vllm/vllm-openai:latest - container_name: test_vllm - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - command: - - --model - - facebook/opt-125m # Tiny model for testing - - --host - - 0.0.0.0 - - --port - - 8000 - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -EOF - -# Start test -docker compose -f test-compose.yaml up -d - -# Wait 30 seconds for model download -sleep 30 - -# Check logs -docker compose -f test-compose.yaml logs - -# Test inference -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "facebook/opt-125m", - "prompt": "Hello, my name is", - "max_tokens": 10 - }' -``` - -Expected output (JSON response with generated text). 
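-
-For reference, the response follows the OpenAI completions schema; the values below are illustrative only (IDs, timestamps, token counts, and generated text will differ):
-
-```json
-{
-  "id": "cmpl-xxxxxxxx",
-  "object": "text_completion",
-  "created": 1700000000,
-  "model": "facebook/opt-125m",
-  "choices": [
-    {
-      "index": 0,
-      "text": " John and I like to play football",
-      "logprobs": null,
-      "finish_reason": "length"
-    }
-  ],
-  "usage": {
-    "prompt_tokens": 6,
-    "completion_tokens": 10,
-    "total_tokens": 16
-  }
-}
-```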
- -**Clean up test:** -```bash -docker compose -f test-compose.yaml down -``` - ---- - -## Step 7: Install Additional Tools - -```bash -# Python tools -apt install -y python3-pip python3-venv - -# Monitoring tools -apt install -y htop nvtop iotop - -# Network tools -apt install -y iperf3 tcpdump - -# Development tools -apt install -y build-essential - -# Git LFS (for large model files) -apt install -y git-lfs -git lfs install -``` - ---- - -## Step 8: Configure Automatic Updates (Optional) - -```bash -# Install unattended-upgrades -apt install -y unattended-upgrades - -# Configure -dpkg-reconfigure -plow unattended-upgrades - -# Enable automatic security updates -cat > /etc/apt/apt.conf.d/50unattended-upgrades << 'EOF' -Unattended-Upgrade::Allowed-Origins { - "${distro_id}:${distro_codename}-security"; -}; -Unattended-Upgrade::Automatic-Reboot "false"; -Unattended-Upgrade::Remove-Unused-Dependencies "true"; -EOF -``` - ---- - -## Troubleshooting - -### Docker can't access GPU - -**Problem:** `docker: Error response from daemon: could not select device driver "" with capabilities: [[gpu]]` - -**Solution:** -```bash -# Verify NVIDIA runtime is configured -docker info | grep -i runtime - -# Should show nvidia in runtimes list -# If not, reinstall nvidia-container-toolkit - -# Check daemon.json -cat /etc/docker/daemon.json - -# Restart Docker -systemctl restart docker -``` - -### Permission denied on docker commands - -**Solution:** -```bash -# Add your user to docker group (if not root) -usermod -aG docker $USER - -# Or always use sudo -sudo docker ... -``` - -### Out of disk space - -**Check usage:** -```bash -df -h -du -sh /var/lib/docker -docker system df -``` - -**Clean up:** -```bash -# Remove unused images -docker image prune -a - -# Remove unused volumes -docker volume prune - -# Full cleanup -docker system prune -a --volumes -``` - ---- - -## Verification Checklist - -Before deploying the full stack: - -- [ ] Docker installed and running -- [ ] `docker --version` shows 24.x or newer -- [ ] `docker compose version` works -- [ ] NVIDIA Container Toolkit installed -- [ ] `docker run --gpus all nvidia/cuda:12.1.0-base nvidia-smi` works -- [ ] PyTorch container can see GPU -- [ ] Test vLLM deployment successful -- [ ] /workspace directory structure created -- [ ] .env file configured with VPN IPs -- [ ] Additional tools installed (nvtop, htop, etc.) - ---- - -## Performance Monitoring Commands - -**GPU Monitoring:** -```bash -# Real-time GPU stats -watch -n 1 nvidia-smi - -# Or with nvtop (prettier) -nvtop - -# GPU memory usage -nvidia-smi --query-gpu=memory.used,memory.total --format=csv -``` - -**Docker Stats:** -```bash -# Container resource usage -docker stats - -# Specific container -docker stats vllm --no-stream -``` - -**System Resources:** -```bash -# Overall system -htop - -# I/O stats -iotop - -# Network -iftop -``` - ---- - -## Next: Deploy Production Stack - -Now you're ready to deploy the full GPU stack with vLLM, ComfyUI, and training tools. 
- -**Proceed to:** Deploying the production docker-compose.yaml - -**Save your progress:** - -```bash -cat >> /workspace/SERVER_INFO.md << 'EOF' - -## Docker Configuration -- Docker Version: [docker --version] -- NVIDIA Runtime: Enabled -- GPU Access in Containers: ✓ -- Test vLLM Deployment: Successful -- Directory: /workspace/gpu-stack - -## Tools Installed -- nvtop: GPU monitoring -- htop: System monitoring -- Docker Compose: v2.x -- Git LFS: Large file support -EOF -``` diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index d48b090..0000000 --- a/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM facefusion/facefusion:3.5.0-cpu - -# Patch content_analyser.py to disable NSFW filter -RUN sed -i '197s/.*/\treturn False # Patched: NSFW filter disabled/' /facefusion/facefusion/content_analyser.py && \ - grep -q 'return False.*Patched' /facefusion/facefusion/content_analyser.py || (echo "ERROR: Patch failed!" && exit 1) - -# Calculate new hash for patched content_analyser -RUN python3 -c "import inspect; import sys; sys.path.insert(0, '/facefusion'); from facefusion import content_analyser; from facefusion.hash_helper import create_hash; content = inspect.getsource(content_analyser).encode(); print('New hash:', create_hash(content))" - -# Update hash check in core.py to accept patched version -RUN NEW_HASH=$(python3 -c "import inspect; import sys; sys.path.insert(0, '/facefusion'); from facefusion import content_analyser; from facefusion.hash_helper import create_hash; content = inspect.getsource(content_analyser).encode(); print(create_hash(content))") && \ - sed -i "s/content_analyser_hash == 'b14e7b92'/content_analyser_hash == '$NEW_HASH'/" /facefusion/facefusion/core.py && \ - echo "Updated hash check in core.py to: $NEW_HASH" - -# Verify both patches were applied -RUN echo "NSFW filter patch successfully applied to image" diff --git a/GPU_EXPANSION_PLAN.md b/GPU_EXPANSION_PLAN.md deleted file mode 100644 index d34ea01..0000000 --- a/GPU_EXPANSION_PLAN.md +++ /dev/null @@ -1,1306 +0,0 @@ -# GPU-Enhanced AI Stack Expansion Plan - -## Executive Summary - -This document outlines a comprehensive plan to extend the current AI stack (LiteLLM, Open WebUI, Crawl4AI) with dedicated GPU hosting capabilities for: -- **LLM Model Hosting**: Self-hosted models (Llama, Mistral, Qwen, etc.) -- **Model Training**: Fine-tuning and training workflows -- **Image Generation**: Stable Diffusion, FLUX via ComfyUI -- **Video Generation**: AnimateDiff, CogVideo, etc. - -**Current Architecture**: CPU-based stack on pivoine.art VPS → Claude API via LiteLLM -**Target Architecture**: Hybrid stack with GPU server(s) for self-hosted models + API-based models - ---- - -## Phase 1: Current Stack Analysis - -### Existing Components - -1. **ai_postgres** (pgvector/pgvector:pg16) - - PostgreSQL with pgvector for RAG - - Stores: conversations, embeddings, LiteLLM logs - -2. **webui** (Open WebUI) - - User-facing ChatGPT-like interface - - URL: https://ai.pivoine.art - - Features: RAG, web search, document upload - - Connected to LiteLLM proxy - -3. **litellm** (LiteLLM proxy) - - Currently proxies Anthropic Claude API - - OpenAI-compatible endpoint at http://litellm:4000 - - Supports multiple providers via config - -4. **crawl4ai** - - Internal web scraping for LLM content prep - - Port 11235 (internal only) - -5. 
**facefusion** (CPU-only) - - Face swapping/enhancement - - Currently CPU-based (slow) - - Protected by Authelia SSO - -### Current Limitations - -- ❌ No self-hosted LLMs (relies on expensive API calls) -- ❌ No GPU acceleration for facefusion -- ❌ No image generation capabilities -- ❌ No model training/fine-tuning capabilities -- ❌ No video generation -- ❌ High operational costs for API usage - ---- - -## Phase 2: GPU Provider Comparison - -### Provider Options - -#### 1. **RunPod** ⭐ RECOMMENDED -**Pros:** -- Pay-per-second GPU billing -- Wide GPU selection (RTX 4090, A100, H100) -- Docker-first platform -- Global locations -- Easy HTTP/SSH tunneling -- Volume persistence - -**Pricing (Approximate):** -- RTX 4090 (24GB): ~$0.50/hour ($360/month 24/7) -- RTX 3090 (24GB): ~$0.35/hour ($250/month) -- A6000 (48GB): ~$0.80/hour ($576/month) -- A100 (40GB): ~$1.50/hour ($1,080/month) - -**Best for:** On-demand workloads, experimentation, cost-conscious hosting - ---- - -#### 2. **Lambda Labs** -**Pros:** -- Flat monthly pricing -- High-end GPUs (A100, H100) -- Jupyter notebooks included -- Fast network - -**Pricing:** -- 1x A100 (40GB): $1.10/hour ($792/month) -- 8x A100 (40GB): $8.00/hour (~$5,760/month) - -**Best for:** Research, high-utilization workloads - ---- - -#### 3. **Vast.ai** -**Pros:** -- Marketplace model (cheapest) -- Many GPU options -- Spot pricing available - -**Cons:** -- Variable reliability -- Setup complexity -- Community-hosted machines - -**Pricing:** -- RTX 4090: ~$0.25-0.40/hour -- A100: ~$0.80-1.20/hour - -**Best for:** Budget-conscious, experimental workloads - ---- - -#### 4. **Google Cloud Platform (GCP)** -**Pros:** -- Enterprise reliability -- Auto-scaling -- Integration with Google services -- Preemptible instances available - -**Pricing:** -- T4 (16GB): ~$0.35/hour -- V100 (16GB): ~$2.48/hour -- A100 (40GB): ~$2.93/hour -- TPU options available - -**Best for:** Enterprise workloads, auto-scaling needs - ---- - -#### 5. **AWS** -**Pros:** -- Global infrastructure -- Broad GPU selection -- Spot instances for cost savings -- Enterprise support - -**Pricing:** -- g4dn.xlarge (T4 16GB): ~$0.526/hour -- p3.2xlarge (V100 16GB): ~$3.06/hour -- p4d.24xlarge (8x A100 40GB): ~$32.77/hour - -**Best for:** Enterprise, existing AWS infrastructure - ---- - -#### 6. **Hugging Face Spaces / Inference Endpoints** -**Pros:** -- Managed model hosting -- Auto-scaling -- Simple deployment -- Community models - -**Pricing:** -- CPU: $0.03/hour -- T4: $0.60/hour -- A10G: $1.00/hour -- A100: $4.00/hour - -**Best for:** Quick model deployment, serverless inference - ---- - -### Recommendation: **RunPod** for Primary GPU Server - -**Rationale:** -1. **Cost-effective**: Pay-per-second billing, ~$0.50/hour for RTX 4090 -2. **Docker-native**: Easy integration with existing compose stack -3. **Flexibility**: Start/stop as needed, scale up for training -4. **Community**: Large user base, good documentation -5. 
**Network**: Built-in HTTP/SSH tunneling - -**Supplementary**: Use Hugging Face for specific model hosting if needed - ---- - -## Phase 3: Architecture Design - -### Network Topology - -``` -┌─────────────────────────────────────────────────────────────┐ -│ pivoine.art VPS (CPU-based) │ -├─────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ -│ │ Open │─────▶│ LiteLLM │◀────▶│ ai_ │ │ -│ │ WebUI │ │ Proxy │ │ postgres │ │ -│ └──────────┘ └──────────┘ └──────────┘ │ -│ │ │ │ -│ │ │ │ -└───────┼──────────────────┼──────────────────────────────────┘ - │ │ - │ ▼ - │ ┌─────────────────┐ - │ │ Anthropic API │ - │ │ (Claude) │ - │ └─────────────────┘ - │ - ▼ -┌────────────────────────────────────────────────────────────┐ -│ GPU Server (RunPod) │ -├────────────────────────────────────────────────────────────┤ -│ │ -│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ -│ │ vLLM │ │ ComfyUI │ │ Model │ │ JupyterLab│ │ -│ │ (LLMs) │ │ (SD/FLUX)│ │ Training │ │ │ │ -│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ -│ │ │ │ │ │ -│ └──────────────┴─────────────┴──────────────┘ │ -│ │ │ -│ ┌───────────────┐ │ -│ │ Model Storage │ │ -│ │ (Persistent) │ │ -│ └───────────────┘ │ -│ │ -└────────────────────────────────────────────────────────────┘ - │ - ▼ (Tunneled via WireGuard or Tailscale) -┌────────────────────────────────────────────────────────────┐ -│ Integration Options: │ -├────────────────────────────────────────────────────────────┤ -│ 1. LiteLLM adds vLLM endpoint (http://gpu.internal:8000) │ -│ 2. ComfyUI exposed via subdomain (comfy.ai.pivoine.art) │ -│ 3. Model storage synced via rclone/restic │ -└────────────────────────────────────────────────────────────┘ -``` - -### Connection Methods - -#### Option A: WireGuard VPN (RECOMMENDED) -- Create WireGuard tunnel between VPS and GPU server -- GPU services accessible via private IPs -- Secure, low overhead, easy to manage -- Already have wg-easy in your stack - -**Setup:** -1. Deploy WireGuard on GPU server -2. Add GPU server as VPN peer -3. Configure LiteLLM to use VPN IPs - -#### Option B: SSH Tunnel -- SSH reverse tunnel from GPU to VPS -- Simple, no additional software -- Higher latency - -#### Option C: Tailscale -- Zero-config VPN mesh -- Easy setup, good UX -- Proprietary (but free tier available) - ---- - -## Phase 4: Service Implementation Plans - -### 4.1 LLM Hosting with vLLM - -**vLLM** is the industry-standard for high-performance LLM inference. 
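-
-Because vLLM exposes an OpenAI-compatible API, existing OpenAI clients can point at it directly. A minimal sketch, assuming the `openai` Python package (v1+) and the VPN hostname `gpu.internal` used in the LiteLLM config later in this plan:
-
-```python
-from openai import OpenAI
-
-# vLLM speaks the OpenAI API, so only the base_url (and a dummy key) change
-client = OpenAI(base_url="http://gpu.internal:8000/v1", api_key="dummy")
-
-completion = client.chat.completions.create(
-    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-    messages=[{"role": "user", "content": "Summarize vLLM in one sentence."}],
-    max_tokens=64,
-)
-print(completion.choices[0].message.content)
-```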
- -#### Features: -- PagedAttention for efficient KV cache -- Continuous batching -- OpenAI-compatible API -- Tensor parallelism for multi-GPU -- Quantization support (AWQ, GPTQ) - -#### Docker Compose Configuration: - -```yaml -services: - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: 0 - volumes: - - vllm_models:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct # or any model - - --host - - 0.0.0.0 - - --port - - 8000 - - --tensor-parallel-size - - '1' - - --gpu-memory-utilization - - '0.9' - - --max-model-len - - '8192' - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Recommended Models for RTX 4090 (24GB): - -**Text Generation:** -- Llama 3.1 8B Instruct (8GB VRAM, fast) -- Qwen2.5 14B Instruct (14GB VRAM, multilingual) -- Mistral 7B Instruct v0.3 (7GB VRAM) -- Nous Hermes 2 Mixtral 8x7B (with quantization, 16GB) - -**Code:** -- DeepSeek Coder 6.7B (7GB VRAM) -- CodeLlama 13B (13GB VRAM) -- Qwen2.5-Coder 14B (14GB VRAM) - -#### Integration with LiteLLM: - -Add to `ai/litellm-config.yaml`: - -```yaml -model_list: - # Existing Anthropic - - model_name: claude-sonnet-4-5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - # New vLLM models - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://gpu.internal:8000/v1 - api_key: dummy - - - model_name: qwen-2.5-14b - litellm_params: - model: openai/Qwen/Qwen2.5-14B-Instruct - api_base: http://gpu.internal:8000/v1 - api_key: dummy -``` - ---- - -### 4.2 ComfyUI for Image/Video Generation - -**ComfyUI** is a node-based UI for Stable Diffusion with advanced workflows. - -#### Features: -- Node-based workflow editor -- Support for SD 1.5, SDXL, SD3, FLUX -- ControlNet, LoRA, embeddings -- Video generation (AnimateDiff, SVD) -- API for automation - -#### Docker Compose Configuration: - -```yaml -services: - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - # Custom nodes auto-install - COMFYUI_FLAGS: --listen 0.0.0.0 --port 8188 - volumes: - - comfyui_data:/data - - comfyui_models:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - ports: - - "8188:8188" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Model Downloads (via ComfyUI Manager): - -**Stable Diffusion Models:** -- FLUX.1-dev (12GB, newest, best quality) -- FLUX.1-schnell (12GB, fast) -- SDXL Base 1.0 (6.9GB) -- SD 1.5 (4GB, fast, wide LoRA support) - -**ControlNet Models:** -- controlnet-canny-sdxl -- controlnet-depth-sdxl -- controlnet-openpose-sdxl - -**LoRA Models** (download from Civitai): -- Style LoRAs (anime, realistic, etc.) 
-- Character LoRAs -- Concept LoRAs - -#### Traefik Integration: - -Add subdomain routing for ComfyUI: - -```yaml -labels: - - 'traefik.enable=true' - - 'traefik.http.routers.comfyui-web-secure.rule=Host(`comfy.ai.pivoine.art`)' - - 'traefik.http.routers.comfyui-web-secure.tls.certresolver=resolver' - - 'traefik.http.routers.comfyui-web-secure.entrypoints=web-secure' - - 'traefik.http.routers.comfyui-web-secure.middlewares=net-authelia,security-headers@file' - - 'traefik.http.services.comfyui.loadbalancer.server.port=8188' -``` - -#### Open WebUI Integration: - -ComfyUI has a REST API that can be called from Open WebUI using function calling. - -Example workflow API call: -```python -import requests - -def generate_image(prompt: str, negative_prompt: str = ""): - workflow = { - # ComfyUI workflow JSON - } - response = requests.post( - "http://comfyui:8188/prompt", - json={"prompt": workflow} - ) - return response.json() -``` - ---- - -### 4.3 Model Training Infrastructure - -For fine-tuning LLMs and training custom models. - -#### Option A: Axolotl (Recommended) - -**Axolotl** is a user-friendly fine-tuning framework supporting: -- LoRA, QLoRA -- Full fine-tuning -- RLHF/DPO -- Multi-GPU training - -```yaml -services: - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - WANDB_API_KEY: ${WANDB_API_KEY:-} # Optional: Weights & Biases logging - command: | - bash -c " - accelerate launch -m axolotl.cli.train /workspace/configs/config.yaml - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - -#### Training Workflow: -1. Prepare dataset (JSONL format) -2. Create Axolotl config (LoRA, batch size, epochs) -3. Start training container -4. Monitor via Weights & Biases or TensorBoard -5. Export LoRA adapters -6. 
Merge with base model or use in vLLM - -#### Example Config: -```yaml -# training/configs/lora-llama3.yaml -base_model: meta-llama/Meta-Llama-3.1-8B-Instruct -model_type: AutoModelForCausalLM -tokenizer_type: AutoTokenizer - -load_in_8bit: false -load_in_4bit: true -strict: false - -datasets: - - path: /workspace/data/train.jsonl - type: completion - field: text - -output_dir: /workspace/output/llama3-lora - -adapter: lora -lora_r: 16 -lora_alpha: 32 -lora_dropout: 0.05 -lora_target_modules: - - q_proj - - v_proj - - k_proj - - o_proj - -gradient_accumulation_steps: 4 -micro_batch_size: 2 -num_epochs: 3 -learning_rate: 0.0002 - -optimizer: adamw_bnb_8bit -lr_scheduler: cosine -warmup_steps: 100 -``` - -#### Option B: JupyterLab for Custom Training - -For research and custom training scripts: - -```yaml -services: - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - runtime: nvidia - volumes: - - ./notebooks:/workspace - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - command: | - bash -c " - pip install jupyterlab transformers datasets accelerate bitsandbytes peft && - jupyter lab --ip=0.0.0.0 --allow-root --no-browser --NotebookApp.token='' - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] -``` - ---- - -### 4.4 Model Storage Strategy - -#### Storage Requirements: - -**Per Model Type:** -- LLM 7B: ~14GB (FP16) -- LLM 13B: ~26GB -- SDXL: ~7GB -- FLUX: ~12GB -- ControlNet: ~2.5GB each -- LoRA: ~100-500MB each - -**Total Estimated:** -- 3-4 LLMs: ~80GB -- SD models + LoRAs: ~50GB -- Training checkpoints: ~100GB -- **Total: 250-300GB minimum** - -#### RunPod Storage Options: - -1. **Network Volume** (Recommended) - - Persistent across pod restarts - - Shared between multiple pods - - ~$0.10/GB/month - - 500GB = $50/month - -2. **Container Disk** - - Included with pod - - Lost when pod stops - - Good for temporary storage - -3. 
**External Storage (rclone)** - - Sync to/from VPS or cloud storage - - Backup models to Backblaze B2 or Wasabi - - Good for disaster recovery - -#### Model Management: - -Use **Hugging Face Hub** as model cache: - -```bash -# Download models on first run -huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct \ - --local-dir /models/llama-3.1-8b - -# Or let vLLM/ComfyUI auto-download -``` - -**Model Sync Script:** -```bash -#!/bin/bash -# sync-models.sh - Sync models from VPS to GPU server - -rclone sync \ - /mnt/hidrive/AI/models \ - gpu:/workspace/models \ - --progress \ - --transfers 4 -``` - ---- - -## Phase 5: Implementation Roadmap - -### Week 1: Infrastructure Setup - -**Day 1-2: RunPod Account & GPU Server** -- [ ] Create RunPod account -- [ ] Deploy RTX 4090 pod with Ubuntu 22.04 + PyTorch template -- [ ] Configure persistent network volume (500GB) -- [ ] Set up SSH access - -**Day 3-4: Network Configuration** -- [ ] Deploy WireGuard on GPU server -- [ ] Add GPU server as peer to existing VPN (vpn/compose.yaml) -- [ ] Test connectivity between VPS and GPU server -- [ ] Configure firewall rules - -**Day 5: Docker Setup on GPU Server** -- [ ] Install Docker + NVIDIA Container Toolkit -- [ ] Create docker-compose.yaml for GPU services -- [ ] Test GPU access in containers - ---- - -### Week 2: LLM Hosting - -**Day 1-2: vLLM Deployment** -- [ ] Deploy vLLM container -- [ ] Download Llama 3.1 8B Instruct -- [ ] Test inference locally -- [ ] Benchmark performance (tokens/sec) - -**Day 3-4: LiteLLM Integration** -- [ ] Update litellm-config.yaml with vLLM endpoint -- [ ] Test via Open WebUI -- [ ] Configure model routing (cheap models → vLLM, complex → Claude) -- [ ] Set up usage monitoring - -**Day 5: Model Expansion** -- [ ] Download Qwen 2.5 14B -- [ ] Download Mistral 7B Instruct -- [ ] Test model switching in Open WebUI -- [ ] Document performance characteristics - ---- - -### Week 3: Image Generation - -**Day 1-2: ComfyUI Setup** -- [ ] Deploy ComfyUI container -- [ ] Download FLUX.1-schnell -- [ ] Download SDXL -- [ ] Install ComfyUI Manager - -**Day 3-4: Model Downloads** -- [ ] Download ControlNet models -- [ ] Download VAE models -- [ ] Download popular LoRAs from Civitai -- [ ] Organize model directory - -**Day 5: Integration & Workflows** -- [ ] Create basic text-to-image workflow -- [ ] Create ControlNet workflow -- [ ] Test API access -- [ ] Add Traefik subdomain (comfy.ai.pivoine.art) - ---- - -### Week 4: Training Infrastructure - -**Day 1-2: Axolotl Setup** -- [ ] Deploy Axolotl container -- [ ] Create sample dataset -- [ ] Test LoRA fine-tuning with tiny model -- [ ] Verify GPU utilization - -**Day 3-4: JupyterLab Setup** -- [ ] Deploy JupyterLab container -- [ ] Install ML libraries -- [ ] Create example notebooks -- [ ] Test custom training scripts - -**Day 5: Documentation & Testing** -- [ ] Write training guides -- [ ] Test end-to-end workflows -- [ ] Benchmark training speeds -- [ ] Document best practices - ---- - -### Ongoing: Optimization & Expansion - -**Month 2:** -- Monitor costs and optimize GPU utilization -- Implement model caching strategies -- Add more models based on usage patterns -- Set up automated model updates -- Implement usage quotas per user - -**Month 3+:** -- Consider multi-GPU setup for larger models -- Implement model quantization (AWQ/GPTQ) -- Add video generation (AnimateDiff, CogVideo) -- Explore voice synthesis (XTTS, Bark) -- Custom model training for specific use cases - ---- - -## Phase 6: Cost Analysis - -### Scenario A: 
Single RTX 4090 (24/7) - -**GPU Server (RunPod):** -- RTX 4090 pod: $0.50/hour × 720 hours = $360/month -- 500GB network volume: $50/month -- **Subtotal: $410/month** - -**VPS (Existing):** -- No change in cost - -**Total: ~$410/month** - -**Savings:** -- Claude API costs reduced by ~80% (self-hosted for routine tasks) -- Break-even if currently spending >$500/month on API calls - ---- - -### Scenario B: Pay-as-you-go (8 hours/day) - -**GPU Server (RunPod):** -- RTX 4090: $0.50/hour × 8 hours × 30 days = $120/month -- Storage: $50/month -- **Subtotal: $170/month** - -**Best for:** -- Development/experimentation -- Burst workloads -- Image generation on-demand - ---- - -### Scenario C: Dual GPU (Training + Inference) - -**GPU Server 1 (Inference):** -- RTX 4090 24/7: $360/month - -**GPU Server 2 (Training - On-demand):** -- A100 40GB: $1.50/hour × 40 hours/month = $60/month -- Used only for fine-tuning sessions - -**Storage:** -- 1TB network volume: $100/month - -**Total: ~$520/month** - ---- - -### Cost Optimization Tips - -1. **Auto-stop idle pods**: RunPod can auto-stop after X minutes idle -2. **Use spot instances**: ~50% cheaper but can be interrupted -3. **Quantized models**: 4-bit models use 4x less VRAM → cheaper GPUs -4. **Batch processing**: Queue image gen jobs to maximize GPU usage -5. **Model sharing**: One vLLM instance can serve multiple models via adapters -6. **Monitoring**: Track per-model costs to optimize routing - ---- - -## Phase 7: Monitoring & Operations - -### Metrics to Track - -**GPU Utilization:** -- nvidia-smi metrics (utilization %, memory usage) -- Temperature and power draw -- Per-process GPU usage - -**Model Performance:** -- Tokens per second (LLM inference) -- Images per second (SD/FLUX) -- Training time per epoch - -**Costs:** -- GPU hours consumed -- Storage usage -- API vs self-hosted breakdown - -### Monitoring Stack - -**Option A: Netdata (Already deployed)** - -Add GPU monitoring to existing Netdata: - -```yaml -# On GPU server -services: - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - command: | - bash -c " - # Enable nvidia_smi plugin - /usr/libexec/netdata/plugins.d/charts.d.plugin nvidia_smi - " -``` - -**Option B: Prometheus + Grafana** - -For detailed metrics: - -```yaml -services: - prometheus: - image: prom/prometheus:latest - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - - prometheus_data:/prometheus - command: - - '--config.file=/etc/prometheus/prometheus.yml' - - dcgm-exporter: - image: nvidia/dcgm-exporter:latest - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - - grafana: - image: grafana/grafana:latest - ports: - - "3000:3000" - volumes: - - grafana_data:/var/lib/grafana -``` - -Import Grafana dashboard #12219 for GPU metrics. - ---- - -## Phase 8: Backup & Disaster Recovery - -### What to Backup - -1. **Models** (250-300GB) - - Base models can be re-downloaded - - Custom fine-tuned models: CRITICAL - - LoRAs: CRITICAL - -2. **Training Data** (~10-50GB) - - Datasets - - Preprocessing scripts - -3. 
**Configurations** (<1GB) - - Docker compose files - - Training configs - - Workflow JSONs - -### Backup Strategy - -**Tier 1: Critical (Daily)** -- Fine-tuned models -- Training checkpoints -- Custom datasets - -**Backup to:** -- Restic → HiDrive (already configured) -- Backblaze B2 (~$6/TB/month) - -```bash -# Add to core/compose.yaml backrest config -- gpu_models:/volumes/gpu_models:ro -- gpu_checkpoints:/volumes/gpu_checkpoints:ro -``` - -**Tier 2: Nice-to-have (Weekly)** -- Base models (can re-download) -- ComfyUI outputs - -**Tier 3: Ephemeral (No backup)** -- Inference cache -- Temporary generations - ---- - -## Phase 9: Security Considerations - -### GPU Server Security - -1. **Firewall:** - - Only allow WireGuard port (51820) - - All services accessed via VPN - - No public exposure - -2. **SSH:** - - Key-based auth only - - Disable password auth - - Change default port - -3. **Docker:** - - Rootless Docker (optional but recommended) - - Limited container capabilities - - No privileged containers except for nvidia-runtime - -4. **Secrets:** - - Store API keys in .env - - Use Docker secrets for sensitive data - - Rotate keys periodically - -### Access Control - -- **ComfyUI**: Protected by Authelia SSO (already configured) -- **vLLM**: Internal only, accessed via LiteLLM proxy -- **JupyterLab**: Password-protected or Authelia -- **Training**: No public access, VPN only - ---- - -## Phase 10: Advanced Features (Future) - -### Multi-GPU Scaling - -**Tensor Parallelism** (vLLM): -- Split large models across multiple GPUs -- Example: 70B model on 2x A100s - -```yaml -command: - - --model - - meta-llama/Meta-Llama-3.1-70B-Instruct - - --tensor-parallel-size - - '2' # Use 2 GPUs -``` - -**Pipeline Parallelism** (training): -- Split model layers across GPUs -- Useful for very large models - -### Model Serving Optimization - -**vLLM Features:** -- Speculative decoding (faster generation) -- Prefix caching (faster for repeated prompts) -- Multi-LoRA serving (multiple adapters, one base model) - -**Example multi-LoRA:** -```yaml -command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct - - --enable-lora - - --max-loras - - '4' - - --lora-modules - - customer-support=/models/loras/support-lora - - creative-writing=/models/loras/writing-lora -``` - -### Video Generation - -**AnimateDiff in ComfyUI:** -- Generate short videos from text prompts -- Animate static images -- ~8GB VRAM for 512x512 16-frame videos - -**CogVideo:** -- High-quality video generation -- Requires A100 or H100 -- 5-second clips at 720p - -### Voice Synthesis - -**XTTS v2:** -- High-quality voice cloning -- Multi-language support -- ~6GB VRAM - -**Bark:** -- Text-to-speech with emotions -- Sound effects -- ~10GB VRAM - ---- - -## Appendix A: Quick Start Commands - -### Initial GPU Server Setup - -```bash -# SSH into RunPod instance -ssh root@gpu.runpod.io -p 12345 - -# Install Docker -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# Install NVIDIA Container Toolkit -distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg -curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - tee /etc/apt/sources.list.d/nvidia-container-toolkit.list -apt-get update -apt-get install -y nvidia-container-toolkit -systemctl restart docker - -# Test GPU access -docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -``` - -### Deploy vLLM (Quick Test) - -```bash -# Create directory -mkdir -p /workspace/vllm -cd /workspace/vllm - -# Run vLLM -docker run -d \ - --name vllm \ - --runtime=nvidia \ - --gpus all \ - -v ~/.cache/huggingface:/root/.cache/huggingface \ - -p 8000:8000 \ - vllm/vllm-openai:latest \ - --model meta-llama/Meta-Llama-3.1-8B-Instruct \ - --dtype auto \ - --max-model-len 8192 - -# Test inference -curl http://localhost:8000/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": "Once upon a time", - "max_tokens": 50 - }' -``` - -### Deploy ComfyUI (Quick Test) - -```bash -docker run -d \ - --name comfyui \ - --runtime=nvidia \ - --gpus all \ - -v /workspace/comfyui:/data \ - -p 8188:8188 \ - ghcr.io/ai-dock/comfyui:latest - -# Access at http://gpu-ip:8188 -``` - ---- - -## Appendix B: Sample Docker Compose (Full GPU Stack) - -```yaml -# gpu-server/compose.yaml -version: '3.8' - -services: - # vLLM for LLM inference - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: 0 - volumes: - - vllm_models:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct - - --host - - 0.0.0.0 - - --port - - 8000 - - --gpu-memory-utilization - - '0.9' - ports: - - "8000:8000" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # ComfyUI for image generation - comfyui: - image: ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - comfyui_data:/data - - comfyui_models:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - ports: - - "8188:8188" - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # Axolotl for model training - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - # Only start when training - profiles: - - training - - # JupyterLab for research - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - restart: unless-stopped - runtime: nvidia - volumes: - - ./notebooks:/workspace - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - command: | - bash -c " - pip install jupyterlab transformers datasets accelerate bitsandbytes peft && - 
jupyter lab --ip=0.0.0.0 --allow-root --no-browser - " - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - - # Netdata monitoring - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - ports: - - "19999:19999" - cap_add: - - SYS_PTRACE - - SYS_ADMIN - security_opt: - - apparmor:unconfined - -volumes: - vllm_models: - comfyui_data: - comfyui_models: - comfyui_output: - training_cache: - jupyter_cache: -``` - ---- - -## Appendix C: Cost Calculator - -**Monthly GPU Costs:** - -| GPU Model | VRAM | $/hour | 24/7 Month | 8hr/day | Use Case | -|-----------|------|--------|------------|---------|----------| -| RTX 3090 | 24GB | $0.35 | $252 | $84 | Development, small models | -| RTX 4090 | 24GB | $0.50 | $360 | $120 | Production inference, SD | -| A6000 | 48GB | $0.80 | $576 | $192 | Large models, training | -| A100 40GB | 40GB | $1.50 | $1,080 | $360 | Enterprise, training | -| A100 80GB | 80GB | $2.50 | $1,800 | $600 | Massive models, research | - -**Storage Costs:** -- Network volume: $0.10/GB/month -- 500GB = $50/month -- 1TB = $100/month - -**Total Estimated Monthly:** -- RTX 4090 + 500GB storage = $410/month (24/7) -- RTX 4090 + 500GB storage = $170/month (8hr/day) - -**Break-even Analysis:** -- If spending >$500/month on API calls → GPU server saves money -- If spending <$200/month → stick with APIs - ---- - -## Appendix D: Model Recommendations by Use Case - -### General Chat (24/7 Inference) -**Best:** Qwen 2.5 14B Instruct -- Excellent multilingual support -- Fast inference -- Good reasoning - -**Alternative:** Mistral 7B Instruct v0.3 -- Fastest inference -- Lower VRAM - -### Code Generation -**Best:** Qwen 2.5 Coder 14B -- SOTA coding performance -- Multi-language support - -**Alternative:** DeepSeek Coder 6.7B -- Faster, lighter - -### Creative Writing -**Best:** Nous Hermes 2 Mixtral 8x7B (quantized) -- Creative, engaging -- Follows instructions well - -### Image Generation (Realistic) -**Best:** FLUX.1-dev -- Highest quality -- Best prompt following - -**Alternative:** SDXL + RealVisXL LoRA -- Faster generation -- Good quality - -### Image Generation (Anime) -**Best:** SDXL + AnimagineXL LoRA -- Anime-specific training -- Vibrant colors - -### Video Generation -**Best:** AnimateDiff + SDXL -- 16-frame clips -- Good quality - -**Needs:** A100 40GB or better - ---- - -## Next Steps - -1. **Review this plan** and provide feedback -2. **Set budget** for GPU infrastructure -3. **Choose provider** (recommend RunPod) -4. **Define priority services** (LLM hosting first? Image gen first?) -5. **Schedule implementation** (4-week timeline above) - -Would you like me to: -- Create the detailed Docker Compose configurations? -- Set up a cost estimation spreadsheet? -- Research specific models for your use cases? -- Begin implementation with Phase 1? - -Let me know how you'd like to proceed! 🚀 diff --git a/README.md b/README.md index b1487cc..5f98f3d 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ For first-time setup on a new RunPod instance: 2. SSH to GPU server: `ssh gpu-server` 3. Run preparation script: `cd /workspace/ai && chmod +x scripts/prepare-template.sh && ./scripts/prepare-template.sh` -**See**: [DEPLOYMENT.md](DEPLOYMENT.md) for detailed deployment guide. 
+**See**: [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for detailed deployment guide. ## Architecture @@ -64,16 +64,9 @@ All requests route through the orchestrator, which automatically loads the appro ## Documentation -### Primary Docs -- **[DEPLOYMENT.md](DEPLOYMENT.md)** - Complete deployment and usage guide -- **[RUNPOD_TEMPLATE.md](RUNPOD_TEMPLATE.md)** - Template creation and usage -- **[GPU_DEPLOYMENT_LOG.md](GPU_DEPLOYMENT_LOG.md)** - Deployment history and technical notes - -### Setup Guides (Historical) -- `DOCKER_GPU_SETUP.md` - Docker configuration for GPU support -- `TAILSCALE_SETUP.md` - Tailscale VPN setup -- `WIREGUARD_SETUP.md` - WireGuard VPN (deprecated, use Tailscale) -- `SETUP_GUIDE.md` - General setup instructions +- **[docs/DEPLOYMENT.md](docs/DEPLOYMENT.md)** - Complete deployment and usage guide +- **[docs/RUNPOD_TEMPLATE.md](docs/RUNPOD_TEMPLATE.md)** - Template creation and usage +- **[docs/GPU_DEPLOYMENT_LOG.md](docs/GPU_DEPLOYMENT_LOG.md)** - Deployment history and technical notes ### Architecture Components - `model-orchestrator/` - FastAPI orchestrator managing model lifecycle @@ -96,7 +89,7 @@ All requests route through the orchestrator, which automatically loads the appro 3. Save pod as template in RunPod dashboard 4. Name: `multi-modal-ai-v1.0` -**See**: [RUNPOD_TEMPLATE.md](RUNPOD_TEMPLATE.md) for step-by-step guide. +**See**: [docs/RUNPOD_TEMPLATE.md](docs/RUNPOD_TEMPLATE.md) for step-by-step guide. ## Adding New Models @@ -116,7 +109,7 @@ models: Then add the Docker service to `docker-compose.gpu.yaml` and restart the orchestrator. -**See**: [DEPLOYMENT.md](DEPLOYMENT.md#adding-new-models) for complete instructions. +**See**: [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md#adding-new-models) for complete instructions. ## Usage Examples @@ -166,8 +159,8 @@ nvidia-smi For issues: 1. Check orchestrator logs: `docker logs ai_orchestrator` -2. Review [DEPLOYMENT.md](DEPLOYMENT.md#troubleshooting) -3. Check [GPU_DEPLOYMENT_LOG.md](GPU_DEPLOYMENT_LOG.md) for deployment history +2. Review [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md#troubleshooting) +3. Check [docs/GPU_DEPLOYMENT_LOG.md](docs/GPU_DEPLOYMENT_LOG.md) for deployment history ## License diff --git a/README_GPU_SETUP.md b/README_GPU_SETUP.md deleted file mode 100644 index 34974f0..0000000 --- a/README_GPU_SETUP.md +++ /dev/null @@ -1,444 +0,0 @@ -# GPU-Enhanced AI Stack - Implementation Guide - -Welcome to your GPU expansion setup! This directory contains everything you need to deploy a production-ready GPU server for LLM hosting, image generation, and model training. - -## 📚 Documentation Files - -### Planning & Architecture -- **`GPU_EXPANSION_PLAN.md`** - Complete 70-page plan with provider comparison, architecture, and roadmap -- **`README_GPU_SETUP.md`** - This file - -### Step-by-Step Setup Guides -1. **`SETUP_GUIDE.md`** - Day 1-2: RunPod account & GPU server deployment -2. **`WIREGUARD_SETUP.md`** - Day 3-4: VPN connection between VPS and GPU server -3. 
**`DOCKER_GPU_SETUP.md`** - Day 5: Docker + NVIDIA Container Toolkit configuration - -### Configuration Files -- **`gpu-server-compose.yaml`** - Production Docker Compose for GPU server -- **`litellm-config-gpu.yaml`** - Updated LiteLLM config with self-hosted models -- **`deploy-gpu-stack.sh`** - Automated deployment script - ---- - -## 🚀 Quick Start (Week 1 Checklist) - -### Day 1-2: RunPod & GPU Server ✓ -- [ ] Create RunPod account at https://www.runpod.io/ -- [ ] Add billing method ($50 initial credit recommended) -- [ ] Deploy RTX 4090 pod with PyTorch template -- [ ] Configure 500GB network volume -- [ ] Verify SSH access -- [ ] Test GPU with `nvidia-smi` -- [ ] **Guide:** `SETUP_GUIDE.md` - -### Day 3-4: Network Configuration ✓ -- [ ] Install Tailscale on VPS -- [ ] Install Tailscale on GPU server -- [ ] Authenticate both devices -- [ ] Test VPN connectivity -- [ ] Configure firewall rules -- [ ] Verify VPS can reach GPU server -- [ ] **Guide:** `TAILSCALE_SETUP.md` - -### Day 5: Docker & GPU Setup ✓ -- [ ] Install Docker on GPU server -- [ ] Install NVIDIA Container Toolkit -- [ ] Test GPU access in containers -- [ ] Create /workspace/gpu-stack directory -- [ ] Copy configuration files -- [ ] **Guide:** `DOCKER_GPU_SETUP.md` - -### Day 6-7: Deploy Services ✓ -- [ ] Copy `gpu-server-compose.yaml` to GPU server -- [ ] Edit `.env` with your settings -- [ ] Run `./deploy-gpu-stack.sh` -- [ ] Wait for vLLM to load model (~5 minutes) -- [ ] Test vLLM: `curl http://localhost:8000/v1/models` -- [ ] Access ComfyUI: `http://[tailscale-ip]:8188` -- [ ] **Script:** `deploy-gpu-stack.sh` - ---- - -## 📦 Services Included - -### vLLM (http://[tailscale-ip]:8000) -**Purpose:** High-performance LLM inference -**Default Model:** Llama 3.1 8B Instruct -**Performance:** 50-80 tokens/second on RTX 4090 -**Use for:** General chat, Q&A, code generation, summarization - -**Switch models:** -Edit `gpu-server-compose.yaml`, change `--model` parameter, restart: -```bash -docker compose restart vllm -``` - -### ComfyUI (http://[tailscale-ip]:8188) -**Purpose:** Advanced Stable Diffusion interface -**Features:** FLUX, SDXL, ControlNet, LoRA -**Use for:** Image generation, img2img, inpainting - -**Download models:** -Access web UI → ComfyUI Manager → Install Models - -### JupyterLab (http://[tailscale-ip]:8888) -**Purpose:** Interactive development environment -**Token:** `pivoine-ai-2025` (change in `.env`) -**Use for:** Research, experimentation, custom training scripts - -### Axolotl (Training - on-demand) -**Purpose:** LLM fine-tuning framework -**Start:** `docker compose --profile training up -d axolotl` -**Use for:** LoRA training, full fine-tuning, RLHF - -### Netdata (http://[tailscale-ip]:19999) -**Purpose:** System & GPU monitoring -**Features:** Real-time metrics, GPU utilization, memory usage -**Use for:** Performance monitoring, troubleshooting - ---- - -## 🔧 Configuration - -### Environment Variables (.env) - -```bash -# VPN Network (Tailscale) -VPS_IP=100.x.x.x # Your VPS Tailscale IP (get with: tailscale ip -4) -GPU_IP=100.x.x.x # GPU server Tailscale IP (get with: tailscale ip -4) - -# Model Storage -MODELS_PATH=/workspace/models - -# Hugging Face Token (for gated models like Llama) -HF_TOKEN=hf_xxxxxxxxxxxxx - -# Weights & Biases (for training logging) -WANDB_API_KEY= - -# JupyterLab Access -JUPYTER_TOKEN=pivoine-ai-2025 - -# PostgreSQL (on VPS) -DB_HOST=100.x.x.x # Your VPS Tailscale IP -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui -``` - -### Updating LiteLLM on 
VPS - -After GPU server is running, update your VPS LiteLLM config: - -```bash -# On VPS -cd ~/Projects/docker-compose/ai - -# Backup current config -cp litellm-config.yaml litellm-config.yaml.backup - -# Copy new config with GPU models -cp litellm-config-gpu.yaml litellm-config.yaml - -# Restart LiteLLM -arty restart litellm -``` - -Now Open WebUI will have access to both Claude (API) and Llama (self-hosted)! - ---- - -## 💰 Cost Management - -### Current Costs (24/7 Operation) -- **GPU Server:** RTX 4090 @ $0.50/hour = $360/month -- **Storage:** 500GB network volume = $50/month -- **Total:** **$410/month** - -### Cost-Saving Options - -**1. Pay-as-you-go (8 hours/day)** -- GPU: $0.50 × 8 × 30 = $120/month -- Storage: $50/month -- **Total: $170/month** - -**2. Auto-stop idle pods** -RunPod can auto-stop after X minutes idle: -- Dashboard → Pod Settings → Auto-stop after 30 minutes - -**3. Use smaller models** -- Mistral 7B instead of Llama 8B: Faster, cheaper GPU -- Quantized models: 4-bit = 1/4 the VRAM - -**4. Batch image generation** -- Generate multiple images at once -- Use scheduled jobs (cron) during off-peak hours - -### Cost Tracking - -**Check GPU usage:** -```bash -# On RunPod dashboard -Billing → Usage History - -# See hourly costs, total spent -``` - -**Check API vs GPU savings:** -```bash -# On VPS, check LiteLLM logs -docker logs ai_litellm | grep "model=" - -# Count requests to llama-3.1-8b vs claude-* -``` - -**Expected savings:** -- 80% of requests → self-hosted = $0 cost -- 20% of requests → Claude = API cost -- Break-even if currently spending >$500/month on APIs - ---- - -## 🔍 Monitoring & Troubleshooting - -### Check Service Status - -```bash -# On GPU server -cd /workspace/gpu-stack - -# View all services -docker compose ps - -# Check specific service logs -docker compose logs -f vllm -docker compose logs -f comfyui -docker compose logs -f jupyter - -# Check GPU usage -nvidia-smi -# or prettier: -nvtop -``` - -### Common Issues - -**vLLM not loading model:** -```bash -# Check logs -docker compose logs vllm - -# Common causes: -# - Model download in progress (wait 5-10 minutes) -# - Out of VRAM (try smaller model) -# - Missing HF_TOKEN (for gated models like Llama) -``` - -**ComfyUI slow/crashing:** -```bash -# Check GPU memory -nvidia-smi - -# If VRAM full: -# - Close vLLM temporarily -# - Use smaller models -# - Reduce batch size in ComfyUI -``` - -**Can't access from VPS:** -```bash -# Test VPN -ping [tailscale-ip] - -# If fails: -# - Check Tailscale status: tailscale status -# - Restart Tailscale: tailscale down && tailscale up -# - Check firewall: ufw status -``` - -**Docker can't see GPU:** -```bash -# Test GPU access -docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base nvidia-smi - -# If fails: -# - Check NVIDIA driver: nvidia-smi -# - Check nvidia-docker: nvidia-ctk --version -# - Restart Docker: systemctl restart docker -``` - ---- - -## 📊 Performance Benchmarks - -### Expected Performance (RTX 4090) - -**LLM Inference (vLLM):** -- Llama 3.1 8B: 50-80 tokens/second -- Qwen 2.5 14B: 30-50 tokens/second -- Batch size 32: ~1500 tokens/second - -**Image Generation (ComfyUI):** -- SDXL (1024×1024): ~4-6 seconds -- FLUX (1024×1024): ~8-12 seconds -- SD 1.5 (512×512): ~1-2 seconds - -**Training (Axolotl):** -- LoRA fine-tuning (8B model): ~3-5 hours for 3 epochs -- Full fine-tuning: Not recommended on 24GB VRAM - ---- - -## 🔐 Security Best Practices - -### Network Security -✅ All services behind Tailscale VPN (end-to-end encrypted) -✅ No public 
exposure (except RunPod's SSH) -✅ Firewall configured (no additional ports needed) - -### Access Control -✅ JupyterLab password-protected -✅ ComfyUI accessible via VPN only -✅ vLLM internal API (no auth needed) - -### SSH Security -```bash -# On GPU server, harden SSH -nano /etc/ssh/sshd_config - -# Set: -PermitRootLogin prohibit-password -PasswordAuthentication no -PubkeyAuthentication yes - -systemctl restart sshd -``` - -### Regular Updates -```bash -# Weekly updates -apt update && apt upgrade -y - -# Update Docker images -docker compose pull -docker compose up -d -``` - ---- - -## 📈 Scaling Up - -### When to Add More GPUs - -**Current limitations (1× RTX 4090):** -- Can run ONE of these at a time: - - 8B LLM at full speed - - 14B LLM at moderate speed - - SDXL image generation - - Training job - -**Add 2nd GPU if:** -- You want LLM + image gen simultaneously -- Training + inference at same time -- Multiple users with high demand - -**Multi-GPU options:** -- 2× RTX 4090: Run vLLM + ComfyUI separately ($720/month) -- 1× A100 40GB: Larger models (70B with quantization) ($1,080/month) -- Mix: RTX 4090 (inference) + A100 (training) (~$1,300/month) - -### Deploying Larger Models - -**70B models (need 2× A100 or 4× RTX 4090):** -```yaml -# In gpu-server-compose.yaml -vllm: - command: - - --model - - meta-llama/Meta-Llama-3.1-70B-Instruct - - --tensor-parallel-size - - "2" # Split across 2 GPUs - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 2 # Use 2 GPUs - capabilities: [gpu] -``` - ---- - -## 🎯 Next Steps (Week 2+) - -### Week 2: LLM Production Deployment -- [ ] Test Llama 3.1 8B performance -- [ ] Download additional models (Qwen, Mistral) -- [ ] Configure model routing in LiteLLM -- [ ] Set up usage monitoring -- [ ] Benchmark tokens/second for each model - -### Week 3: Image Generation -- [ ] Download FLUX and SDXL models -- [ ] Install ComfyUI Manager -- [ ] Download ControlNet models -- [ ] Create sample workflows -- [ ] Test API integration with Open WebUI - -### Week 4: Training Infrastructure -- [ ] Prepare a sample dataset -- [ ] Test LoRA fine-tuning with Axolotl -- [ ] Set up Weights & Biases logging -- [ ] Create training documentation -- [ ] Benchmark training speed - ---- - -## 🆘 Getting Help - -### Resources -- **RunPod Docs:** https://docs.runpod.io/ -- **vLLM Docs:** https://docs.vllm.ai/ -- **ComfyUI Wiki:** https://github.com/comfyanonymous/ComfyUI/wiki -- **Axolotl Docs:** https://github.com/OpenAccess-AI-Collective/axolotl - -### Community -- **RunPod Discord:** https://discord.gg/runpod -- **vLLM Discord:** https://discord.gg/vllm -- **r/LocalLLaMA:** https://reddit.com/r/LocalLLaMA - -### Support -If you encounter issues: -1. Check logs: `docker compose logs -f [service]` -2. Check GPU: `nvidia-smi` -3. Check VPN: `wg show` -4. Restart service: `docker compose restart [service]` -5. Full restart: `docker compose down && docker compose up -d` - ---- - -## ✅ Success Criteria - -You're ready to proceed when: -- [ ] GPU server responds to `ping [tailscale-ip]` from VPS -- [ ] vLLM returns models: `curl http://[tailscale-ip]:8000/v1/models` -- [ ] ComfyUI web interface loads: `http://[tailscale-ip]:8188` -- [ ] JupyterLab accessible with token -- [ ] Netdata shows GPU metrics -- [ ] Open WebUI shows both Claude and Llama models - -**Total setup time:** 4-6 hours (if following guides sequentially) - ---- - -## 🎉 You're All Set! - -Your GPU-enhanced AI stack is ready. 
You now have: -- ✅ Self-hosted LLM inference (saves $$$) -- ✅ Advanced image generation (FLUX, SDXL) -- ✅ Model training capabilities (LoRA, fine-tuning) -- ✅ Secure VPN connection -- ✅ Full monitoring and logging - -Enjoy building with your new AI infrastructure! 🚀 diff --git a/SETUP_GUIDE.md b/SETUP_GUIDE.md deleted file mode 100644 index 1d14145..0000000 --- a/SETUP_GUIDE.md +++ /dev/null @@ -1,261 +0,0 @@ -# GPU Server Setup Guide - Week 1 - -## Day 1-2: RunPod Account & GPU Server - -### Step 1: Create RunPod Account - -1. **Go to RunPod**: https://www.runpod.io/ -2. **Sign up** with email or GitHub -3. **Add billing method**: - - Credit card required - - No charges until you deploy a pod - - Recommended: Add $50 initial credit - -4. **Verify email** and complete account setup - -### Step 2: Deploy Your First GPU Pod - -#### 2.1 Navigate to Pods - -1. Click **"Deploy"** in top menu -2. Select **"GPU Pods"** - -#### 2.2 Choose GPU Type - -**Recommended: RTX 4090** -- 24GB VRAM -- ~$0.50/hour -- Perfect for LLMs up to 14B params -- Great for SDXL/FLUX - -**Filter options:** -- GPU Type: RTX 4090 -- GPU Count: 1 -- Sort by: Price (lowest first) -- Region: Europe (lower latency to Germany) - -#### 2.3 Select Template - -Choose: **"RunPod PyTorch"** template -- Includes: CUDA, PyTorch, Python -- Pre-configured for GPU workloads -- Docker pre-installed - -**Alternative**: "Ubuntu 22.04 with CUDA 12.1" (more control) - -#### 2.4 Configure Pod - -**Container Settings:** -- **Container Disk**: 50GB (temporary, auto-included) -- **Expose Ports**: - - Add: 22 (SSH) - - Add: 8000 (vLLM) - - Add: 8188 (ComfyUI) - - Add: 8888 (JupyterLab) - -**Volume Settings:** -- Click **"+ Network Volume"** -- **Name**: `gpu-models-storage` -- **Size**: 500GB -- **Region**: Same as pod -- **Cost**: ~$50/month - -**Environment Variables:** -- Add later (not needed for initial setup) - -#### 2.5 Deploy Pod - -1. Review configuration -2. Click **"Deploy On-Demand"** (not Spot for reliability) -3. Wait 2-3 minutes for deployment - -**Expected cost:** -- GPU: $0.50/hour = $360/month (24/7) -- Storage: $50/month -- **Total: $410/month** - -### Step 3: Access Your GPU Server - -#### 3.1 Get Connection Info - -Once deployed, you'll see: -- **Pod ID**: e.g., `abc123def456` -- **SSH Command**: `ssh root@.runpod.io -p 12345` -- **Public IP**: May not be directly accessible (use SSH) - -#### 3.2 SSH Access - -RunPod automatically generates SSH keys for you: - -```bash -# Copy the SSH command from RunPod dashboard -ssh root@abc123def456.runpod.io -p 12345 - -# First time: Accept fingerprint -# You should now be in the GPU server! -``` - -**Verify GPU:** -```bash -nvidia-smi -``` - -Expected output: -``` -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 535.xx Driver Version: 535.xx CUDA Version: 12.1 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -|===============================+======================+======================| -| 0 NVIDIA GeForce ... 
Off | 00000000:01:00.0 Off | N/A | -| 30% 45C P0 50W / 450W | 0MiB / 24564MiB | 0% Default | -+-------------------------------+----------------------+----------------------+ -``` - -### Step 4: Initial Server Configuration - -#### 4.1 Update System - -```bash -# Update package lists -apt update - -# Upgrade existing packages -apt upgrade -y - -# Install essential tools -apt install -y \ - vim \ - htop \ - tmux \ - curl \ - wget \ - git \ - net-tools \ - iptables-persistent -``` - -#### 4.2 Set Timezone - -```bash -timedatectl set-timezone Europe/Berlin -date # Verify -``` - -#### 4.3 Create Working Directory - -```bash -# Create workspace -mkdir -p /workspace/{models,configs,data,scripts} - -# Check network volume mount -ls -la /workspace -# Should show your 500GB volume -``` - -#### 4.4 Configure SSH (Optional but Recommended) - -**Generate your own SSH key on your local machine:** - -```bash -# On your local machine (not GPU server) -ssh-keygen -t ed25519 -C "gpu-server-pivoine" -f ~/.ssh/gpu_pivoine - -# Copy public key to GPU server -ssh-copy-id -i ~/.ssh/gpu_pivoine.pub root@abc123def456.runpod.io -p 12345 -``` - -**Add to your local ~/.ssh/config:** - -```bash -Host gpu-pivoine - HostName abc123def456.runpod.io - Port 12345 - User root - IdentityFile ~/.ssh/gpu_pivoine -``` - -Now you can connect with: `ssh gpu-pivoine` - -### Step 5: Verify GPU Access - -Run this test: - -```bash -# Test CUDA -python3 -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('GPU count:', torch.cuda.device_count())" -``` - -Expected output: -``` -CUDA available: True -GPU count: 1 -``` - -### Troubleshooting - -**Problem: Can't connect via SSH** -- Check pod is running (not stopped) -- Verify port number in SSH command -- Try web terminal in RunPod dashboard - -**Problem: GPU not detected** -- Run `nvidia-smi` -- Check RunPod selected correct GPU type -- Restart pod if needed - -**Problem: Network volume not mounted** -- Check RunPod dashboard → Volume tab -- Verify volume is attached to pod -- Try: `df -h` to see mounts - -### Next Steps - -Once SSH access works and GPU is verified: -✅ Proceed to **Day 3-4: Network Configuration (Tailscale VPN)** - -### Save Important Info - -Create a file to track your setup: - -```bash -# On GPU server -cat > /workspace/SERVER_INFO.md << 'EOF' -# GPU Server Information - -## Connection -- SSH: ssh root@abc123def456.runpod.io -p 12345 -- Pod ID: abc123def456 -- Region: [YOUR_REGION] - -## Hardware -- GPU: RTX 4090 24GB -- CPU: [Check with: lscpu] -- RAM: [Check with: free -h] -- Storage: 500GB network volume at /workspace - -## Costs -- GPU: $0.50/hour -- Storage: $50/month -- Total: ~$410/month (24/7) - -## Deployed: [DATE] -EOF -``` - ---- - -## Checkpoint ✓ - -Before moving to Day 3, verify: -- [ ] RunPod account created and billing added -- [ ] RTX 4090 pod deployed successfully -- [ ] 500GB network volume attached -- [ ] SSH access working -- [ ] `nvidia-smi` shows GPU -- [ ] `torch.cuda.is_available()` returns True -- [ ] Timezone set to Europe/Berlin -- [ ] Essential tools installed - -**Ready for Tailscale setup? Let's go!** diff --git a/TAILSCALE_SETUP.md b/TAILSCALE_SETUP.md deleted file mode 100644 index 9950469..0000000 --- a/TAILSCALE_SETUP.md +++ /dev/null @@ -1,417 +0,0 @@ -# Tailscale VPN Setup - Better Alternative to WireGuard - -## Why Tailscale? - -RunPod doesn't support UDP ports, which blocks WireGuard. 
Tailscale solves this by: -- ✅ Works over HTTPS (TCP) - no UDP needed -- ✅ Zero configuration - automatic setup -- ✅ Free for personal use -- ✅ Built on WireGuard (same security) -- ✅ Automatic NAT traversal -- ✅ Peer-to-peer when possible (low latency) - ---- - -## Step 1: Create Tailscale Account - -1. Go to: https://tailscale.com/ -2. Click **"Get Started"** -3. Sign up with **GitHub** or **Google** (easiest) -4. You'll be redirected to the Tailscale admin console - -**No credit card required!** Free tier is perfect for our use case. - ---- - -## Step 2: Install Tailscale on VPS - -**SSH into your VPS:** - -```bash -ssh root@vps -``` - -**Install Tailscale:** - -```bash -# Download and run install script -curl -fsSL https://tailscale.com/install.sh | sh - -# Start Tailscale -tailscale up - -# You'll see a URL like: -# https://login.tailscale.com/a/xxxxxxxxxx -``` - -**Authenticate:** -1. Copy the URL and open in browser -2. Click **"Connect"** to authorize the device -3. Name it: `pivoine-vps` - -**Check status:** -```bash -tailscale status -``` - -You should see your VPS listed with an IP like `100.x.x.x` - -**Save your VPS Tailscale IP:** -```bash -tailscale ip -4 -# Example output: 100.101.102.103 -``` - -**Write this down - you'll need it!** - ---- - -## Step 3: Install Tailscale on GPU Server - -**SSH into your RunPod GPU server:** - -```bash -ssh root@abc123def456-12345678.runpod.io -p 12345 -``` - -**Install Tailscale:** - -```bash -# Download and run install script -curl -fsSL https://tailscale.com/install.sh | sh - -# Start Tailscale -tailscale up --advertise-tags=tag:gpu - -# You'll see another URL -``` - -**Authenticate:** -1. Copy the URL and open in browser -2. Click **"Connect"** -3. Name it: `gpu-runpod` - -**Check status:** -```bash -tailscale status -``` - -You should now see BOTH devices: -- `pivoine-vps` - 100.x.x.x -- `gpu-runpod` - 100.x.x.x - -**Save your GPU server Tailscale IP:** -```bash -tailscale ip -4 -# Example output: 100.104.105.106 -``` - ---- - -## Step 4: Test Connectivity - -**From VPS, ping GPU server:** - -```bash -# SSH into VPS -ssh root@vps - -# Ping GPU server (use its Tailscale IP) -ping 100.104.105.106 -c 4 -``` - -Expected output: -``` -PING 100.104.105.106 (100.104.105.106) 56(84) bytes of data. -64 bytes from 100.104.105.106: icmp_seq=1 ttl=64 time=15.3 ms -64 bytes from 100.104.105.106: icmp_seq=2 ttl=64 time=14.8 ms -... -``` - -**From GPU server, ping VPS:** - -```bash -# SSH into GPU server -ssh root@abc123def456-12345678.runpod.io -p 12345 - -# Ping VPS (use its Tailscale IP) -ping 100.101.102.103 -c 4 -``` - -**Both should work!** ✅ - ---- - -## Step 5: Update Configuration Files - -Now update the IP addresses in your configs to use Tailscale IPs. 
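-
-If you prefer to script the `.env` update instead of editing it by hand (the keys `VPS_IP`, `GPU_IP` and `DB_HOST` are the ones shown in the next subsection), a minimal sketch:
-
-```bash
-# On the GPU server
-cd /workspace/gpu-stack
-
-VPS_TS_IP="100.101.102.103"     # paste the output of `tailscale ip -4` run on the VPS
-GPU_TS_IP="$(tailscale ip -4)"  # Tailscale IP of this GPU server
-
-# Rewrite the relevant .env entries in place
-sed -i \
-  -e "s/^VPS_IP=.*/VPS_IP=${VPS_TS_IP}/" \
-  -e "s/^GPU_IP=.*/GPU_IP=${GPU_TS_IP}/" \
-  -e "s/^DB_HOST=.*/DB_HOST=${VPS_TS_IP}/" \
-  .env
-```
-
-Double-check the result with `grep -E '^(VPS_IP|GPU_IP|DB_HOST)=' .env` before restarting the stack.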
- -### On GPU Server (.env file) - -**Edit your .env file:** - -```bash -# On GPU server -cd /workspace/gpu-stack - -nano .env -``` - -**Update these lines:** -```bash -# VPN Network (use your actual Tailscale IPs) -VPS_IP=100.101.102.103 # Your VPS Tailscale IP -GPU_IP=100.104.105.106 # Your GPU Tailscale IP - -# PostgreSQL (on VPS) -DB_HOST=100.101.102.103 # Your VPS Tailscale IP -DB_PORT=5432 -``` - -Save and exit (Ctrl+X, Y, Enter) - -### On VPS (LiteLLM config) - -**Edit your LiteLLM config:** - -```bash -# On VPS -ssh root@vps -cd ~/Projects/docker-compose/ai - -nano litellm-config-gpu.yaml -``` - -**Update the GPU server IP:** - -```yaml -# Find this section and update IP: - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://100.104.105.106:8000/v1 # Use GPU Tailscale IP - api_key: dummy -``` - -Save and exit. - ---- - -## Step 6: Verify PostgreSQL Access - -**From GPU server, test database connection:** - -```bash -# Install PostgreSQL client -apt install -y postgresql-client - -# Test connection (use your VPS Tailscale IP) -psql -h 100.101.102.103 -U valknar -d openwebui -c "SELECT 1;" -``` - -**If this fails, allow Tailscale network on VPS PostgreSQL:** - -```bash -# On VPS -ssh root@vps - -# Check if postgres allows Tailscale network -docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100 - -# If not present, add it: -docker exec -it core_postgres bash - -# Inside container: -echo "host all all 100.0.0.0/8 scram-sha-256" >> /var/lib/postgresql/data/pg_hba.conf - -# Restart postgres -exit -docker restart core_postgres -``` - -Try connecting again - should work now! - ---- - -## Tailscale Management - -### View Connected Devices - -**Web dashboard:** -https://login.tailscale.com/admin/machines - -You'll see all your devices with their Tailscale IPs. - -**Command line:** -```bash -tailscale status -``` - -### Disconnect/Reconnect - -```bash -# Stop Tailscale -tailscale down - -# Start Tailscale -tailscale up -``` - -### Remove Device - -From web dashboard: -1. Click on device -2. Click "..." menu -3. Select "Disable" or "Delete" - ---- - -## Advantages Over WireGuard - -✅ **Works anywhere** - No UDP ports needed -✅ **Auto-reconnect** - Survives network changes -✅ **Multiple devices** - Easy to add laptop, phone, etc. 
-✅ **NAT traversal** - Direct peer-to-peer when possible -✅ **Access Control** - Manage from web dashboard -✅ **Monitoring** - See connection status in real-time - ---- - -## Security Notes - -🔒 **Tailscale is secure:** -- End-to-end encrypted (WireGuard) -- Zero-trust architecture -- No Tailscale servers can see your traffic -- Only authenticated devices can connect - -🔒 **Access control:** -- Only devices you authorize can join -- Revoke access anytime from dashboard -- Set ACLs for fine-grained control - ---- - -## Network Reference (Updated) - -**Old (WireGuard):** -- VPS: `10.8.0.1` -- GPU: `10.8.0.2` - -**New (Tailscale):** -- VPS: `100.101.102.103` (example - use your actual IP) -- GPU: `100.104.105.106` (example - use your actual IP) - -**All services now accessible via Tailscale:** - -**From VPS to GPU:** -- vLLM: `http://100.104.105.106:8000` -- ComfyUI: `http://100.104.105.106:8188` -- JupyterLab: `http://100.104.105.106:8888` -- Netdata: `http://100.104.105.106:19999` - -**From GPU to VPS:** -- PostgreSQL: `100.101.102.103:5432` -- Redis: `100.101.102.103:6379` -- LiteLLM: `http://100.101.102.103:4000` - ---- - -## Troubleshooting - -### Can't ping between devices - -**Check Tailscale status:** -```bash -tailscale status -``` - -Both devices should show "active" or "online". - -**Check connectivity:** -```bash -tailscale ping 100.104.105.106 -``` - -**Restart Tailscale:** -```bash -tailscale down && tailscale up -``` - -### PostgreSQL connection refused - -**Check if postgres is listening on all interfaces:** -```bash -# On VPS -docker exec core_postgres cat /var/lib/postgresql/data/postgresql.conf | grep listen_addresses -``` - -Should show: `listen_addresses = '*'` - -**Check pg_hba.conf allows Tailscale network:** -```bash -docker exec core_postgres cat /var/lib/postgresql/data/pg_hba.conf | grep 100 -``` - -Should have line: -``` -host all all 100.0.0.0/8 scram-sha-256 -``` - -### Device not showing in network - -**Re-authenticate:** -```bash -tailscale logout -tailscale up -# Click the new URL to re-authenticate -``` - ---- - -## Verification Checklist - -Before proceeding: -- [ ] Tailscale account created -- [ ] Tailscale installed on VPS -- [ ] Tailscale installed on GPU server -- [ ] Both devices visible in `tailscale status` -- [ ] VPS can ping GPU server (via Tailscale IP) -- [ ] GPU server can ping VPS (via Tailscale IP) -- [ ] PostgreSQL accessible from GPU server -- [ ] .env file updated with Tailscale IPs -- [ ] LiteLLM config updated with GPU Tailscale IP - ---- - -## Next Steps - -✅ **Network configured!** Proceed to Docker & GPU setup: - -```bash -cat /home/valknar/Projects/docker-compose/ai/DOCKER_GPU_SETUP.md -``` - -**Your Tailscale IPs (save these!):** -- VPS: `__________________` (from `tailscale ip -4` on VPS) -- GPU: `__________________` (from `tailscale ip -4` on GPU server) - ---- - -## Bonus: Add Your Local Machine - -Want to access GPU server from your laptop? - -```bash -# On your local machine -curl -fsSL https://tailscale.com/install.sh | sh -tailscale up - -# Now you can SSH directly via Tailscale: -ssh root@100.104.105.106 - -# Or access ComfyUI in browser: -# http://100.104.105.106:8188 -``` - -No more port forwarding needed! 
🎉 diff --git a/WIREGUARD_SETUP.md b/WIREGUARD_SETUP.md deleted file mode 100644 index 0f274fa..0000000 --- a/WIREGUARD_SETUP.md +++ /dev/null @@ -1,393 +0,0 @@ -# WireGuard VPN Setup - Connecting GPU Server to VPS - -## Day 3-4: Network Configuration - -This guide connects your RunPod GPU server to your VPS via WireGuard VPN, enabling secure, low-latency communication. - -### Architecture - -``` -┌─────────────────────────────┐ ┌──────────────────────────────┐ -│ VPS (pivoine.art) │ │ GPU Server (RunPod) │ -│ 10.8.0.1 (WireGuard) │◄───────►│ 10.8.0.2 (WireGuard) │ -├─────────────────────────────┤ ├──────────────────────────────┤ -│ - LiteLLM Proxy │ │ - vLLM (10.8.0.2:8000) │ -│ - Open WebUI │ │ - ComfyUI (10.8.0.2:8188) │ -│ - PostgreSQL │ │ - Training │ -└─────────────────────────────┘ └──────────────────────────────┘ -``` - -### Prerequisites - -- ✅ VPS with root access -- ✅ GPU server with root access -- ✅ Both servers have public IPs - ---- - -## Method 1: Using Existing wg-easy (Recommended) - -You already have `wg-easy` running on your VPS. Let's use it! - -### Step 1: Access wg-easy Dashboard - -**On your local machine:** - -1. Open browser: https://vpn.pivoine.art (or whatever your wg-easy URL is) -2. Login with admin password - -**Don't have wg-easy set up? Skip to Method 2.** - -### Step 2: Create GPU Server Client - -1. In wg-easy dashboard, click **"+ New Client"** -2. **Name**: `gpu-server-runpod` -3. Click **"Create"** -4. **Download** configuration file (or copy QR code data) - -You'll get a file like: `gpu-server-runpod.conf` - -### Step 3: Install WireGuard on GPU Server - -**SSH into GPU server:** - -```bash -ssh gpu-pivoine # or your SSH command - -# Install WireGuard -apt update -apt install -y wireguard wireguard-tools -``` - -### Step 4: Configure WireGuard on GPU Server - -**Upload the config file:** - -```bash -# On your local machine, copy the config to GPU server -scp gpu-server-runpod.conf gpu-pivoine:/etc/wireguard/wg0.conf - -# Or manually create it on GPU server: -nano /etc/wireguard/wg0.conf -# Paste the configuration from wg-easy -``` - -**Example config (yours will be different):** -```ini -[Interface] -PrivateKey = -Address = 10.8.0.2/24 -DNS = 10.8.0.1 - -[Peer] -PublicKey = -PresharedKey = -AllowedIPs = 10.8.0.0/24 -Endpoint = :51820 -PersistentKeepalive = 25 -``` - -### Step 5: Start WireGuard - -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Set permissions -chmod 600 /etc/wireguard/wg0.conf - -# Start WireGuard -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 - -# Check status -systemctl status wg-quick@wg0 -wg show -``` - -Expected output: -``` -interface: wg0 - public key: - private key: (hidden) - listening port: 51820 - -peer: - endpoint: :51820 - allowed ips: 10.8.0.0/24 - latest handshake: 1 second ago - transfer: 1.2 KiB received, 892 B sent - persistent keepalive: every 25 seconds -``` - -### Step 6: Test Connectivity - -**From GPU server, ping VPS:** - -```bash -ping 10.8.0.1 -c 4 -``` - -Expected output: -``` -PING 10.8.0.1 (10.8.0.1) 56(84) bytes of data. -64 bytes from 10.8.0.1: icmp_seq=1 ttl=64 time=25.3 ms -64 bytes from 10.8.0.1: icmp_seq=2 ttl=64 time=24.8 ms -... 
-``` - -**From VPS, ping GPU server:** - -```bash -ssh root@vps -ping 10.8.0.2 -c 4 -``` - -**Test PostgreSQL access from GPU server:** - -```bash -# On GPU server -apt install -y postgresql-client - -# Try connecting to VPS postgres -psql -h 10.8.0.1 -U valknar -d openwebui -c "SELECT 1;" -# Should work if postgres allows 10.8.0.0/24 -``` - ---- - -## Method 2: Manual WireGuard Setup (If no wg-easy) - -### Step 1: Install WireGuard on Both Servers - -**On VPS:** -```bash -ssh root@vps -apt update -apt install -y wireguard wireguard-tools -``` - -**On GPU Server:** -```bash -ssh gpu-pivoine -apt update -apt install -y wireguard wireguard-tools -``` - -### Step 2: Generate Keys - -**On VPS:** -```bash -cd /etc/wireguard -umask 077 -wg genkey | tee vps-private.key | wg pubkey > vps-public.key -``` - -**On GPU Server:** -```bash -cd /etc/wireguard -umask 077 -wg genkey | tee gpu-private.key | wg pubkey > gpu-public.key -``` - -### Step 3: Create Config on VPS - -**On VPS (`/etc/wireguard/wg0.conf`):** - -```bash -cat > /etc/wireguard/wg0.conf << 'EOF' -[Interface] -PrivateKey = -Address = 10.8.0.1/24 -ListenPort = 51820 -SaveConfig = false - -# GPU Server Peer -[Peer] -PublicKey = -AllowedIPs = 10.8.0.2/32 -PersistentKeepalive = 25 -EOF -``` - -Replace `` with contents of `vps-private.key` -Replace `` with contents from GPU server's `gpu-public.key` - -### Step 4: Create Config on GPU Server - -**On GPU Server (`/etc/wireguard/wg0.conf`):** - -```bash -cat > /etc/wireguard/wg0.conf << 'EOF' -[Interface] -PrivateKey = -Address = 10.8.0.2/24 - -[Peer] -PublicKey = -AllowedIPs = 10.8.0.0/24 -Endpoint = :51820 -PersistentKeepalive = 25 -EOF -``` - -Replace: -- `` with contents of `gpu-private.key` -- `` with contents from VPS's `vps-public.key` -- `` with your VPS's public IP address - -### Step 5: Start WireGuard on Both - -**On VPS:** -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Start WireGuard -chmod 600 /etc/wireguard/wg0.conf -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 -``` - -**On GPU Server:** -```bash -# Enable IP forwarding -echo "net.ipv4.ip_forward=1" >> /etc/sysctl.conf -sysctl -p - -# Start WireGuard -chmod 600 /etc/wireguard/wg0.conf -systemctl enable wg-quick@wg0 -systemctl start wg-quick@wg0 -``` - -### Step 6: Configure Firewall - -**On VPS:** -```bash -# Allow WireGuard port -ufw allow 51820/udp -ufw reload - -# Or with iptables -iptables -A INPUT -p udp --dport 51820 -j ACCEPT -iptables-save > /etc/iptables/rules.v4 -``` - -**On GPU Server (RunPod):** -```bash -# Allow WireGuard -ufw allow 51820/udp -ufw reload -``` - -### Step 7: Test Connection - -Same as Method 1 Step 6. - ---- - -## Troubleshooting - -### No handshake - -**Check:** -```bash -wg show -``` - -If "latest handshake" shows "never": -1. Verify public keys are correct (easy to swap them!) -2. Check firewall allows UDP 51820 -3. Verify endpoint IP is correct -4. 
Check `systemctl status wg-quick@wg0` for errors - -### Can ping but can't access services - -**On VPS, check PostgreSQL allows 10.8.0.0/24:** - -```bash -# Edit postgresql.conf -nano /var/lib/postgresql/data/postgresql.conf -# Add or modify: -listen_addresses = '*' - -# Edit pg_hba.conf -nano /var/lib/postgresql/data/pg_hba.conf -# Add: -host all all 10.8.0.0/24 scram-sha-256 - -# Restart -docker restart core_postgres -``` - -### WireGuard won't start - -```bash -# Check logs -journalctl -u wg-quick@wg0 -n 50 - -# Common issues: -# - Wrong permissions: chmod 600 /etc/wireguard/wg0.conf -# - Invalid keys: regenerate with wg genkey -# - Port already in use: lsof -i :51820 -``` - ---- - -## Verification Checklist - -Before proceeding to Day 5: - -- [ ] WireGuard installed on both VPS and GPU server -- [ ] VPN tunnel established (wg show shows handshake) -- [ ] GPU server can ping VPS (10.8.0.1) -- [ ] VPS can ping GPU server (10.8.0.2) -- [ ] Firewall allows WireGuard (UDP 51820) -- [ ] PostgreSQL accessible from GPU server -- [ ] WireGuard starts on boot (systemctl enable) - ---- - -## Network Reference - -**VPN IPs:** -- VPS: `10.8.0.1` -- GPU Server: `10.8.0.2` - -**Service Access from GPU Server:** -- PostgreSQL: `postgresql://valknar:password@10.8.0.1:5432/dbname` -- Redis: `10.8.0.1:6379` -- LiteLLM: `http://10.8.0.1:4000` -- Mailpit: `10.8.0.1:1025` - -**Service Access from VPS:** -- vLLM: `http://10.8.0.2:8000` -- ComfyUI: `http://10.8.0.2:8188` -- JupyterLab: `http://10.8.0.2:8888` - ---- - -## Next: Docker & GPU Setup - -Once VPN is working, proceed to **Day 5: Docker & NVIDIA Container Toolkit Setup**. - -**Save connection info:** - -```bash -# On GPU server -cat >> /workspace/SERVER_INFO.md << 'EOF' - -## VPN Configuration -- VPN IP: 10.8.0.2 -- VPS VPN IP: 10.8.0.1 -- WireGuard Status: Active -- Latest Handshake: [Check with: wg show] - -## Network Access -- Can reach VPS services: ✓ -- VPS can reach GPU services: ✓ -EOF -``` diff --git a/compose.yaml b/compose.yaml deleted file mode 100644 index 0daff89..0000000 --- a/compose.yaml +++ /dev/null @@ -1,206 +0,0 @@ -services: - # PostgreSQL with pgvector for AI/RAG workloads - ai_postgres: - image: ${AI_POSTGRES_IMAGE:-pgvector/pgvector:pg16} - container_name: ${AI_COMPOSE_PROJECT_NAME}_postgres - restart: unless-stopped - environment: - TZ: ${TIMEZONE:-Europe/Berlin} - POSTGRES_USER: ${AI_DB_USER} - POSTGRES_PASSWORD: ${AI_DB_PASSWORD} - POSTGRES_DB: ${AI_DB_NAME} - POSTGRES_HOST_AUTH_METHOD: scram-sha-256 - POSTGRES_INITDB_ARGS: --auth-host=scram-sha-256 - volumes: - - ai_postgres_data:/var/lib/postgresql/data - - ./postgres/init:/docker-entrypoint-initdb.d - healthcheck: - test: ['CMD-SHELL', 'pg_isready -U ${AI_DB_USER}'] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - networks: - - compose_network - - # Open WebUI - ChatGPT-like interface for AI models - webui: - image: ${AI_WEBUI_IMAGE:-ghcr.io/open-webui/open-webui:main} - container_name: ${AI_COMPOSE_PROJECT_NAME}_webui - restart: unless-stopped - environment: - TZ: ${TIMEZONE:-Europe/Berlin} - - # Database configuration - DATABASE_URL: postgresql://${AI_DB_USER}:${AI_DB_PASSWORD}@ai_postgres:5432/${AI_DB_NAME} - - # OpenAI API configuration (pointing to LiteLLM proxy) - OPENAI_API_BASE_URLS: http://litellm:4000 - OPENAI_API_KEYS: ${AI_LITELLM_API_KEY} - - # WebUI configuration - WEBUI_NAME: ${AI_WEBUI_NAME:-Pivoine AI} - WEBUI_URL: https://${AI_TRAEFIK_HOST} - WEBUI_SECRET_KEY: ${AI_WEBUI_SECRET_KEY} - - # Feature flags - ENABLE_SIGNUP: 
${AI_ENABLE_SIGNUP:-true} - ENABLE_RAG_WEB_SEARCH: ${AI_ENABLE_RAG_WEB_SEARCH:-true} - ENABLE_RAG_WEB_LOADER_SSL_VERIFICATION: ${AI_ENABLE_RAG_SSL_VERIFY:-true} - - # RAG configuration - RAG_EMBEDDING_ENGINE: ${AI_RAG_EMBEDDING_ENGINE:-openai} - RAG_EMBEDDING_MODEL: ${AI_RAG_EMBEDDING_MODEL:-text-embedding-3-small} - VECTOR_DB: ${AI_VECTOR_DB:-pgvector} - - # Email configuration (Mailpit SMTP relay) - SMTP_HOST: net_mailpit - SMTP_PORT: 1025 - SMTP_FROM_EMAIL: ${EMAIL_FROM} - SMTP_USE_TLS: false - SMTP_USE_SSL: false - - volumes: - - ai_webui_data:/app/backend/data - depends_on: - - ai_postgres - - litellm - networks: - - compose_network - labels: - - 'traefik.enable=${AI_TRAEFIK_ENABLED}' - # HTTP to HTTPS redirect - - 'traefik.http.middlewares.${AI_COMPOSE_PROJECT_NAME}-redirect-web-secure.redirectscheme.scheme=https' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web.middlewares=${AI_COMPOSE_PROJECT_NAME}-redirect-web-secure' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web.rule=Host(`${AI_TRAEFIK_HOST}`)' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web.entrypoints=web' - # HTTPS router - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web-secure.rule=Host(`${AI_TRAEFIK_HOST}`)' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web-secure.tls.certresolver=resolver' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web-secure.entrypoints=web-secure' - - 'traefik.http.middlewares.${AI_COMPOSE_PROJECT_NAME}-web-secure-compress.compress=true' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-web-secure.middlewares=${AI_COMPOSE_PROJECT_NAME}-web-secure-compress,security-headers@file' - # Service - - 'traefik.http.services.${AI_COMPOSE_PROJECT_NAME}-web-secure.loadbalancer.server.port=8080' - - 'traefik.docker.network=${NETWORK_NAME}' - # Watchtower - - 'com.centurylinklabs.watchtower.enable=${WATCHTOWER_LABEL_ENABLE}' - - # LiteLLM - Proxy to convert Anthropic API to OpenAI-compatible format - litellm: - image: ghcr.io/berriai/litellm:main-latest - container_name: ${AI_COMPOSE_PROJECT_NAME}_litellm - restart: unless-stopped - environment: - TZ: ${TIMEZONE:-Europe/Berlin} - ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY} - LITELLM_MASTER_KEY: ${AI_LITELLM_API_KEY} - DATABASE_URL: postgresql://${AI_DB_USER}:${AI_DB_PASSWORD}@ai_postgres:5432/litellm - LITELLM_DROP_PARAMS: 'true' - NO_DOCS: 'true' - NO_REDOC: 'true' - # Performance optimizations - LITELLM_LOG: 'ERROR' # Only log errors - LITELLM_MODE: 'PRODUCTION' # Production mode for better performance - volumes: - - ./litellm-config.yaml:/app/litellm-config.yaml:ro - command: - [ - '--config', - '/app/litellm-config.yaml', - '--host', - '0.0.0.0', - '--port', - '4000', - '--drop_params' - ] - depends_on: - - ai_postgres - networks: - - compose_network - healthcheck: - disable: true - labels: - - 'traefik.enable=${AI_TRAEFIK_ENABLED}' - # HTTP to HTTPS redirect - - 'traefik.http.middlewares.${AI_COMPOSE_PROJECT_NAME}-litellm-redirect-web-secure.redirectscheme.scheme=https' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web.middlewares=${AI_COMPOSE_PROJECT_NAME}-litellm-redirect-web-secure' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web.rule=Host(`${AI_LITELLM_TRAEFIK_HOST}`)' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web.entrypoints=web' - # HTTPS router - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure.rule=Host(`${AI_LITELLM_TRAEFIK_HOST}`)' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure.tls.certresolver=resolver' - - 
'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure.entrypoints=web-secure' - - 'traefik.http.middlewares.${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure-compress.compress=true' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure.middlewares=${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure-compress,security-headers@file' - # Service - - 'traefik.http.services.${AI_COMPOSE_PROJECT_NAME}-litellm-web-secure.loadbalancer.server.port=4000' - - 'traefik.docker.network=${NETWORK_NAME}' - # Watchtower - - 'com.centurylinklabs.watchtower.enable=${WATCHTOWER_LABEL_ENABLE}' - - # Crawl4AI - Web scraping for LLMs (internal API, no public access) - crawl4ai: - image: ${AI_CRAWL4AI_IMAGE:-unclecode/crawl4ai:latest} - container_name: ${AI_COMPOSE_PROJECT_NAME}_crawl4ai - restart: unless-stopped - environment: - TZ: ${TIMEZONE:-Europe/Berlin} - # API configuration - PORT: ${AI_CRAWL4AI_PORT:-11235} - volumes: - - ai_crawl4ai_data:/app/.crawl4ai - networks: - - compose_network - labels: - # No Traefik exposure - internal only - - 'traefik.enable=false' - # Watchtower - - 'com.centurylinklabs.watchtower.enable=${WATCHTOWER_LABEL_ENABLE}' - - # Facefusion - AI face swapping and enhancement - facefusion: - build: - context: . - dockerfile: Dockerfile - image: facefusion-patched:3.5.0-cpu - container_name: ${AI_COMPOSE_PROJECT_NAME}_facefusion - restart: unless-stopped - tty: true - command: ['python', '-u', 'facefusion.py', 'run'] - environment: - TZ: ${TIMEZONE:-Europe/Berlin} - GRADIO_SERVER_NAME: "0.0.0.0" - GRADIO_SERVER_PORT: "7860" - volumes: - - ai_facefusion_data:/workspace - networks: - - compose_network - labels: - - 'traefik.enable=${AI_FACEFUSION_TRAEFIK_ENABLED}' - # HTTP to HTTPS redirect - - 'traefik.http.middlewares.${AI_COMPOSE_PROJECT_NAME}-facefusion-redirect-web-secure.redirectscheme.scheme=https' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web.middlewares=${AI_COMPOSE_PROJECT_NAME}-facefusion-redirect-web-secure' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web.rule=Host(`${AI_FACEFUSION_TRAEFIK_HOST}`)' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web.entrypoints=web' - # HTTPS router with Authelia - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure.rule=Host(`${AI_FACEFUSION_TRAEFIK_HOST}`)' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure.tls.certresolver=resolver' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure.entrypoints=web-secure' - - 'traefik.http.middlewares.${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure-compress.compress=true' - - 'traefik.http.routers.${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure.middlewares=${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure-compress,net-authelia,security-headers@file' - # Service - - 'traefik.http.services.${AI_COMPOSE_PROJECT_NAME}-facefusion-web-secure.loadbalancer.server.port=7860' - - 'traefik.docker.network=${NETWORK_NAME}' - # Watchtower - disabled for custom local image - - 'com.centurylinklabs.watchtower.enable=false' - -volumes: - ai_postgres_data: - name: ${AI_COMPOSE_PROJECT_NAME}_postgres_data - ai_webui_data: - name: ${AI_COMPOSE_PROJECT_NAME}_webui_data - ai_crawl4ai_data: - name: ${AI_COMPOSE_PROJECT_NAME}_crawl4ai_data - ai_facefusion_data: - name: ${AI_COMPOSE_PROJECT_NAME}_facefusion_data diff --git a/deploy-gpu-stack.sh b/deploy-gpu-stack.sh deleted file mode 100755 index f770946..0000000 --- a/deploy-gpu-stack.sh +++ /dev/null @@ -1,229 +0,0 @@ -#!/bin/bash 
-# GPU Stack Deployment Script -# Run this on the GPU server after SSH access is established - -set -e # Exit on error - -echo "==================================" -echo "GPU Stack Deployment Script" -echo "==================================" -echo "" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -# Functions -print_success() { - echo -e "${GREEN}✓ $1${NC}" -} - -print_error() { - echo -e "${RED}✗ $1${NC}" -} - -print_info() { - echo -e "${YELLOW}→ $1${NC}" -} - -# Check if running as root -if [[ $EUID -ne 0 ]]; then - print_error "This script must be run as root (use sudo)" - exit 1 -fi - -# Step 1: Check prerequisites -print_info "Checking prerequisites..." - -if ! command -v docker &> /dev/null; then - print_error "Docker is not installed. Please run DOCKER_GPU_SETUP.md first." - exit 1 -fi -print_success "Docker installed" - -if ! command -v nvidia-smi &> /dev/null; then - print_error "nvidia-smi not found. Is this a GPU server?" - exit 1 -fi -print_success "NVIDIA GPU detected" - -if ! docker run --rm --runtime=nvidia --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then - print_error "Docker cannot access GPU. Please configure NVIDIA Container Toolkit." - exit 1 -fi -print_success "Docker GPU access working" - -# Step 2: Create directory structure -print_info "Creating directory structure..." - -mkdir -p /workspace/gpu-stack/{vllm,comfyui,training/{configs,data,output},notebooks,monitoring} -cd /workspace/gpu-stack - -print_success "Directory structure created" - -# Step 3: Create .env file -if [ ! -f .env ]; then - print_info "Creating .env file..." - - cat > .env << 'EOF' -# GPU Stack Environment Variables - -# Timezone -TIMEZONE=Europe/Berlin - -# VPN Network -VPS_IP=10.8.0.1 -GPU_IP=10.8.0.2 - -# Model Storage (network volume) -MODELS_PATH=/workspace/models - -# Hugging Face Token (optional, for gated models like Llama) -# Get from: https://huggingface.co/settings/tokens -HF_TOKEN= - -# Weights & Biases (optional, for training logging) -# Get from: https://wandb.ai/authorize -WANDB_API_KEY= - -# JupyterLab Access Token -JUPYTER_TOKEN=pivoine-ai-2025 - -# PostgreSQL (on VPS) -DB_HOST=10.8.0.1 -DB_PORT=5432 -DB_USER=valknar -DB_PASSWORD=ragnarok98 -DB_NAME=openwebui -EOF - - chmod 600 .env - print_success ".env file created (please edit with your tokens)" -else - print_success ".env file already exists" -fi - -# Step 4: Download docker-compose.yaml -print_info "Downloading docker-compose.yaml..." - -# In production, this would be copied from the repo -# For now, assume it's already in the current directory -if [ ! -f docker-compose.yaml ]; then - print_error "docker-compose.yaml not found. Please copy gpu-server-compose.yaml to docker-compose.yaml" - exit 1 -fi - -print_success "docker-compose.yaml found" - -# Step 5: Pre-download models (optional but recommended) -print_info "Do you want to pre-download models? (y/n)" -read -r response - -if [[ "$response" =~ ^[Yy]$ ]]; then - print_info "Downloading Llama 3.1 8B Instruct (this will take a while)..." 
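-
-    # Note: Meta-Llama models are gated on Hugging Face. Make sure HF_TOKEN is
-    # exported in this shell (or run `huggingface-cli login`) before continuing;
-    # the .env file written above is read by docker compose, not by this script.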
- - mkdir -p /workspace/models - - # Use huggingface-cli to download - pip install -q huggingface-hub - - huggingface-cli download \ - meta-llama/Meta-Llama-3.1-8B-Instruct \ - --local-dir /workspace/models/Meta-Llama-3.1-8B-Instruct \ - --local-dir-use-symlinks False || print_error "Model download failed (may need HF_TOKEN)" - - print_success "Model downloaded to /workspace/models" -fi - -# Step 6: Start services -print_info "Starting GPU stack services..." - -docker compose up -d vllm comfyui jupyter netdata - -print_success "Services starting (this may take a few minutes)..." - -# Step 7: Wait for services -print_info "Waiting for services to be ready..." - -sleep 10 - -# Check service health -print_info "Checking service status..." - -if docker ps | grep -q gpu_vllm; then - print_success "vLLM container running" -else - print_error "vLLM container not running" -fi - -if docker ps | grep -q gpu_comfyui; then - print_success "ComfyUI container running" -else - print_error "ComfyUI container not running" -fi - -if docker ps | grep -q gpu_jupyter; then - print_success "JupyterLab container running" -else - print_error "JupyterLab container not running" -fi - -if docker ps | grep -q gpu_netdata; then - print_success "Netdata container running" -else - print_error "Netdata container not running" -fi - -# Step 8: Display access information -echo "" -echo "==================================" -echo "Deployment Complete!" -echo "==================================" -echo "" -echo "Services accessible via VPN (from VPS):" -echo " - vLLM API: http://10.8.0.2:8000" -echo " - ComfyUI: http://10.8.0.2:8188" -echo " - JupyterLab: http://10.8.0.2:8888 (token: pivoine-ai-2025)" -echo " - Netdata: http://10.8.0.2:19999" -echo "" -echo "Local access (from GPU server):" -echo " - vLLM API: http://localhost:8000" -echo " - ComfyUI: http://localhost:8188" -echo " - JupyterLab: http://localhost:8888" -echo " - Netdata: http://localhost:19999" -echo "" -echo "Useful commands:" -echo " - View logs: docker compose logs -f" -echo " - Check status: docker compose ps" -echo " - Stop all: docker compose down" -echo " - Restart service: docker compose restart vllm" -echo " - Start training: docker compose --profile training up -d axolotl" -echo "" -echo "Next steps:" -echo " 1. Wait for vLLM to load model (check logs: docker compose logs -f vllm)" -echo " 2. Test vLLM: curl http://localhost:8000/v1/models" -echo " 3. Configure LiteLLM on VPS to use http://10.8.0.2:8000" -echo " 4. Download ComfyUI models via web interface" -echo "" - -# Step 9: Create helpful aliases -print_info "Creating helpful aliases..." - -cat >> ~/.bashrc << 'EOF' - -# GPU Stack Aliases -alias gpu-logs='cd /workspace/gpu-stack && docker compose logs -f' -alias gpu-ps='cd /workspace/gpu-stack && docker compose ps' -alias gpu-restart='cd /workspace/gpu-stack && docker compose restart' -alias gpu-down='cd /workspace/gpu-stack && docker compose down' -alias gpu-up='cd /workspace/gpu-stack && docker compose up -d' -alias gpu-stats='watch -n 1 nvidia-smi' -alias gpu-top='nvtop' -EOF - -print_success "Aliases added to ~/.bashrc (reload with: source ~/.bashrc)" - -echo "" -print_success "All done! 
🚀" diff --git a/disable-nsfw-filter.patch b/disable-nsfw-filter.patch deleted file mode 100644 index 6853110..0000000 --- a/disable-nsfw-filter.patch +++ /dev/null @@ -1,12 +0,0 @@ ---- a/facefusion/content_analyser.py -+++ b/facefusion/content_analyser.py -@@ -194,7 +194,8 @@ def analyse_frame(vision_frame : VisionFrame) -> bool: - is_nsfw_2 = detect_with_nsfw_2(vision_frame) - is_nsfw_3 = detect_with_nsfw_3(vision_frame) - -- return is_nsfw_1 and is_nsfw_2 or is_nsfw_1 and is_nsfw_3 or is_nsfw_2 and is_nsfw_3 -+ # Patched to disable NSFW filter - always return False (content is safe) -+ return False - - - def detect_with_nsfw_1(vision_frame : VisionFrame) -> bool: diff --git a/DEPLOYMENT.md b/docs/DEPLOYMENT.md similarity index 100% rename from DEPLOYMENT.md rename to docs/DEPLOYMENT.md diff --git a/GPU_DEPLOYMENT_LOG.md b/docs/GPU_DEPLOYMENT_LOG.md similarity index 100% rename from GPU_DEPLOYMENT_LOG.md rename to docs/GPU_DEPLOYMENT_LOG.md diff --git a/RUNPOD_TEMPLATE.md b/docs/RUNPOD_TEMPLATE.md similarity index 100% rename from RUNPOD_TEMPLATE.md rename to docs/RUNPOD_TEMPLATE.md diff --git a/entrypoint.sh b/entrypoint.sh deleted file mode 100755 index a2cd939..0000000 --- a/entrypoint.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh -echo "Patching Facefusion to disable NSFW filter..." - -# Patch content_analyser.py line 197 to always return False (content is safe) -sed -i '197s/.*/\treturn False # Patched: NSFW filter disabled/' /facefusion/facefusion/content_analyser.py - -# Verify the patch was applied -if grep -q 'return False.*Patched' /facefusion/facefusion/content_analyser.py; then - echo "NSFW filter successfully disabled" -else - echo "ERROR: Patch failed!" - exit 1 -fi - -echo "Starting Facefusion..." -cd /facefusion && exec python -u facefusion.py run diff --git a/gpu-server-compose.yaml b/gpu-server-compose.yaml deleted file mode 100644 index 9cb2f70..0000000 --- a/gpu-server-compose.yaml +++ /dev/null @@ -1,237 +0,0 @@ -# GPU Server Docker Compose Configuration -# Deploy on RunPod GPU server (10.8.0.2) -# Services accessible from VPS (10.8.0.1) via WireGuard VPN - -version: '3.8' - -services: - # ============================================================================= - # vLLM - High-performance LLM Inference Server - # ============================================================================= - vllm: - image: vllm/vllm-openai:latest - container_name: gpu_vllm - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - CUDA_VISIBLE_DEVICES: "0" - HF_TOKEN: ${HF_TOKEN:-} - volumes: - - ${MODELS_PATH:-/workspace/models}:/root/.cache/huggingface - command: - - --model - - meta-llama/Meta-Llama-3.1-8B-Instruct # Change model here - - --host - - 0.0.0.0 - - --port - - 8000 - - --tensor-parallel-size - - "1" - - --gpu-memory-utilization - - "0.85" # Leave 15% for other tasks - - --max-model-len - - "8192" - - --dtype - - auto - - --trust-remote-code - ports: - - "8000:8000" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 120s # Model loading takes time - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=vllm" - - "stack=gpu-ai" - - # ============================================================================= - # ComfyUI - Advanced Stable Diffusion Interface - # ============================================================================= - comfyui: - image: 
ghcr.io/ai-dock/comfyui:latest - container_name: gpu_comfyui - restart: unless-stopped - runtime: nvidia - environment: - NVIDIA_VISIBLE_DEVICES: all - TZ: ${TIMEZONE:-Europe/Berlin} - # ComfyUI auto-installs custom nodes on first run - COMFYUI_FLAGS: "--listen 0.0.0.0 --port 8188" - volumes: - - comfyui_data:/data - - ${MODELS_PATH:-/workspace/models}/comfyui:/opt/ComfyUI/models - - comfyui_output:/opt/ComfyUI/output - - comfyui_input:/opt/ComfyUI/input - ports: - - "8188:8188" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8188/"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 60s - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=comfyui" - - "stack=gpu-ai" - - # ============================================================================= - # Axolotl - LLM Fine-tuning Framework - # ============================================================================= - # Note: This service uses "profiles" - only starts when explicitly requested - # Start with: docker compose --profile training up -d axolotl - axolotl: - image: winglian/axolotl:main-py3.11-cu121-2.2.2 - container_name: gpu_training - runtime: nvidia - volumes: - - ./training/configs:/workspace/configs - - ./training/data:/workspace/data - - ./training/output:/workspace/output - - ${MODELS_PATH:-/workspace/models}:/workspace/models - - training_cache:/root/.cache - environment: - NVIDIA_VISIBLE_DEVICES: all - WANDB_API_KEY: ${WANDB_API_KEY:-} - HF_TOKEN: ${HF_TOKEN:-} - working_dir: /workspace - # Default command - override when running specific training - command: sleep infinity - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: - - training - labels: - - "service=axolotl" - - "stack=gpu-ai" - - # ============================================================================= - # JupyterLab - Interactive Development Environment - # ============================================================================= - jupyter: - image: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-devel - container_name: gpu_jupyter - restart: unless-stopped - runtime: nvidia - volumes: - - ./notebooks:/workspace/notebooks - - ${MODELS_PATH:-/workspace/models}:/workspace/models - - jupyter_cache:/root/.cache - ports: - - "8888:8888" - environment: - NVIDIA_VISIBLE_DEVICES: all - JUPYTER_ENABLE_LAB: "yes" - JUPYTER_TOKEN: ${JUPYTER_TOKEN:-pivoine-ai-2025} - HF_TOKEN: ${HF_TOKEN:-} - command: | - bash -c " - pip install --quiet jupyterlab transformers datasets accelerate bitsandbytes peft trl sentencepiece protobuf && - jupyter lab --ip=0.0.0.0 --port=8888 --allow-root --no-browser --NotebookApp.token='${JUPYTER_TOKEN:-pivoine-ai-2025}' - " - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8888/"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 60s - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - labels: - - "service=jupyter" - - "stack=gpu-ai" - - # ============================================================================= - # Netdata - System & GPU Monitoring - # ============================================================================= - netdata: - image: netdata/netdata:latest - container_name: gpu_netdata - restart: unless-stopped - runtime: nvidia - hostname: gpu-runpod - cap_add: - - SYS_PTRACE - - SYS_ADMIN - security_opt: - - apparmor:unconfined - environment: - NVIDIA_VISIBLE_DEVICES: all - TZ: 
${TIMEZONE:-Europe/Berlin} - volumes: - - /sys:/host/sys:ro - - /proc:/host/proc:ro - - /var/run/docker.sock:/var/run/docker.sock:ro - - /etc/os-release:/host/etc/os-release:ro - - netdata_config:/etc/netdata - - netdata_cache:/var/cache/netdata - - netdata_lib:/var/lib/netdata - ports: - - "19999:19999" - labels: - - "service=netdata" - - "stack=gpu-ai" - -# ============================================================================= -# Volumes -# ============================================================================= -volumes: - # ComfyUI data - comfyui_data: - driver: local - comfyui_output: - driver: local - comfyui_input: - driver: local - - # Training data - training_cache: - driver: local - - # Jupyter data - jupyter_cache: - driver: local - - # Netdata data - netdata_config: - driver: local - netdata_cache: - driver: local - netdata_lib: - driver: local - -# ============================================================================= -# Networks -# ============================================================================= -networks: - default: - driver: bridge - ipam: - config: - - subnet: 172.25.0.0/24 diff --git a/litellm-config-gpu.yaml b/litellm-config-gpu.yaml deleted file mode 100644 index 5313d64..0000000 --- a/litellm-config-gpu.yaml +++ /dev/null @@ -1,199 +0,0 @@ -# LiteLLM Configuration with GPU Server Integration -# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server) - -model_list: - # ============================================================================= - # Anthropic Claude Models (API-based, for complex reasoning) - # ============================================================================= - - - model_name: claude-sonnet-4 - litellm_params: - model: anthropic/claude-sonnet-4-20250514 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-sonnet-4.5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-5-sonnet - litellm_params: - model: anthropic/claude-3-5-sonnet-20241022 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-opus - litellm_params: - model: anthropic/claude-3-opus-20240229 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-haiku - litellm_params: - model: anthropic/claude-3-haiku-20240307 - api_key: os.environ/ANTHROPIC_API_KEY - - # ============================================================================= - # Self-Hosted Models (vLLM on GPU server via WireGuard VPN) - # ============================================================================= - - # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks - - model_name: llama-3.1-8b - litellm_params: - model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct - api_base: http://10.8.0.2:8000/v1 - api_key: dummy # vLLM doesn't require auth - rpm: 1000 # Rate limit: requests per minute - tpm: 100000 # Rate limit: tokens per minute - - # Alternative models (uncomment and configure on GPU server as needed) - - # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning - # - model_name: qwen-2.5-14b - # litellm_params: - # model: openai/Qwen/Qwen2.5-14B-Instruct - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 800 - # tpm: 80000 - - # Mistral 7B Instruct - Very fast, lightweight - # - model_name: mistral-7b - # litellm_params: - # model: openai/mistralai/Mistral-7B-Instruct-v0.3 - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 1200 - # tpm: 120000 - - # DeepSeek Coder 
6.7B - Code generation specialist - # - model_name: deepseek-coder-6.7b - # litellm_params: - # model: openai/deepseek-ai/deepseek-coder-6.7b-instruct - # api_base: http://10.8.0.2:8000/v1 - # api_key: dummy - # rpm: 1000 - # tpm: 100000 - -# ============================================================================= -# Router Settings - Intelligent Model Selection -# ============================================================================= - -# Model aliases for easy switching in Open WebUI -model_name_map: - # Default model (self-hosted, fast) - gpt-3.5-turbo: llama-3.1-8b - - # Power users can use Claude for complex tasks - gpt-4: claude-sonnet-4.5 - gpt-4-turbo: claude-sonnet-4.5 - -# LiteLLM Settings -litellm_settings: - drop_params: true - set_verbose: false # Disable verbose logging for better performance - - # Enable caching with Redis for better performance - cache: true - cache_params: - type: redis - host: redis - port: 6379 - ttl: 3600 # Cache for 1 hour - - # Force strip specific parameters globally - allowed_fails: 0 - - # Modify params before sending to provider - modify_params: true - - # Enable success and failure logging but minimize overhead - success_callback: [] # Disable all success callbacks to reduce DB writes - failure_callback: [] # Disable all failure callbacks - -# Router Settings -router_settings: - allowed_fails: 0 - - # Routing strategy: Try self-hosted first, fallback to Claude on failure - routing_strategy: simple-shuffle - - # Cooldown for failed models - cooldown_time: 30 # seconds - -# Drop unsupported parameters -default_litellm_params: - drop_params: true - -# General Settings -general_settings: - disable_responses_id_security: true - - # Disable spend tracking to reduce database overhead - disable_spend_logs: false # Keep enabled to track API vs GPU costs - - # Disable tag tracking - disable_tag_tracking: true - - # Disable daily spend updates - disable_daily_spend_logs: false # Keep enabled for cost analysis - - # Master key for authentication (set via env var) - master_key: os.environ/LITELLM_MASTER_KEY - - # Database for logging (optional but recommended for cost tracking) - database_url: os.environ/DATABASE_URL - - # Enable OpenAPI docs - docs_url: /docs - -# ============================================================================= -# Usage Guidelines (for Open WebUI users) -# ============================================================================= -# -# Model Selection Guide: -# -# Use llama-3.1-8b for: -# - General chat and Q&A -# - Simple code generation -# - Data extraction -# - Summarization -# - Translation -# - Most routine tasks -# Cost: ~$0/month (self-hosted) -# Speed: ~50-80 tokens/second -# -# Use qwen-2.5-14b for: -# - Complex reasoning -# - Multi-step problems -# - Advanced code generation -# - Multilingual tasks -# Cost: ~$0/month (self-hosted) -# Speed: ~30-50 tokens/second -# -# Use claude-sonnet-4.5 for: -# - Very complex reasoning -# - Long documents (200K context) -# - Production-critical code -# - When quality matters most -# Cost: ~$3/million input tokens, ~$15/million output tokens -# Speed: ~30-40 tokens/second -# -# Use claude-3-haiku for: -# - API fallback (if self-hosted down) -# - Very fast responses needed -# Cost: ~$0.25/million input tokens, ~$1.25/million output tokens -# Speed: ~60-80 tokens/second -# -# ============================================================================= - -# Health Check Configuration -health_check: - # Check vLLM health endpoint - enabled: true - interval: 30 # seconds - 
timeout: 5 # seconds - -# Fallback Configuration -# If GPU server is down, automatically use Claude -fallback: - - ["llama-3.1-8b", "claude-3-haiku"] - - ["qwen-2.5-14b", "claude-sonnet-4.5"] diff --git a/litellm-config.yaml b/litellm-config.yaml deleted file mode 100644 index 134375e..0000000 --- a/litellm-config.yaml +++ /dev/null @@ -1,91 +0,0 @@ -model_list: - - model_name: claude-sonnet-4 - litellm_params: - model: anthropic/claude-sonnet-4-20250514 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-sonnet-4.5 - litellm_params: - model: anthropic/claude-sonnet-4-5-20250929 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-5-sonnet - litellm_params: - model: anthropic/claude-3-5-sonnet-20241022 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-opus - litellm_params: - model: anthropic/claude-3-opus-20240229 - api_key: os.environ/ANTHROPIC_API_KEY - - - model_name: claude-3-haiku - litellm_params: - model: anthropic/claude-3-haiku-20240307 - api_key: os.environ/ANTHROPIC_API_KEY - - # =========================================================================== - # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN) - # =========================================================================== - # All requests route through orchestrator (port 9000) which manages model loading - - # Text Generation - - model_name: qwen-2.5-7b - litellm_params: - model: openai/qwen-2.5-7b - api_base: http://100.100.108.13:9000/v1 # Orchestrator endpoint - api_key: dummy - rpm: 1000 - tpm: 100000 - - # Image Generation - - model_name: flux-schnell - litellm_params: - model: openai/dall-e-3 # OpenAI-compatible mapping - api_base: http://100.100.108.13:9000/v1 # Orchestrator endpoint - api_key: dummy - rpm: 100 - max_parallel_requests: 3 - - # Music Generation - - model_name: musicgen-medium - litellm_params: - model: openai/musicgen-medium - api_base: http://100.100.108.13:9000/v1 # Orchestrator endpoint - api_key: dummy - rpm: 50 - max_parallel_requests: 1 - -litellm_settings: - drop_params: true - set_verbose: false # Disable verbose logging for better performance - # Enable caching with Redis for better performance - cache: true - cache_params: - type: redis - host: redis - port: 6379 - ttl: 3600 # Cache for 1 hour - # Force strip specific parameters globally - allowed_fails: 0 - # Modify params before sending to provider - modify_params: true - # Enable success and failure logging but minimize overhead - success_callback: [] # Disable all success callbacks to reduce DB writes - failure_callback: [] # Disable all failure callbacks - -router_settings: - allowed_fails: 0 - -# Drop unsupported parameters -default_litellm_params: - drop_params: true - -general_settings: - disable_responses_id_security: true - # Disable spend tracking to reduce database overhead - disable_spend_logs: true - # Disable tag tracking - disable_tag_tracking: true - # Disable daily spend updates - disable_daily_spend_logs: true diff --git a/postgres/init/01-init-databases.sh b/postgres/init/01-init-databases.sh deleted file mode 100755 index 69e7094..0000000 --- a/postgres/init/01-init-databases.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -set -e - -# PostgreSQL initialization script for AI stack -# This script runs on first database initialization -# Creates all databases required by AI services - -echo "Starting AI stack database initialization..." 
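-
-# The SELECT ... \gexec pattern below is what makes this script idempotent:
-# psql runs each generated CREATE DATABASE statement only when the database
-# does not already exist (PostgreSQL has no CREATE DATABASE IF NOT EXISTS).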
- -psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL - -- Create databases for AI services - -- Open WebUI database - SELECT 'CREATE DATABASE openwebui' - WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'openwebui')\gexec - - -- LiteLLM proxy database - SELECT 'CREATE DATABASE litellm' - WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'litellm')\gexec - - -- Grant privileges to all databases - GRANT ALL PRIVILEGES ON DATABASE openwebui TO $POSTGRES_USER; - GRANT ALL PRIVILEGES ON DATABASE litellm TO $POSTGRES_USER; - - -- Log success - SELECT 'AI stack databases initialized:' AS status; - SELECT datname FROM pg_database - WHERE datname IN ('openwebui', 'litellm') - ORDER BY datname; -EOSQL - -echo "" -echo "✓ PostgreSQL initialization completed" -echo "✓ All AI stack databases created successfully" -echo "" -echo "Databases available:" -echo " • openwebui - Open WebUI application database" -echo " • litellm - LiteLLM proxy database" -echo "" diff --git a/simple_vllm_server.py b/simple_vllm_server.py deleted file mode 100644 index 0075bd2..0000000 --- a/simple_vllm_server.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple vLLM server using AsyncLLMEngine directly -Bypasses the multiprocessing issues we hit with the default vLLM API server -OpenAI-compatible endpoints: /v1/models and /v1/completions -""" - -import asyncio -import json -import logging -import os -from typing import AsyncIterator, Dict, List, Optional - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel, Field -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams -from vllm.utils import random_uuid - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="Simple vLLM Server", version="1.0.0") - -# Global engine instance -engine: Optional[AsyncLLMEngine] = None -model_name: str = "Qwen/Qwen2.5-7B-Instruct" - -# Request/Response models -class CompletionRequest(BaseModel): - """OpenAI-compatible completion request""" - model: str = Field(default="qwen-2.5-7b") - prompt: str | List[str] = Field(..., description="Text prompt(s)") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - -class ChatMessage(BaseModel): - """Chat message format""" - role: str = Field(..., description="Role: system, user, or assistant") - content: str = Field(..., description="Message content") - -class ChatCompletionRequest(BaseModel): - """OpenAI-compatible chat completion request""" - model: str = Field(default="qwen-2.5-7b") - messages: List[ChatMessage] = Field(..., description="Chat messages") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - -@app.on_event("startup") -async def startup_event(): - """Initialize vLLM engine on startup""" - global 
engine, model_name - - logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - - # Configure engine - engine_args = AsyncEngineArgs( - model=model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir="/workspace/huggingface_cache", # Large disk - trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs for better performance - ) - - # Create async engine - engine = AsyncLLMEngine.from_engine_args(engine_args) - - logger.info("vLLM AsyncLLMEngine initialized successfully") - -@app.get("/") -async def root(): - """Health check endpoint""" - return {"status": "ok", "model": model_name} - -@app.get("/health") -async def health(): - """Detailed health check""" - return { - "status": "healthy" if engine else "initializing", - "model": model_name, - "ready": engine is not None - } - -@app.get("/v1/models") -async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", - "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - -def messages_to_prompt(messages: List[ChatMessage]) -> str: - """Convert chat messages to a single prompt string""" - # Qwen 2.5 chat template format - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": 
final_output.outputs[0].finish_reason, - }) - - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Convert messages to prompt - prompt = messages_to_prompt(request.messages) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) - - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { - "id": request_id, - "object": "chat.completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, - } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } - } - -if __name__ == "__main__": - import uvicorn - - # Get configuration from environment - host = os.getenv("VLLM_HOST", "0.0.0.0") - port = int(os.getenv("VLLM_PORT", "8000")) - - logger.info(f"Starting vLLM server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - )
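
For reference, a minimal client sketch against the OpenAI-compatible `/v1/chat/completions` endpoint that the deleted `simple_vllm_server.py` exposed. The host, port, and model id are taken from the file's own defaults (`VLLM_HOST`/`VLLM_PORT` falling back to `0.0.0.0:8000`, model id `qwen-2.5-7b` from `/v1/models`); the running server, the `localhost` address, and the `requests` dependency are assumptions, not part of the patch.

```python
# Sketch of a non-streaming chat completion request, assuming the (now removed)
# simple_vllm_server.py is running locally on its default port 8000 and the
# `requests` package is installed. Adjust host/port for your deployment.
import requests

payload = {
    "model": "qwen-2.5-7b",  # model id the server reports via /v1/models
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    "max_tokens": 64,
    "temperature": 0.7,
    "stream": False,
}

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical local endpoint
    json=payload,
    timeout=120,
)
resp.raise_for_status()
# Response shape matches the handler above: choices[0].message.content
print(resp.json()["choices"][0]["message"]["content"])
```

Because the server followed the OpenAI request/response schema, the same request shape should work unchanged when pointed at the orchestrator endpoint that the retained `litellm-config-gpu.yaml` routes `qwen-2.5-7b` through, which is presumably why this standalone script could be removed without touching any clients.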