diff --git a/CLAUDE.md b/CLAUDE.md
index c12bfe1..6e45958 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -143,6 +143,64 @@ arty run services/logs # Follow ComfyUI logs via arty
 - `comfyui` - ComfyUI server (port 8188, autostart enabled)
 - `orchestrator` - Model orchestrator (port 9000, autostart disabled)
 
+### GPU Memory Management and Mode Switching
+
+**VRAM Constraints (RTX 4090 - 24GB total):**
+
+The GPU has limited memory, which requires manual service switching:
+
+| Service | Model | VRAM Usage | Compatible With |
+|---------|-------|------------|-----------------|
+| ComfyUI | FLUX Schnell FP16 | ~23GB | None (uses all VRAM) |
+| ComfyUI | SDXL Base | ~12GB | Small vLLM models |
+| vLLM | Qwen 2.5 7B | ~14GB | None (conflicts with ComfyUI) |
+| vLLM | Llama 3.1 8B | ~17GB | None (conflicts with ComfyUI) |
+
+**Mode Switching Workflow:**
+
+Since ComfyUI and vLLM models cannot run simultaneously (they exceed 24GB combined), you must manually switch modes:
+
+**Switch to Text Generation Mode (vLLM):**
+```bash
+# 1. Stop ComfyUI
+supervisorctl stop comfyui
+
+# 2. Start orchestrator (manages vLLM models)
+supervisorctl start orchestrator
+
+# 3. Verify
+supervisorctl status
+nvidia-smi  # Check VRAM usage
+```
+
+**Switch to Image/Video/Audio Generation Mode (ComfyUI):**
+```bash
+# 1. Stop orchestrator (stops all vLLM models)
+supervisorctl stop orchestrator
+
+# 2. Start ComfyUI
+supervisorctl start comfyui
+
+# 3. Verify
+supervisorctl status
+nvidia-smi  # Check VRAM usage
+```
+
+**Access via Supervisor Web UI:**
+
+You can also switch modes using the Supervisor web interface:
+- URL: `https://supervisor.ai.pivoine.art` (via VPS proxy) or `http://100.114.60.40:9001` (direct Tailscale)
+- Username: `admin`
+- Password: `runpod2024`
+- Click "Start" or "Stop" buttons for each service
+
+**Integration with LiteLLM:**
+
+The orchestrator integrates with LiteLLM on the VPS for unified API access:
+- vLLM models (qwen-2.5-7b, llama-3.1-8b) available when orchestrator is running
+- Requests route through orchestrator (port 9000) which handles model loading
+- Environment variable `GPU_TAILSCALE_IP` (100.114.60.40) configures connection
+- LiteLLM config uses `os.environ/GPU_TAILSCALE_IP` syntax for dynamic IP
 
 ### Testing
 
diff --git a/model-orchestrator/orchestrator_subprocess.py b/model-orchestrator/orchestrator_subprocess.py
index 23ae90f..62109cb 100644
--- a/model-orchestrator/orchestrator_subprocess.py
+++ b/model-orchestrator/orchestrator_subprocess.py
@@ -102,11 +102,17 @@ async def start_model_process(model_name: str) -> bool:
     env.update({
         'HF_TOKEN': os.getenv('HF_TOKEN', ''),
         'PORT': str(port),
-        'HOST': '0.0.0.0'
+        'HOST': '0.0.0.0',
+        'MODEL_NAME': model_config.get('model_name', model_name)
     })
 
+    # Use venv python if it exists
+    script_dir = script_path.parent
+    venv_python = script_dir / 'venv' / 'bin' / 'python3'
+    python_cmd = str(venv_python) if venv_python.exists() else 'python3'
+
     proc = subprocess.Popen(
-        ['python3', str(script_path)],
+        [python_cmd, str(script_path)],
         env=env,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
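
The two mode-switching workflows added to CLAUDE.md can be wrapped in a small helper for convenience. The sketch below is not part of the patch: it only uses the `supervisorctl` and `nvidia-smi` commands documented above, and the script name `switch-mode.sh` is a hypothetical choice.

```bash
#!/usr/bin/env bash
# switch-mode.sh -- hypothetical helper wrapping the manual mode-switch steps.
# Usage: ./switch-mode.sh text|image
set -euo pipefail

mode="${1:?usage: $0 text|image}"

case "$mode" in
  text)
    # Text generation mode: free VRAM by stopping ComfyUI, then start the orchestrator.
    supervisorctl stop comfyui
    supervisorctl start orchestrator
    ;;
  image)
    # Image/video/audio mode: stop the orchestrator (and its vLLM models), then start ComfyUI.
    supervisorctl stop orchestrator
    supervisorctl start comfyui
    ;;
  *)
    echo "unknown mode: $mode" >&2
    exit 1
    ;;
esac

# Verify the switch: service states plus current VRAM usage.
supervisorctl status
nvidia-smi
```

Running `./switch-mode.sh text` frees the GPU for vLLM; `./switch-mode.sh image` hands it back to ComfyUI.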
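To show what the LiteLLM integration looks like from a client's point of view, here is a minimal sketch of calling one of the vLLM-backed models while the orchestrator is running. The model name `qwen-2.5-7b` comes from the section above; the proxy hostname and API key are placeholders, since the actual LiteLLM endpoint on the VPS is not specified in the patch. The OpenAI-compatible `/v1/chat/completions` route is standard LiteLLM proxy behaviour.

```bash
# Hypothetical client call through the VPS LiteLLM proxy.
# Replace the hostname and $LITELLM_API_KEY with the real deployment values.
curl -s https://litellm.example.invalid/v1/chat/completions \
  -H "Authorization: Bearer $LITELLM_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen-2.5-7b",
        "messages": [{"role": "user", "content": "Hello"}]
      }'
```

In ComfyUI mode this request would fail, because the orchestrator (and therefore the vLLM backend) is stopped.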