From 664da9f4ead2d2386865d3912c60aad1716e6316 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Sat, 22 Nov 2025 09:22:16 +0100 Subject: [PATCH] feat: add Supervisor process manager for service management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add supervisord.conf with ComfyUI and orchestrator services - Update Ansible playbook with supervisor installation tag - Rewrite start-all.sh and stop-all.sh to use Supervisor - Add status.sh script for checking service status - Update arty.yml with supervisor commands and shortcuts - Update CLAUDE.md with Supervisor documentation and troubleshooting - Services now auto-restart on crashes with centralized logging Benefits: - Better process control than manual pkill/background jobs - Auto-restart on service crashes - Centralized log management in /workspace/logs/ - Web interface for monitoring (port 9001) - Works perfectly in RunPod containers (no systemd needed) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- CLAUDE.md | 71 +++++++++++++++++++++++++++++++++++--------- arty.yml | 10 ++++++- playbook.yml | 50 +++++++++++++++++++++++++++++++ scripts/start-all.sh | 58 +++++++++++++++++++++++++++++++----- scripts/status.sh | 47 +++++++++++++++++++++++++++++ scripts/stop-all.sh | 39 ++++++++++++++++++++---- supervisord.conf | 60 +++++++++++++++++++++++++++++++++++++ 7 files changed, 306 insertions(+), 29 deletions(-) create mode 100644 scripts/status.sh create mode 100644 supervisord.conf diff --git a/CLAUDE.md b/CLAUDE.md index 90f98d4..c12bfe1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -102,22 +102,48 @@ ansible-playbook playbook.yml --tags base,python,dependencies ### Service Management +This project uses **Supervisor** for process management, providing auto-restart, centralized logging, and easy service control. 
+ ```bash -# Start orchestrator (runs in foreground) -bash scripts/start-all.sh -# Or directly: -python3 model-orchestrator/orchestrator_subprocess.py +# Start all services (with Supervisor) +bash scripts/start-all.sh # Starts supervisor daemon + services +arty run services/start # Same via arty # Stop all services -bash scripts/stop-all.sh +bash scripts/stop-all.sh # Stops all services + supervisor +arty run services/stop # Same via arty -# Stop orchestrator only -pkill -f orchestrator_subprocess.py +# Check service status +bash scripts/status.sh # Show all service status +arty run services/status # Same via arty +supervisorctl status # Direct supervisor command -# Stop specific model service -pkill -f "models/vllm/server.py" +# Individual service control +supervisorctl start orchestrator # Start orchestrator +supervisorctl restart comfyui # Restart ComfyUI +supervisorctl stop orchestrator # Stop orchestrator +arty run services/restart-comfyui # Restart ComfyUI via arty + +# View logs +supervisorctl tail -f comfyui # Follow ComfyUI logs +supervisorctl tail -f orchestrator # Follow orchestrator logs +arty run services/logs # Follow ComfyUI logs via arty + +# Web interface +# Access at http://localhost:9001 (username: admin, password: runpod2024) ``` +**Supervisor Configuration:** +- Config file: `/workspace/supervisord.conf` +- Log directory: `/workspace/logs/` +- PID file: `/workspace/supervisord.pid` +- Socket: `/workspace/supervisor.sock` + +**Services managed:** +- `comfyui` - ComfyUI server (port 8188, autostart enabled) +- `orchestrator` - Model orchestrator (port 9000, autostart disabled) + + ### Testing ```bash @@ -167,6 +193,7 @@ curl -X POST http://localhost:9000/v1/images/generations \ **Infrastructure:** - `tailscale` - Install Tailscale VPN client +- `supervisor` - Install and configure Supervisor process manager - `systemd` - Configure systemd services (use `never` - not for RunPod) - `validate` - Health checks (use `never` - run explicitly) @@ -352,8 
+379,10 @@ runpod/ │ ├── start.sh # ComfyUI startup script │ └── requirements.txt ├── scripts/ -│ ├── start-all.sh # Start orchestrator -│ └── stop-all.sh # Stop all services +│ ├── start-all.sh # Start all services with Supervisor +│ ├── stop-all.sh # Stop all services +│ └── status.sh # Check service status +├── supervisord.conf # Supervisor process manager config ├── arty.yml # Arty repository manager config ├── playbook.yml # Ansible provisioning playbook ├── inventory.yml # Ansible inventory (localhost) @@ -389,10 +418,24 @@ runpod/ - Use essential tags: `--tags comfyui-essential` (~80GB vs ~137GB) - Clear cache: `rm -rf /workspace/huggingface_cache` +### Supervisor not running +- Check status: `bash scripts/status.sh` +- View logs: `cat /workspace/logs/supervisord.log` +- Start supervisor: `bash scripts/start-all.sh` +- Check for stale PID: `rm -f /workspace/supervisord.pid` then restart + +### Service won't start +- Check supervisor status: `supervisorctl status` +- View service logs: `supervisorctl tail -f comfyui` or `supervisorctl tail -f orchestrator` +- Check error logs: `cat /workspace/logs/comfyui.err.log` +- Restart service: `supervisorctl restart comfyui` +- Check if port is in use: `ss -tulpn | grep :8188` + ### Orchestrator not responding -- Check process: `ps aux | grep orchestrator` -- View logs: Check terminal output where orchestrator was started -- Restart: `bash scripts/stop-all.sh && bash scripts/start-all.sh` +- Check supervisor status: `supervisorctl status orchestrator` +- View logs: `supervisorctl tail -f orchestrator` or `cat /workspace/logs/orchestrator.err.log` +- Restart: `supervisorctl restart orchestrator` +- Manual start for debugging: `cd /workspace/ai && python3 model-orchestrator/orchestrator_subprocess.py` ## Performance Notes diff --git a/arty.yml b/arty.yml index c9a0560..d8c31f1 100644 --- a/arty.yml +++ b/arty.yml @@ -127,10 +127,17 @@ scripts: ln -sf 
/workspace/huggingface_cache/models--stabilityai--stable-video-diffusion-img2vid-xt stable-video-diffusion-img2vid-xt echo "Models linked to ComfyUI" - # Service management + # Service management (Supervisor-based) services/start: bash /workspace/ai/scripts/start-all.sh services/stop: bash /workspace/ai/scripts/stop-all.sh services/restart: bash /workspace/ai/scripts/stop-all.sh && bash /workspace/ai/scripts/start-all.sh + services/status: bash /workspace/ai/scripts/status.sh + services/logs: supervisorctl -c /workspace/supervisord.conf tail -f comfyui + services/logs-orchestrator: supervisorctl -c /workspace/supervisord.conf tail -f orchestrator + services/restart-comfyui: supervisorctl -c /workspace/supervisord.conf restart comfyui + services/restart-orchestrator: supervisorctl -c /workspace/supervisord.conf restart orchestrator + services/start-orchestrator: supervisorctl -c /workspace/supervisord.conf start orchestrator + services/stop-orchestrator: supervisorctl -c /workspace/supervisord.conf stop orchestrator # Dependency installation deps/comfyui-nodes: | @@ -141,6 +148,7 @@ scripts: # Ansible provisioning shortcuts ansible/base: cd /workspace/ai && ansible-playbook playbook.yml --tags base,python,dependencies + ansible/supervisor: cd /workspace/ai && ansible-playbook playbook.yml --tags supervisor ansible/vllm: cd /workspace/ai && ansible-playbook playbook.yml --tags models ansible/comfyui: cd /workspace/ai && ansible-playbook playbook.yml --tags comfyui,comfyui-essential ansible/comfyui-all: cd /workspace/ai && ansible-playbook playbook.yml --tags comfyui,comfyui-models-all,comfyui-nodes diff --git a/playbook.yml b/playbook.yml index 64c2704..f71ecc1 100644 --- a/playbook.yml +++ b/playbook.yml @@ -26,6 +26,7 @@ # comfyui-nodes - Install essential custom nodes # comfyui-essential - Quick setup (ComfyUI + essential models only) # tailscale - Install and configure Tailscale +# supervisor - Install and configure Supervisor process manager # systemd - 
Configure systemd services # validate - Health checks and validation # @@ -755,6 +756,55 @@ Note: Authentication requires manual intervention via provided URL + # + # Supervisor Process Manager + # + - name: Install and configure Supervisor + tags: [supervisor] + block: + - name: Install Supervisor + pip: + name: supervisor + executable: pip3 + become: true + + - name: Create logs directory + file: + path: "{{ workspace_dir }}/logs" + state: directory + mode: '0755' + + - name: Deploy supervisord configuration + copy: + src: "{{ ai_dir }}/supervisord.conf" + dest: "{{ workspace_dir }}/supervisord.conf" + mode: '0644' + + - name: Display Supervisor setup instructions + debug: + msg: | + ✓ Supervisor installed successfully! + + Configuration: {{ workspace_dir }}/supervisord.conf + Logs: {{ workspace_dir }}/logs/ + + Services configured: + - comfyui: ComfyUI server (port 8188) - autostart enabled + - orchestrator: Model orchestrator (port 9000) - autostart disabled + + To start Supervisor: + supervisord -c {{ workspace_dir }}/supervisord.conf + + To manage services: + supervisorctl status # Check service status + supervisorctl start orchestrator # Start orchestrator + supervisorctl restart comfyui # Restart ComfyUI + supervisorctl stop all # Stop all services + supervisorctl tail -f comfyui # Follow ComfyUI logs + + Web interface: + http://localhost:9001 (username: admin, password: runpod2024) + # # Systemd Services (Optional) # diff --git a/scripts/start-all.sh b/scripts/start-all.sh index ffed9e4..ce381ce 100644 --- a/scripts/start-all.sh +++ b/scripts/start-all.sh @@ -1,15 +1,19 @@ #!/bin/bash # -# Start AI Orchestrator -# Starts the model orchestrator which manages all AI services +# Start AI Services with Supervisor +# Starts supervisor daemon which manages ComfyUI and orchestrator # set -e -cd "$(dirname "$0")/.." 
+WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}" +SUPERVISORD_CONF="${WORKSPACE_DIR}/supervisord.conf" +AI_DIR="${WORKSPACE_DIR}/ai" + +cd "${AI_DIR}" echo "=========================================" -echo " Starting AI Orchestrator" +echo " Starting AI Services with Supervisor" echo "=========================================" echo "" @@ -27,9 +31,47 @@ if [ -f .env ]; then set +a fi -# Start orchestrator -echo "Starting orchestrator on port 9000..." -python3 model-orchestrator/orchestrator_subprocess.py +# Check if supervisord is already running ("status" exits non-zero when any program is not RUNNING, so it is guarded against set -e) +if [ -f "${WORKSPACE_DIR}/supervisord.pid" ]; then + PID=$(cat "${WORKSPACE_DIR}/supervisord.pid") + if ps -p "$PID" > /dev/null 2>&1; then + echo "Supervisor is already running (PID: $PID)" + echo "" + echo "Checking service status..." + supervisorctl -c "${SUPERVISORD_CONF}" status || true + exit 0 + else + echo "Removing stale PID file..." + rm -f "${WORKSPACE_DIR}/supervisord.pid" + fi +fi + +# Start supervisord +echo "Starting Supervisor daemon..." +supervisord -c "${SUPERVISORD_CONF}" + +# Wait a moment for supervisor to start +sleep 2 + +# Check status (exits 3 while orchestrator is STOPPED because autostart=false; must not abort the script under set -e) +echo "" +echo "Service Status:" +echo "---------------" +supervisorctl -c "${SUPERVISORD_CONF}" status || true echo "" -echo "Orchestrator stopped" +echo "=========================================" +echo "Services started successfully!" 
+echo "=========================================" +echo "" +echo "Useful commands:" +echo " supervisorctl status - Check status" +echo " supervisorctl start orchestrator - Start orchestrator" +echo " supervisorctl restart comfyui - Restart ComfyUI" +echo " supervisorctl stop all - Stop all services" +echo " supervisorctl tail -f comfyui - Follow ComfyUI logs" +echo "" +echo "Web interface: http://localhost:9001" +echo " Username: admin" +echo " Password: runpod2024" +echo "" diff --git a/scripts/status.sh b/scripts/status.sh new file mode 100644 index 0000000..3b1dfa8 --- /dev/null +++ b/scripts/status.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# +# Check AI Services Status +# Shows status of all services managed by Supervisor +# + +WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}" +SUPERVISORD_CONF="${WORKSPACE_DIR}/supervisord.conf" + +echo "=========================================" +echo " AI Services Status" +echo "=========================================" +echo "" + +# Check if supervisord is running +if [ ! -f "${WORKSPACE_DIR}/supervisord.pid" ]; then + echo "❌ Supervisor is not running" + echo "" + echo "To start services, run:" + echo " bash scripts/start-all.sh" + exit 1 +fi + +PID=$(cat "${WORKSPACE_DIR}/supervisord.pid") +if ! 
ps -p "$PID" > /dev/null 2>&1; then + echo "❌ Supervisor PID file exists but process is not running" + echo "" + echo "To start services, run:" + echo " bash scripts/start-all.sh" + exit 1 +fi + +echo "✅ Supervisor is running (PID: $PID)" +echo "" + +# Show service status +echo "Service Status:" +echo "---------------" +supervisorctl -c "${SUPERVISORD_CONF}" status + +echo "" +echo "Useful commands:" +echo " supervisorctl start orchestrator - Start orchestrator" +echo " supervisorctl restart comfyui - Restart ComfyUI" +echo " supervisorctl stop all - Stop all services" +echo " supervisorctl tail -f comfyui - Follow ComfyUI logs" +echo "" diff --git a/scripts/stop-all.sh b/scripts/stop-all.sh index d585a67..31fd6e3 100644 --- a/scripts/stop-all.sh +++ b/scripts/stop-all.sh @@ -1,22 +1,49 @@ #!/bin/bash # # Stop AI Services -# Gracefully stops all running AI services +# Gracefully stops all services managed by Supervisor # set -e +WORKSPACE_DIR="${WORKSPACE_DIR:-/workspace}" +SUPERVISORD_CONF="${WORKSPACE_DIR}/supervisord.conf" + echo "=========================================" echo " Stopping AI Services" echo "=========================================" echo "" -# Kill orchestrator and model processes -echo "Stopping orchestrator..." -pkill -f "orchestrator_subprocess.py" || echo "Orchestrator not running" +# Check if supervisord is running +if [ ! -f "${WORKSPACE_DIR}/supervisord.pid" ]; then + echo "Supervisor is not running (no PID file found)" + echo "Cleaning up any stray processes..." + pkill -f "orchestrator_subprocess.py" || echo " - Orchestrator not running" + pkill -f "ComfyUI.*main.py" || echo " - ComfyUI not running" + echo "" + echo "All services stopped" + exit 0 +fi -echo "Stopping model services..." -pkill -f "models/vllm/server.py" || echo "vLLM not running" +PID=$(cat "${WORKSPACE_DIR}/supervisord.pid") +if ! ps -p "$PID" > /dev/null 2>&1; then + echo "Supervisor PID file exists but process is not running" + echo "Removing stale PID file..." 
+ rm -f "${WORKSPACE_DIR}/supervisord.pid" + echo "" + echo "All services stopped" + exit 0 +fi + +# Stop all supervised services +echo "Stopping all supervised services..." +supervisorctl -c "${SUPERVISORD_CONF}" stop all + +sleep 2 + +# Shutdown supervisord +echo "Shutting down Supervisor daemon..." +supervisorctl -c "${SUPERVISORD_CONF}" shutdown echo "" echo "All services stopped" diff --git a/supervisord.conf b/supervisord.conf new file mode 100644 index 0000000..00b0fdd --- /dev/null +++ b/supervisord.conf @@ -0,0 +1,60 @@ +[supervisord] +logfile=/workspace/logs/supervisord.log +pidfile=/workspace/supervisord.pid +childlogdir=/workspace/logs +nodaemon=false +loglevel=info + +[unix_http_server] +file=/workspace/supervisor.sock +chmod=0700 + +[supervisorctl] +serverurl=unix:///workspace/supervisor.sock + +[rpcinterface:supervisor] +supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface + +# Web interface for monitoring (localhost only) +[inet_http_server] +port=127.0.0.1:9001 +username=admin +password=runpod2024 + +# ComfyUI Server +[program:comfyui] +command=bash /workspace/ai/models/comfyui/start.sh +directory=/workspace/ComfyUI +autostart=true +autorestart=true +startretries=3 +stderr_logfile=/workspace/logs/comfyui.err.log +stdout_logfile=/workspace/logs/comfyui.out.log +stdout_logfile_maxbytes=50MB +stdout_logfile_backups=10 +stderr_logfile_maxbytes=50MB +stderr_logfile_backups=10 +environment=HF_HOME="/workspace/huggingface_cache",PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" +priority=100 +stopwaitsecs=30 + +# AI Model Orchestrator +[program:orchestrator] +command=python3 model-orchestrator/orchestrator_subprocess.py +directory=/workspace/ai +autostart=false +autorestart=true +startretries=3 +stderr_logfile=/workspace/logs/orchestrator.err.log +stdout_logfile=/workspace/logs/orchestrator.out.log +stdout_logfile_maxbytes=50MB +stdout_logfile_backups=10 +stderr_logfile_maxbytes=50MB +stderr_logfile_backups=10 
+environment=HF_HOME="/workspace/huggingface_cache" +priority=200 +stopwaitsecs=30 + +# HF_TOKEN is inherited from supervisord's own environment; an explicit ENV_HF_TOKEN expansion aborts config loading when the variable is unset. +# Programs are deliberately not placed in a [group:...] section: grouped programs must be addressed as +# "group:name", which would break the bare "supervisorctl restart comfyui" style commands used by the scripts and docs.