Files
runpod/core/service_manager.py
Sebastian Krüger 9ee626a78e feat: implement Ansible-based process architecture for RunPod
Major architecture overhaul to address RunPod Docker limitations:

Core Infrastructure:
- Add base_service.py: Abstract base class for all AI services
- Add service_manager.py: Process lifecycle management
- Add core/requirements.txt: Core dependencies

Model Services (Standalone Python):
- Add models/vllm/server.py: Qwen 2.5 7B text generation
- Add models/flux/server.py: Flux.1 Schnell image generation
- Add models/musicgen/server.py: MusicGen Medium music generation
- Each service inherits from GPUService base class
- OpenAI-compatible APIs
- Standalone execution support

Ansible Deployment:
- Add playbook.yml: Comprehensive deployment automation
- Add ansible.cfg: Ansible configuration
- Add inventory.yml: Localhost inventory
- Tags: base, python, dependencies, models, tailscale, validate, cleanup

Scripts:
- Add scripts/install.sh: Full installation wrapper
- Add scripts/download-models.sh: Model download wrapper
- Add scripts/start-all.sh: Start orchestrator
- Add scripts/stop-all.sh: Stop all services

Documentation:
- Update ARCHITECTURE.md: Document distributed VPS+GPU architecture

Benefits:
- No Docker: Avoids RunPod CAP_SYS_ADMIN limitations
- Fully reproducible via Ansible
- Extensible: Add models in 3 steps
- Direct Python execution (no container overhead)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-21 15:37:18 +01:00

302 lines
9.1 KiB
Python

#!/usr/bin/env python3
"""
Service Manager for AI Model Services
Manages lifecycle of model services running as Python processes:
- Start/stop services
- Health monitoring
- Auto-restart on failure
- Resource cleanup
"""
import asyncio
import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional
import httpx
@dataclass
class ServiceConfig:
    """Static settings describing one managed model service.

    Attributes:
        name: Key under which the service is registered and looked up.
        script_path: Python script that is launched to run the service.
        port: TCP port the service listens on; also used to build the
            health-check URL.
        startup_timeout: Seconds to wait for the first successful health
            check after the process has been spawned.
        health_check_path: URL path appended to ``http://localhost:<port>``
            when probing the service.
        auto_restart: Whether the monitor loop should restart the service
            when it dies or stops answering health checks.
        env: Extra environment variables merged over the manager's own
            environment when the service is launched (``None`` for none).
    """

    # Required settings (no defaults).
    name: str
    script_path: Path
    port: int

    # Optional settings.
    startup_timeout: int = 120
    health_check_path: str = "/health"
    auto_restart: bool = False
    env: Optional[Dict[str, str]] = None
class ServiceManager:
    """Manage multiple AI model services running as Python subprocesses.

    Each service is described by a ``ServiceConfig`` registered through
    :meth:`register_service`. Started processes are tracked in
    ``self.processes`` keyed by service name; an entry is removed again when
    the service is stopped. ``self.shutdown_event`` ends
    :meth:`monitor_services` once it is set.
    """

    # Seconds between polls while waiting for a signalled process to exit.
    _EXIT_POLL_INTERVAL = 0.1

    def __init__(self):
        self.logger = logging.getLogger("ServiceManager")
        self.processes: Dict[str, subprocess.Popen] = {}
        self.configs: Dict[str, "ServiceConfig"] = {}
        self.shutdown_event = asyncio.Event()

    def register_service(self, config: "ServiceConfig"):
        """Register (or replace) a service configuration under ``config.name``."""
        self.configs[config.name] = config
        self.logger.info(f"Registered service: {config.name} on port {config.port}")

    async def start_service(self, name: str) -> bool:
        """
        Start a service by name and wait until it reports healthy.

        Args:
            name: Service name to start

        Returns:
            bool: True if the service is already running or started and passed
            its health check; False if it is not registered, failed to spawn,
            or did not become healthy (in which case it is stopped again).
        """
        if name not in self.configs:
            self.logger.error(f"Service {name} not registered")
            return False

        existing = self.processes.get(name)
        if existing is not None and existing.poll() is None:
            self.logger.info(f"Service {name} already running")
            return True

        config = self.configs[name]
        self.logger.info(f"Starting service {name}...")

        proc = None
        try:
            # Child environment: ours, then per-service overrides, then the
            # listen address, which always wins.
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            env.update({
                'PORT': str(config.port),
                'HOST': '0.0.0.0'
            })

            # stdout/stderr are inherited on purpose: nothing in this class
            # reads from the child, so PIPE would block the service as soon as
            # the OS pipe buffer filled up.
            # start_new_session=True puts the child in its own session/process
            # group (so stop_service can signal the whole group) without the
            # thread-safety caveats of preexec_fn.
            proc = subprocess.Popen(
                ['python3', str(config.script_path)],
                env=env,
                start_new_session=True,
            )
            self.processes[name] = proc
            self.logger.info(f"Process started for {name} (PID: {proc.pid})")

            if await self._wait_for_health(name, config):
                self.logger.info(f"Service {name} is healthy and ready")
                return True

            self.logger.error(f"Service {name} failed health check")
            await self.stop_service(name)
            return False
        except Exception as e:
            self.logger.error(f"Error starting {name}: {e}", exc_info=True)
            # Do not leave a half-started process behind if something failed
            # after the spawn.
            if proc is not None and self.processes.get(name) is proc:
                await self.stop_service(name)
            return False

    async def _wait_for_health(self, name: str, config: "ServiceConfig") -> bool:
        """
        Poll the service's health endpoint until it answers 200 or times out.

        Args:
            name: Service name
            config: Service configuration

        Returns:
            bool: True if the service becomes healthy within
            ``config.startup_timeout`` seconds; False if it is not tracked,
            exits early, or the timeout elapses.
        """
        proc = self.processes.get(name)
        if proc is None:
            return False

        url = f"http://localhost:{config.port}{config.health_check_path}"
        # Monotonic clock: immune to wall-clock adjustments during startup.
        deadline = time.monotonic() + config.startup_timeout

        # One client for the whole wait instead of a new one per probe.
        async with httpx.AsyncClient() as client:
            while time.monotonic() < deadline:
                # No point probing a process that has already exited.
                if proc.poll() is not None:
                    self.logger.error(f"Process for {name} exited prematurely (code: {proc.returncode})")
                    return False

                try:
                    response = await client.get(url, timeout=5.0)
                    if response.status_code == 200:
                        return True
                except Exception as e:
                    # Expected while the server is still booting (connection
                    # refused, timeouts, ...): keep polling, but leave a trace.
                    self.logger.debug(f"Health check for {name} not ready yet: {e}")

                await asyncio.sleep(2)

        return False

    @staticmethod
    def _signal_group(proc, sig) -> None:
        """Send ``sig`` to the process group of ``proc``, tolerating a vanished process."""
        try:
            os.killpg(os.getpgid(proc.pid), sig)
        except ProcessLookupError:
            # The process is already gone; there is nothing left to signal.
            pass

    async def _wait_for_exit(self, proc, timeout: Optional[float]) -> bool:
        """Wait for ``proc`` to exit without blocking the event loop.

        Returns True once the process has exited, False if ``timeout`` seconds
        pass first. ``timeout=None`` waits indefinitely.
        """
        deadline = None if timeout is None else time.monotonic() + timeout
        while proc.poll() is None:
            if deadline is not None and time.monotonic() >= deadline:
                return False
            await asyncio.sleep(self._EXIT_POLL_INTERVAL)
        return True

    async def stop_service(self, name: str, timeout: int = 10):
        """
        Stop a running service

        Sends SIGTERM to the service's process group, waits up to ``timeout``
        seconds, then falls back to SIGKILL. The service is removed from the
        process registry in every case.

        Args:
            name: Service name
            timeout: Seconds to wait for graceful shutdown
        """
        # Remove the entry before the first await so the monitor cannot see
        # (and try to restart) a service that is in the middle of stopping.
        proc = self.processes.pop(name, None)
        if proc is None:
            self.logger.warning(f"Service {name} not in process registry")
            return

        if proc.poll() is not None:
            # Already exited; dropping the registry entry was all that was left.
            return

        self.logger.info(f"Stopping service {name}...")
        try:
            self._signal_group(proc, signal.SIGTERM)
            if await self._wait_for_exit(proc, timeout):
                self.logger.info(f"Service {name} stopped gracefully")
            else:
                self.logger.warning(f"Service {name} did not stop gracefully, forcing kill")
                self._signal_group(proc, signal.SIGKILL)
                await self._wait_for_exit(proc, None)
        except Exception as e:
            self.logger.error(f"Error stopping {name}: {e}", exc_info=True)

    async def restart_service(self, name: str) -> bool:
        """
        Restart a service

        Args:
            name: Service name

        Returns:
            bool: True if service restarted successfully
        """
        self.logger.info(f"Restarting service {name}...")
        await self.stop_service(name)
        await asyncio.sleep(2)  # Brief pause between stop and start
        return await self.start_service(name)

    async def check_health(self, name: str) -> bool:
        """
        Check if a service is healthy

        Args:
            name: Service name

        Returns:
            bool: True if the service's process is running and its health
            endpoint answers with HTTP 200 within 5 seconds
        """
        proc = self.processes.get(name)
        if proc is None or proc.poll() is not None:
            return False

        config = self.configs[name]
        url = f"http://localhost:{config.port}{config.health_check_path}"
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5.0)
                return response.status_code == 200
        except Exception:
            # Any failure to reach the endpoint counts as "unhealthy".
            return False

    async def monitor_services(self):
        """
        Monitor all services and auto-restart if configured

        Runs until ``shutdown_event`` is set, checking every 10 seconds. Only
        services with ``auto_restart`` enabled that are present in the process
        registry are considered; a service that was stopped, or whose restart
        failed, is no longer in the registry and is therefore left alone.
        """
        self.logger.info("Starting service monitor...")

        while not self.shutdown_event.is_set():
            # Iterate over a snapshot: register_service() may run while this
            # coroutine is suspended in one of the awaits below.
            for name, config in list(self.configs.items()):
                if self.shutdown_event.is_set():
                    # Do not resurrect services once shutdown was requested.
                    break
                if not config.auto_restart:
                    continue

                proc = self.processes.get(name)
                if proc is None:
                    continue

                if proc.poll() is not None:
                    self.logger.warning(f"Service {name} died (code: {proc.returncode}), restarting...")
                    await self.restart_service(name)
                elif not await self.check_health(name):
                    self.logger.warning(f"Service {name} unhealthy, restarting...")
                    await self.restart_service(name)

            # Sleep until the next pass, waking early on shutdown.
            try:
                await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                pass

        self.logger.info("Service monitor stopped")

    async def stop_all_services(self):
        """Stop all running services"""
        self.logger.info("Stopping all services...")
        # Copy the keys: stop_service() mutates the registry.
        for name in list(self.processes):
            await self.stop_service(name)
        self.logger.info("All services stopped")

    def get_service_status(self, name: str) -> Dict:
        """
        Get status information for a service

        Args:
            name: Service name

        Returns:
            dict: ``status`` is one of ``unknown`` (not registered),
            ``stopped`` (no tracked process), ``exited`` (with ``exit_code``)
            or ``running`` (with ``pid`` and ``port``)
        """
        if name not in self.configs:
            return {"status": "unknown", "error": "Service not registered"}

        proc = self.processes.get(name)
        if proc is None:
            return {"status": "stopped"}

        if proc.poll() is not None:
            return {
                "status": "exited",
                "exit_code": proc.returncode
            }

        return {
            "status": "running",
            "pid": proc.pid,
            "port": self.configs[name].port
        }

    def get_all_service_status(self) -> Dict:
        """
        Get status for all registered services

        Returns:
            dict: Service name -> status mapping
        """
        return {name: self.get_service_status(name) for name in self.configs}