#!/usr/bin/env python3
"""
Service Manager for AI Model Services

Manages lifecycle of model services running as Python processes:
- Start/stop services
- Health monitoring
- Auto-restart on failure
- Resource cleanup
"""

import asyncio
import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

try:
    import httpx
except ImportError:  # pragma: no cover - only hit when httpx is not installed
    # httpx is needed for HTTP health probes only.  Registration, status
    # reporting and stopping work without it; a probe attempt raises a clear
    # RuntimeError instead (see ServiceManager._probe_health).
    httpx = None


@dataclass
class ServiceConfig:
    """Configuration for a service

    Attributes:
        name: Unique key under which the service is registered.
        script_path: Python script launched as ``python3 <script_path>``.
        port: Port exported to the child as ``PORT`` and used for the
            health-check URL.
        startup_timeout: Seconds to wait for the first successful health check.
        health_check_path: URL path appended to ``http://localhost:<port>``.
        auto_restart: Whether ``ServiceManager.monitor_services`` restarts the
            service when it dies or turns unhealthy.
        env: Extra environment variables for the child (``PORT`` and ``HOST``
            set by the manager take precedence).
    """

    name: str
    script_path: Path
    port: int
    startup_timeout: int = 120
    health_check_path: str = "/health"
    auto_restart: bool = False
    env: Optional[Dict[str, str]] = None


class ServiceManager:
    """Manages multiple AI model services as subprocesses"""

    def __init__(self) -> None:
        self.logger = logging.getLogger("ServiceManager")
        # Service name -> process handle of the most recently started child.
        self.processes: Dict[str, subprocess.Popen] = {}
        # Service name -> registered configuration.
        self.configs: Dict[str, ServiceConfig] = {}
        # Set by the owner of the manager to make monitor_services() return.
        self.shutdown_event = asyncio.Event()

    def register_service(self, config: ServiceConfig) -> None:
        """Register a service configuration

        A later registration under the same name replaces the earlier one.

        Args:
            config: Service configuration to store under ``config.name``
        """
        self.configs[config.name] = config
        self.logger.info(f"Registered service: {config.name} on port {config.port}")

    async def start_service(self, name: str) -> bool:
        """
        Start a service by name

        Spawns the service script in its own session/process group and waits
        until its health endpoint answers 200.  If the service does not become
        healthy, or an error occurs after the process was spawned, the process
        is stopped again so that a False return never leaves a child behind.

        Args:
            name: Service name to start

        Returns:
            bool: True if service started successfully (or was already running)
        """
        if name not in self.configs:
            self.logger.error(f"Service {name} not registered")
            return False

        existing = self.processes.get(name)
        if existing is not None and existing.poll() is None:
            self.logger.info(f"Service {name} already running")
            return True

        config = self.configs[name]
        self.logger.info(f"Starting service {name}...")

        proc = None
        try:
            # Prepare environment: parent env, then per-service overrides, then
            # the manager-controlled PORT/HOST which always win.
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            env.update({
                'PORT': str(config.port),
                'HOST': '0.0.0.0',
            })

            # Output is discarded: nothing in this manager reads the child's
            # stdout/stderr, and an unread PIPE would block the child as soon
            # as the OS pipe buffer fills up.
            # start_new_session=True puts the child in its own session and
            # process group (what preexec_fn=os.setsid did) without running
            # Python code between fork and exec, which is unsafe with threads.
            proc = subprocess.Popen(
                ['python3', str(config.script_path)],
                env=env,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                start_new_session=True,
            )
            self.processes[name] = proc
            self.logger.info(f"Process started for {name} (PID: {proc.pid})")

            # Wait for health check
            if await self._wait_for_health(name, config):
                self.logger.info(f"Service {name} is healthy and ready")
                return True

            self.logger.error(f"Service {name} failed health check")
            await self.stop_service(name)
            return False

        except Exception as e:
            self.logger.error(f"Error starting {name}: {e}", exc_info=True)
            if proc is not None:
                # The child was spawned but start-up did not complete.
                await self.stop_service(name)
            return False

    async def _probe_health(self, config: ServiceConfig) -> bool:
        """
        Send one GET request to the service's health endpoint

        Args:
            config: Service configuration providing port and health path

        Returns:
            bool: True if the endpoint answered with HTTP 200; False on any
            other status or on any request error

        Raises:
            RuntimeError: If httpx is not installed
        """
        if httpx is None:
            raise RuntimeError("httpx is required for service health checks")

        url = f"http://localhost:{config.port}{config.health_check_path}"
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5.0)
        except Exception as exc:
            # Connection refused / timeouts are expected while a service is
            # still booting, so this is reported at debug level only.
            self.logger.debug(f"Health probe for {config.name} at {url} failed: {exc}")
            return False
        return response.status_code == 200

    async def _wait_for_health(self, name: str, config: ServiceConfig) -> bool:
        """
        Wait for service to become healthy

        Polls the health endpoint every 2 seconds until it succeeds, the
        process exits, or ``config.startup_timeout`` seconds have passed.

        Args:
            name: Service name
            config: Service configuration

        Returns:
            bool: True if service becomes healthy within timeout
        """
        proc = self.processes.get(name)
        if proc is None:
            return False

        # Monotonic clock: the timeout must not be affected by wall-clock jumps.
        deadline = time.monotonic() + config.startup_timeout

        while time.monotonic() < deadline:
            # Check if process is still running
            if proc.poll() is not None:
                self.logger.error(
                    f"Process for {name} exited prematurely (code: {proc.returncode})"
                )
                return False

            if await self._probe_health(config):
                return True

            await asyncio.sleep(2)

        return False

    async def stop_service(self, name: str, timeout: int = 10) -> None:
        """
        Stop a running service

        Sends SIGTERM to the service's process group, waits up to ``timeout``
        seconds and escalates to SIGKILL if the process is still alive.  The
        blocking waits run in the default executor so the event loop keeps
        serving other tasks meanwhile.  The service is removed from the
        process registry afterwards, whether or not it was still running.

        Args:
            name: Service name
            timeout: Seconds to wait for graceful shutdown
        """
        proc = self.processes.get(name)
        if proc is None:
            self.logger.warning(f"Service {name} not in process registry")
            return

        if proc.poll() is None:  # Still running
            self.logger.info(f"Stopping service {name}...")
            loop = asyncio.get_running_loop()

            try:
                # Send SIGTERM to process group
                os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

                # Wait for graceful shutdown
                try:
                    await loop.run_in_executor(None, proc.wait, timeout)
                    self.logger.info(f"Service {name} stopped gracefully")
                except subprocess.TimeoutExpired:
                    # Force kill if not terminated
                    self.logger.warning(
                        f"Service {name} did not stop gracefully, forcing kill"
                    )
                    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
                    await loop.run_in_executor(None, proc.wait)

            except ProcessLookupError:
                # The process disappeared between the liveness check and the
                # signal; there is nothing left to stop.
                self.logger.info(f"Service {name} exited before it could be signalled")
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}", exc_info=True)

        # Another task may have replaced or removed the entry while this one
        # was awaiting the shutdown; only drop the entry for *this* process.
        if self.processes.get(name) is proc:
            del self.processes[name]

    async def restart_service(self, name: str) -> bool:
        """
        Restart a service

        Args:
            name: Service name

        Returns:
            bool: True if service restarted successfully
        """
        self.logger.info(f"Restarting service {name}...")
        await self.stop_service(name)
        await asyncio.sleep(2)  # Brief pause between stop and start
        return await self.start_service(name)

    async def check_health(self, name: str) -> bool:
        """
        Check if a service is healthy

        Args:
            name: Service name

        Returns:
            bool: True if service is running and healthy
        """
        proc = self.processes.get(name)
        if proc is None or proc.poll() is not None:
            return False

        return await self._probe_health(self.configs[name])

    async def monitor_services(self) -> None:
        """
        Monitor all services and auto-restart if configured

        Every 10 seconds, each registered service with ``auto_restart`` enabled
        that has an entry in the process registry is checked; it is restarted
        when its process has exited or its health check fails.

        This runs continuously until shutdown_event is set.
        """
        self.logger.info("Starting service monitor...")

        while not self.shutdown_event.is_set():
            # Snapshot: register_service() may run while this loop is
            # suspended in an await, and mutating a dict during iteration
            # raises RuntimeError.
            for name, config in list(self.configs.items()):
                if self.shutdown_event.is_set():
                    break
                if not config.auto_restart:
                    continue

                # Only services that have been started are supervised.
                # NOTE(review): a failed restart removes the service from the
                # registry (start_service stops it), so it is not retried on
                # later cycles - confirm whether that is intended.
                proc = self.processes.get(name)
                if proc is None:
                    continue

                try:
                    if proc.poll() is not None:
                        self.logger.warning(
                            f"Service {name} died (code: {proc.returncode}), restarting..."
                        )
                        await self.restart_service(name)
                    elif not await self.check_health(name):
                        self.logger.warning(f"Service {name} unhealthy, restarting...")
                        await self.restart_service(name)
                except Exception as exc:
                    # One misbehaving service must not take the monitor down.
                    self.logger.error(
                        f"Monitor check for {name} failed: {exc}", exc_info=True
                    )

            # Wait before next check, waking up early on shutdown
            try:
                await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                pass

        self.logger.info("Service monitor stopped")

    async def stop_all_services(self) -> None:
        """Stop all running services"""
        self.logger.info("Stopping all services...")

        # Copy the names: stop_service() removes entries from the registry.
        for name in list(self.processes):
            await self.stop_service(name)

        self.logger.info("All services stopped")

    def get_service_status(self, name: str) -> Dict:
        """
        Get status information for a service

        Args:
            name: Service name

        Returns:
            dict: Status information; ``status`` is one of ``unknown`` (not
            registered), ``stopped`` (no process entry), ``exited`` (with
            ``exit_code``) or ``running`` (with ``pid`` and ``port``)
        """
        if name not in self.configs:
            return {"status": "unknown", "error": "Service not registered"}

        proc = self.processes.get(name)
        if proc is None:
            return {"status": "stopped"}

        if proc.poll() is not None:
            return {
                "status": "exited",
                "exit_code": proc.returncode,
            }

        return {
            "status": "running",
            "pid": proc.pid,
            "port": self.configs[name].port,
        }

    def get_all_service_status(self) -> Dict:
        """
        Get status for all registered services

        Returns:
            dict: Service name -> status mapping
        """
        return {name: self.get_service_status(name) for name in self.configs}