# runpod/core/service_manager.py

#!/usr/bin/env python3
"""
Service Manager for AI Model Services

Manages lifecycle of model services running as Python processes:
- Start/stop services
- Health monitoring
- Auto-restart on failure
- Resource cleanup
"""

import asyncio
import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

import httpx


@dataclass
class ServiceConfig:
    """Configuration for a single managed service.

    One instance describes how ServiceManager launches a service script and
    where it probes that service's HTTP health endpoint.
    """
    # Key the service is registered and looked up under in ServiceManager.
    name: str
    # Script that is launched as ``python3 <script_path>``.
    script_path: Path
    # Exported to the child as the PORT environment variable and used to
    # build the health-check URL.
    port: int
    # Seconds to wait after launch for the first successful health check.
    startup_timeout: int = 120
    # Path appended to ``http://localhost:<port>`` when probing health.
    health_check_path: str = "/health"
    # When True, ServiceManager.monitor_services restarts the service if its
    # process exits or its health check fails.
    auto_restart: bool = False
    # Extra environment variables layered over a copy of the parent's
    # environment; PORT and HOST are applied afterwards and override these.
    env: Optional[Dict[str, str]] = None


class ServiceManager:
    """Manage AI model services running as Python subprocesses.

    Each registered service is launched as ``python3 <script_path>`` in its
    own session (and therefore its own process group), probed over HTTP until
    it reports healthy, and stopped by signalling the whole process group.

    Child processes inherit the manager's stdout/stderr. Their output is
    deliberately not captured through pipes: nothing in this class reads
    them, and an unread pipe blocks the child once the OS pipe buffer fills.

    All waiting done by the coroutines of this class is cooperative
    (``asyncio.sleep``), so other tasks on the event loop keep running while
    a service is starting or stopping.
    """

    # Seconds between polls while waiting for a signalled process to exit.
    _EXIT_POLL_INTERVAL = 0.05
    # Upper bound, in seconds, on the wait for a process to exit after SIGKILL.
    _KILL_WAIT_SECONDS = 5.0

    def __init__(self):
        self.logger = logging.getLogger("ServiceManager")
        # Service name -> process handle of the most recent launch.
        self.processes: Dict[str, subprocess.Popen] = {}
        # Service name -> registered configuration.
        self.configs: Dict[str, "ServiceConfig"] = {}
        # Set by the owner to make monitor_services() return.
        self.shutdown_event = asyncio.Event()

    def register_service(self, config: "ServiceConfig"):
        """Register a service configuration under ``config.name``.

        Registering the same name again replaces the previous configuration.
        """
        self.configs[config.name] = config
        self.logger.info(f"Registered service: {config.name} on port {config.port}")

    async def start_service(self, name: str) -> bool:
        """
        Start a service by name and wait until it passes its health check.

        Args:
            name: Service name to start

        Returns:
            bool: True if the service is running and healthy (including the
            case where it was already running), False if it is not
            registered, could not be launched, or never became healthy.
        """
        config = self.configs.get(name)
        if config is None:
            self.logger.error(f"Service {name} not registered")
            return False

        existing = self.processes.get(name)
        if existing is not None and existing.poll() is None:
            self.logger.info(f"Service {name} already running")
            return True

        self.logger.info(f"Starting service {name}...")

        proc = None
        try:
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            # Applied last so PORT/HOST always win over config.env.
            env.update({
                'PORT': str(config.port),
                'HOST': '0.0.0.0'
            })

            # stdout/stderr are inherited rather than piped: an unread PIPE
            # stalls the child once the OS pipe buffer is full.
            # start_new_session=True makes the child a session/process-group
            # leader (setsid) without the thread-unsafe preexec_fn hook.
            proc = subprocess.Popen(
                ['python3', str(config.script_path)],
                env=env,
                start_new_session=True,
            )

            self.processes[name] = proc
            self.logger.info(f"Process started for {name} (PID: {proc.pid})")

            if await self._wait_for_health(name, config):
                self.logger.info(f"Service {name} is healthy and ready")
                return True

            self.logger.error(f"Service {name} failed health check")
            await self.stop_service(name)
            return False

        except Exception as e:
            self.logger.error(f"Error starting {name}: {e}", exc_info=True)
            # Do not leave a half-started process behind in the registry.
            if proc is not None and self.processes.get(name) is proc:
                await self.stop_service(name)
            return False

    @staticmethod
    def _health_url(config: "ServiceConfig") -> str:
        """Build the local health-check URL for a service."""
        return f"http://localhost:{config.port}{config.health_check_path}"

    async def _wait_for_health(self, name: str, config: "ServiceConfig") -> bool:
        """
        Poll the service's health endpoint until it answers with HTTP 200.

        Args:
            name: Service name
            config: Service configuration

        Returns:
            bool: True if the service becomes healthy within
            ``config.startup_timeout`` seconds; False if it has no registered
            process, exits first, or the timeout elapses.
        """
        proc = self.processes.get(name)
        if proc is None:
            return False

        url = self._health_url(config)
        # Monotonic clock: the deadline must not move if the wall clock is adjusted.
        deadline = time.monotonic() + config.startup_timeout

        # One client for the whole wait instead of a new one per attempt.
        async with httpx.AsyncClient() as client:
            while time.monotonic() < deadline:
                if proc.poll() is not None:
                    self.logger.error(f"Process for {name} exited prematurely (code: {proc.returncode})")
                    return False

                try:
                    response = await client.get(url, timeout=5.0)
                    if response.status_code == 200:
                        return True
                except Exception as exc:
                    # Expected while the service is still booting; keep polling.
                    self.logger.debug(f"Health check for {name} not ready: {exc!r}")

                await asyncio.sleep(2)

        return False

    @staticmethod
    def _signal_group(proc: subprocess.Popen, sig: int) -> None:
        """Send ``sig`` to the process group of ``proc``, ignoring a vanished process."""
        try:
            os.killpg(os.getpgid(proc.pid), sig)
        except ProcessLookupError:
            # The process exited between the liveness check and the signal.
            pass

    async def _wait_for_exit(self, proc: subprocess.Popen, timeout: float) -> bool:
        """Wait up to ``timeout`` seconds for ``proc`` to exit without blocking the loop.

        Returns True if the process has exited, False if it is still running
        when the timeout elapses.
        """
        deadline = time.monotonic() + timeout
        while proc.poll() is None:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            await asyncio.sleep(min(self._EXIT_POLL_INTERVAL, remaining))
        return True

    async def stop_service(self, name: str, timeout: int = 10):
        """
        Stop a running service and remove it from the process registry.

        SIGTERM is sent to the service's process group; if the process has
        not exited after ``timeout`` seconds the group is sent SIGKILL.

        Args:
            name: Service name
            timeout: Seconds to wait for graceful shutdown
        """
        # Remove the entry before any await so that concurrently running
        # tasks (e.g. the monitor) never see a service that is mid-shutdown
        # and mistake it for one that died.
        proc = self.processes.pop(name, None)
        if proc is None:
            self.logger.warning(f"Service {name} not in process registry")
            return

        if proc.poll() is not None:
            # Already exited; nothing to signal.
            return

        self.logger.info(f"Stopping service {name}...")
        try:
            self._signal_group(proc, signal.SIGTERM)

            if await self._wait_for_exit(proc, timeout):
                self.logger.info(f"Service {name} stopped gracefully")
                return

            self.logger.warning(f"Service {name} did not stop gracefully, forcing kill")
            self._signal_group(proc, signal.SIGKILL)
            if not await self._wait_for_exit(proc, self._KILL_WAIT_SECONDS):
                self.logger.error(f"Service {name} (PID: {proc.pid}) still running after SIGKILL")

        except Exception as e:
            self.logger.error(f"Error stopping {name}: {e}", exc_info=True)

    async def restart_service(self, name: str) -> bool:
        """
        Restart a service: stop it, pause briefly, then start it again.

        Args:
            name: Service name

        Returns:
            bool: True if service restarted successfully
        """
        self.logger.info(f"Restarting service {name}...")
        await self.stop_service(name)
        await asyncio.sleep(2)  # Brief pause between stop and start
        return await self.start_service(name)

    async def check_health(self, name: str) -> bool:
        """
        Check if a service is healthy.

        Args:
            name: Service name

        Returns:
            bool: True if the service's process is running and its health
            endpoint answers with HTTP 200; False otherwise, including when
            the request fails for any reason.
        """
        proc = self.processes.get(name)
        if proc is None or proc.poll() is not None:
            return False

        url = self._health_url(self.configs[name])

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5.0)
                return response.status_code == 200
        except Exception as exc:
            self.logger.debug(f"Health check for {name} failed: {exc!r}")
            return False

    async def monitor_services(self):
        """
        Monitor all services and auto-restart those configured for it.

        Every 10 seconds, each service with ``auto_restart`` enabled that has
        an entry in the process registry is restarted if its process has
        exited or its health check fails. Runs until ``shutdown_event`` is
        set. A service whose restart fails drops out of the registry and is
        not retried until it is started again explicitly.
        """
        self.logger.info("Starting service monitor...")

        while not self.shutdown_event.is_set():
            # Iterate over a snapshot: register_service() may be called while
            # this coroutine is suspended in one of the awaits below.
            for name, config in list(self.configs.items()):
                if self.shutdown_event.is_set():
                    # Never resurrect services once shutdown was requested.
                    break
                if not config.auto_restart:
                    continue

                proc = self.processes.get(name)
                if proc is None:
                    continue

                if proc.poll() is not None:
                    self.logger.warning(f"Service {name} died (code: {proc.returncode}), restarting...")
                elif not await self.check_health(name):
                    # The health check yielded control; skip the restart if
                    # the service was stopped or replaced in the meantime.
                    if self.shutdown_event.is_set() or self.processes.get(name) is not proc:
                        continue
                    self.logger.warning(f"Service {name} unhealthy, restarting...")
                else:
                    continue

                if not await self.restart_service(name):
                    self.logger.error(f"Auto-restart of service {name} failed")

            # Wait before next check, waking early on shutdown.
            try:
                await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                pass

        self.logger.info("Service monitor stopped")

    async def stop_all_services(self):
        """Stop every service currently in the process registry, one at a time."""
        self.logger.info("Stopping all services...")
        # Snapshot the names: stop_service() mutates the registry.
        for name in list(self.processes):
            await self.stop_service(name)
        self.logger.info("All services stopped")

    def get_service_status(self, name: str) -> Dict:
        """
        Get status information for a service.

        Args:
            name: Service name

        Returns:
            dict: ``{"status": ...}`` where status is ``"unknown"`` (not
            registered, with an ``"error"`` message), ``"stopped"`` (no
            process in the registry), ``"exited"`` (with ``"exit_code"``) or
            ``"running"`` (with ``"pid"`` and ``"port"``).
        """
        config = self.configs.get(name)
        if config is None:
            return {"status": "unknown", "error": "Service not registered"}

        proc = self.processes.get(name)
        if proc is None:
            return {"status": "stopped"}

        if proc.poll() is not None:
            return {
                "status": "exited",
                "exit_code": proc.returncode
            }

        return {
            "status": "running",
            "pid": proc.pid,
            "port": config.port
        }

    def get_all_service_status(self) -> Dict:
        """
        Get status for all registered services.

        Returns:
            dict: Service name -> status mapping
        """
        return {name: self.get_service_status(name) for name in self.configs}