feat: implement Ansible-based process architecture for RunPod

Major architecture overhaul to address RunPod Docker limitations:

Core Infrastructure:
- Add base_service.py: Abstract base class for all AI services
- Add service_manager.py: Process lifecycle management
- Add core/requirements.txt: Core dependencies

Model Services (Standalone Python):
- Add models/vllm/server.py: Qwen 2.5 7B text generation
- Add models/flux/server.py: Flux.1 Schnell image generation
- Add models/musicgen/server.py: MusicGen Medium music generation
- Each service inherits from GPUService base class
- OpenAI-compatible APIs
- Standalone execution support
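
A minimal sketch of what one of these servers might look like, building on the GPUService base class from core/base_service.py below (the real models/*/server.py files are not reproduced in this view, so the module path, route name, request fields, and stub response are illustrative assumptions; core/ is assumed to be importable as a package):

    #!/usr/bin/env python3
    # Hypothetical models/example/server.py -- NOT one of the real servers in this commit;
    # endpoint name, request fields, and the echoed response are assumptions for illustration.
    import os

    from pydantic import BaseModel

    from core.base_service import GPUService


    class CompletionRequest(BaseModel):
        prompt: str
        max_tokens: int = 256


    class ExampleTextService(GPUService):
        def create_app(self):
            @self.app.post("/v1/completions")
            async def completions(request: CompletionRequest):
                # A real server would run inference here; this stub just echoes the prompt.
                return {"choices": [{"text": request.prompt[: request.max_tokens]}]}


    if __name__ == "__main__":
        # Standalone execution: the service manager passes PORT/HOST via the environment.
        ExampleTextService(
            name="example-text",
            port=int(os.environ.get("PORT", "8001")),
            host=os.environ.get("HOST", "0.0.0.0"),
        ).run()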

Ansible Deployment:
- Add playbook.yml: Comprehensive deployment automation
- Add ansible.cfg: Ansible configuration
- Add inventory.yml: Localhost inventory
- Tags: base, python, dependencies, models, tailscale, validate, cleanup

Scripts:
- Add scripts/install.sh: Full installation wrapper
- Add scripts/download-models.sh: Model download wrapper
- Add scripts/start-all.sh: Start orchestrator
- Add scripts/stop-all.sh: Stop all services

Documentation:
- Update ARCHITECTURE.md: Document distributed VPS+GPU architecture

Benefits:
- No Docker: Avoids RunPod CAP_SYS_ADMIN limitations
- Fully reproducible via Ansible
- Extensible: Add models in 3 steps
- Direct Python execution (no container overhead)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 9ee626a78e
parent 03a430894d
Date: 2025-11-21 15:37:18 +01:00
17 changed files with 1817 additions and 5 deletions

core/base_service.py (new file, 166 lines)

#!/usr/bin/env python3
"""
Base Service Class for AI Model Services

Provides common functionality for all model services:
- Health check endpoint
- Graceful shutdown handling
- Logging configuration
- Standard FastAPI setup
"""
import asyncio
import logging
import os
import signal
import sys
from abc import ABC, abstractmethod
from typing import Optional

from fastapi import FastAPI
import uvicorn


class BaseService(ABC):
    """Abstract base class for all AI model services"""

    def __init__(self, name: str, port: int, host: str = "0.0.0.0"):
        """
        Initialize base service

        Args:
            name: Service name (for logging)
            port: Port to run service on
            host: Host to bind to (default: 0.0.0.0)
        """
        self.name = name
        self.port = port
        self.host = host
        self.app = FastAPI(title=f"{name} Service", version="1.0.0")
        self.logger = self._setup_logging()
        self.shutdown_event = asyncio.Event()

        # Register standard endpoints
        self._register_health_endpoint()

        # Register signal handlers for graceful shutdown
        self._register_signal_handlers()

        # Allow subclasses to add custom routes
        self.create_app()

    def _setup_logging(self) -> logging.Logger:
        """Configure logging for the service"""
        logging.basicConfig(
            level=logging.INFO,
            format=f'%(asctime)s - {self.name} - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout)
            ]
        )
        return logging.getLogger(self.name)

    def _register_health_endpoint(self):
        """Register standard health check endpoint"""
        @self.app.get("/health")
        async def health_check():
            """Health check endpoint"""
            return {
                "status": "healthy",
                "service": self.name,
                "port": self.port
            }

    def _register_signal_handlers(self):
        """Register signal handlers for graceful shutdown"""
        def signal_handler(sig, frame):
            self.logger.info(f"Received signal {sig}, initiating graceful shutdown...")
            self.shutdown_event.set()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

    @abstractmethod
    def create_app(self):
        """
        Create FastAPI routes for this service.

        Subclasses must implement this to add their specific endpoints.

        Example:
            @self.app.post("/v1/generate")
            async def generate(request: MyRequest):
                return await self.model.generate(request)
        """
        pass

    async def initialize(self):
        """
        Initialize the service (load models, etc.).

        Subclasses can override this for custom initialization.
        """
        self.logger.info(f"Initializing {self.name} service...")

    async def cleanup(self):
        """
        Cleanup resources on shutdown.

        Subclasses can override this for custom cleanup.
        """
        self.logger.info(f"Cleaning up {self.name} service...")

    def run(self):
        """
        Run the service.

        This is the main entry point that starts the FastAPI server.
        """
        try:
            self.logger.info(f"Starting {self.name} service on {self.host}:{self.port}")

            # Run initialization
            asyncio.run(self.initialize())

            # Start uvicorn server
            config = uvicorn.Config(
                app=self.app,
                host=self.host,
                port=self.port,
                log_level="info",
                access_log=True
            )
            server = uvicorn.Server(config)

            # Run server
            asyncio.run(server.serve())

        except KeyboardInterrupt:
            self.logger.info("Keyboard interrupt received")
        except Exception as e:
            self.logger.error(f"Error running service: {e}", exc_info=True)
            sys.exit(1)
        finally:
            # Cleanup
            asyncio.run(self.cleanup())
            self.logger.info(f"{self.name} service stopped")


class GPUService(BaseService):
    """
    Base class for GPU-accelerated services.

    Provides additional GPU-specific functionality.
    """

    def __init__(self, name: str, port: int, host: str = "0.0.0.0"):
        super().__init__(name, port, host)
        self._check_gpu_availability()

    def _check_gpu_availability(self):
        """Check if GPU is available"""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_count = torch.cuda.device_count()
                gpu_name = torch.cuda.get_device_name(0)
                self.logger.info(f"GPU available: {gpu_name} (count: {gpu_count})")
            else:
                self.logger.warning("No GPU available - service may run slowly")
        except ImportError:
            self.logger.warning("PyTorch not installed - cannot check GPU availability")

core/requirements.txt (new file, 15 lines)

# Core dependencies for AI service infrastructure

# FastAPI and server
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0

# HTTP client for health checks and proxying
httpx==0.25.1

# YAML configuration
pyyaml==6.0.1

# Process management
psutil==5.9.6

core/service_manager.py (new file, 301 lines)

#!/usr/bin/env python3
"""
Service Manager for AI Model Services

Manages lifecycle of model services running as Python processes:
- Start/stop services
- Health monitoring
- Auto-restart on failure
- Resource cleanup
"""
import asyncio
import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

import httpx


@dataclass
class ServiceConfig:
    """Configuration for a service"""
    name: str
    script_path: Path
    port: int
    startup_timeout: int = 120
    health_check_path: str = "/health"
    auto_restart: bool = False
    env: Optional[Dict[str, str]] = None


class ServiceManager:
    """Manages multiple AI model services as subprocesses"""

    def __init__(self):
        self.logger = logging.getLogger("ServiceManager")
        self.processes: Dict[str, subprocess.Popen] = {}
        self.configs: Dict[str, ServiceConfig] = {}
        self.shutdown_event = asyncio.Event()

    def register_service(self, config: ServiceConfig):
        """Register a service configuration"""
        self.configs[config.name] = config
        self.logger.info(f"Registered service: {config.name} on port {config.port}")

    async def start_service(self, name: str) -> bool:
        """
        Start a service by name

        Args:
            name: Service name to start

        Returns:
            bool: True if service started successfully
        """
        if name not in self.configs:
            self.logger.error(f"Service {name} not registered")
            return False

        if name in self.processes:
            proc = self.processes[name]
            if proc.poll() is None:
                self.logger.info(f"Service {name} already running")
                return True

        config = self.configs[name]
        self.logger.info(f"Starting service {name}...")

        try:
            # Prepare environment
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            env.update({
                'PORT': str(config.port),
                'HOST': '0.0.0.0'
            })

            # Start process
            proc = subprocess.Popen(
                ['python3', str(config.script_path)],
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                preexec_fn=os.setsid  # Create new process group
            )
            self.processes[name] = proc
            self.logger.info(f"Process started for {name} (PID: {proc.pid})")

            # Wait for health check
            if await self._wait_for_health(name, config):
                self.logger.info(f"Service {name} is healthy and ready")
                return True
            else:
                self.logger.error(f"Service {name} failed health check")
                await self.stop_service(name)
                return False

        except Exception as e:
            self.logger.error(f"Error starting {name}: {e}", exc_info=True)
            return False

    async def _wait_for_health(self, name: str, config: ServiceConfig) -> bool:
        """
        Wait for service to become healthy

        Args:
            name: Service name
            config: Service configuration

        Returns:
            bool: True if service becomes healthy within timeout
        """
        proc = self.processes.get(name)
        if not proc:
            return False

        start_time = time.time()
        url = f"http://localhost:{config.port}{config.health_check_path}"

        while time.time() - start_time < config.startup_timeout:
            # Check if process is still running
            if proc.poll() is not None:
                self.logger.error(f"Process for {name} exited prematurely (code: {proc.returncode})")
                return False

            # Try health check
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(url, timeout=5.0)
                    if response.status_code == 200:
                        return True
            except Exception:
                pass

            await asyncio.sleep(2)

        return False

    async def stop_service(self, name: str, timeout: int = 10):
        """
        Stop a running service

        Args:
            name: Service name
            timeout: Seconds to wait for graceful shutdown
        """
        if name not in self.processes:
            self.logger.warning(f"Service {name} not in process registry")
            return

        proc = self.processes[name]
        if proc.poll() is None:  # Still running
            self.logger.info(f"Stopping service {name}...")
            try:
                # Send SIGTERM to process group
                os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

                # Wait for graceful shutdown
                try:
                    proc.wait(timeout=timeout)
                    self.logger.info(f"Service {name} stopped gracefully")
                except subprocess.TimeoutExpired:
                    # Force kill if not terminated
                    self.logger.warning(f"Service {name} did not stop gracefully, forcing kill")
                    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
                    proc.wait()
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}", exc_info=True)

        del self.processes[name]

    async def restart_service(self, name: str) -> bool:
        """
        Restart a service

        Args:
            name: Service name

        Returns:
            bool: True if service restarted successfully
        """
        self.logger.info(f"Restarting service {name}...")
        await self.stop_service(name)
        await asyncio.sleep(2)  # Brief pause between stop and start
        return await self.start_service(name)

    async def check_health(self, name: str) -> bool:
        """
        Check if a service is healthy

        Args:
            name: Service name

        Returns:
            bool: True if service is running and healthy
        """
        if name not in self.processes:
            return False

        proc = self.processes[name]
        if proc.poll() is not None:
            return False

        config = self.configs[name]
        url = f"http://localhost:{config.port}{config.health_check_path}"

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5.0)
                return response.status_code == 200
        except Exception:
            return False

    async def monitor_services(self):
        """
        Monitor all services and auto-restart if configured

        This runs continuously until shutdown_event is set.
        """
        self.logger.info("Starting service monitor...")

        while not self.shutdown_event.is_set():
            for name, config in self.configs.items():
                if not config.auto_restart:
                    continue

                # Check if process exists and is healthy
                if name in self.processes:
                    proc = self.processes[name]
                    if proc.poll() is not None:
                        self.logger.warning(f"Service {name} died (code: {proc.returncode}), restarting...")
                        await self.restart_service(name)
                    elif not await self.check_health(name):
                        self.logger.warning(f"Service {name} unhealthy, restarting...")
                        await self.restart_service(name)

            # Wait before next check
            try:
                await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                pass

        self.logger.info("Service monitor stopped")

    async def stop_all_services(self):
        """Stop all running services"""
        self.logger.info("Stopping all services...")
        for name in list(self.processes.keys()):
            await self.stop_service(name)
        self.logger.info("All services stopped")

    def get_service_status(self, name: str) -> Dict:
        """
        Get status information for a service

        Args:
            name: Service name

        Returns:
            dict: Status information
        """
        if name not in self.configs:
            return {"status": "unknown", "error": "Service not registered"}

        if name not in self.processes:
            return {"status": "stopped"}

        proc = self.processes[name]
        if proc.poll() is not None:
            return {
                "status": "exited",
                "exit_code": proc.returncode
            }

        config = self.configs[name]
        return {
            "status": "running",
            "pid": proc.pid,
            "port": config.port
        }

    def get_all_service_status(self) -> Dict:
        """
        Get status for all registered services

        Returns:
            dict: Service name -> status mapping
        """
        return {
            name: self.get_service_status(name)
            for name in self.configs.keys()
        }
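
A hedged sketch of an orchestrator driving ServiceManager, roughly what scripts/start-all.sh is described as wrapping (the actual orchestrator is not shown in this view, so the file location, ports, and auto_restart choice are assumptions):

    #!/usr/bin/env python3
    # Hypothetical orchestrator using the ServiceManager above; paths assume this file
    # sits at the repository root and that the ports below match the model servers.
    import asyncio
    import logging
    import signal
    from pathlib import Path

    from core.service_manager import ServiceConfig, ServiceManager


    async def main():
        logging.basicConfig(level=logging.INFO)
        manager = ServiceManager()
        repo = Path(__file__).resolve().parent

        # Register the three model services named in the commit message (ports assumed).
        for name, port in (("vllm", 8001), ("flux", 8002), ("musicgen", 8003)):
            manager.register_service(ServiceConfig(
                name=name,
                script_path=repo / "models" / name / "server.py",
                port=port,
                auto_restart=True,
            ))

        # Translate SIGINT/SIGTERM into the manager's shutdown event (Unix only).
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(sig, manager.shutdown_event.set)

        for name in manager.configs:
            await manager.start_service(name)

        await manager.monitor_services()  # blocks until the shutdown event is set
        await manager.stop_all_services()


    if __name__ == "__main__":
        asyncio.run(main())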