feat: implement Ansible-based process architecture for RunPod

Major architecture overhaul to address RunPod Docker limitations:

Core Infrastructure:
- Add base_service.py: Abstract base class for all AI services
- Add service_manager.py: Process lifecycle management
- Add core/requirements.txt: Core dependencies

Model Services (Standalone Python):
- Add models/vllm/server.py: Qwen 2.5 7B text generation
- Add models/flux/server.py: Flux.1 Schnell image generation
- Add models/musicgen/server.py: MusicGen Medium music generation
- Each service inherits from GPUService base class
- OpenAI-compatible APIs
- Standalone execution support
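
A minimal sketch of what one of these servers might look like, building on the GPUService base class from core/base_service.py below (the real models/*/server.py files are not reproduced in this view, so the module path, route name, request fields, and stub response are illustrative assumptions; core/ is assumed to be importable as a package):

    #!/usr/bin/env python3
    # Hypothetical models/example/server.py -- NOT one of the real servers in this commit;
    # endpoint name, request fields, and the echoed response are assumptions for illustration.
    import os

    from pydantic import BaseModel

    from core.base_service import GPUService


    class CompletionRequest(BaseModel):
        prompt: str
        max_tokens: int = 256


    class ExampleTextService(GPUService):
        def create_app(self):
            @self.app.post("/v1/completions")
            async def completions(request: CompletionRequest):
                # A real server would run inference here; this stub just echoes the prompt.
                return {"choices": [{"text": request.prompt[: request.max_tokens]}]}


    if __name__ == "__main__":
        # Standalone execution: the service manager passes PORT/HOST via the environment.
        ExampleTextService(
            name="example-text",
            port=int(os.environ.get("PORT", "8001")),
            host=os.environ.get("HOST", "0.0.0.0"),
        ).run()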

Ansible Deployment:
- Add playbook.yml: Comprehensive deployment automation
- Add ansible.cfg: Ansible configuration
- Add inventory.yml: Localhost inventory
- Tags: base, python, dependencies, models, tailscale, validate, cleanup

Scripts:
- Add scripts/install.sh: Full installation wrapper
- Add scripts/download-models.sh: Model download wrapper
- Add scripts/start-all.sh: Start orchestrator
- Add scripts/stop-all.sh: Stop all services

Documentation:
- Update ARCHITECTURE.md: Document distributed VPS+GPU architecture

Benefits:
- No Docker: Avoids RunPod CAP_SYS_ADMIN limitations
- Fully reproducible via Ansible
- Extensible: Add models in 3 steps
- Direct Python execution (no container overhead)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 9ee626a78e
parent 03a430894d
Date: 2025-11-21 15:37:18 +01:00
17 changed files with 1817 additions and 5 deletions

core/base_service.py (new file, 166 lines)

#!/usr/bin/env python3
"""
Base Service Class for AI Model Services

Provides common functionality for all model services:
- Health check endpoint
- Graceful shutdown handling
- Logging configuration
- Standard FastAPI setup
"""
import asyncio
import logging
import os
import signal
import sys
from abc import ABC, abstractmethod
from typing import Optional

from fastapi import FastAPI
import uvicorn


class BaseService(ABC):
    """Abstract base class for all AI model services"""

    def __init__(self, name: str, port: int, host: str = "0.0.0.0"):
        """
        Initialize base service

        Args:
            name: Service name (for logging)
            port: Port to run service on
            host: Host to bind to (default: 0.0.0.0)
        """
        self.name = name
        self.port = port
        self.host = host
        self.app = FastAPI(title=f"{name} Service", version="1.0.0")
        self.logger = self._setup_logging()
        self.shutdown_event = asyncio.Event()

        # Register standard endpoints
        self._register_health_endpoint()

        # Register signal handlers for graceful shutdown
        self._register_signal_handlers()

        # Allow subclasses to add custom routes
        self.create_app()

    def _setup_logging(self) -> logging.Logger:
        """Configure logging for the service"""
        logging.basicConfig(
            level=logging.INFO,
            format=f'%(asctime)s - {self.name} - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout)
            ]
        )
        return logging.getLogger(self.name)

    def _register_health_endpoint(self):
        """Register standard health check endpoint"""
        @self.app.get("/health")
        async def health_check():
            """Health check endpoint"""
            return {
                "status": "healthy",
                "service": self.name,
                "port": self.port
            }

    def _register_signal_handlers(self):
        """Register signal handlers for graceful shutdown"""
        def signal_handler(sig, frame):
            self.logger.info(f"Received signal {sig}, initiating graceful shutdown...")
            self.shutdown_event.set()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

    @abstractmethod
    def create_app(self):
        """
        Create FastAPI routes for this service.

        Subclasses must implement this to add their specific endpoints.

        Example:
            @self.app.post("/v1/generate")
            async def generate(request: MyRequest):
                return await self.model.generate(request)
        """
        pass

    async def initialize(self):
        """
        Initialize the service (load models, etc.).

        Subclasses can override this for custom initialization.
        """
        self.logger.info(f"Initializing {self.name} service...")

    async def cleanup(self):
        """
        Cleanup resources on shutdown.

        Subclasses can override this for custom cleanup.
        """
        self.logger.info(f"Cleaning up {self.name} service...")

    def run(self):
        """
        Run the service.

        This is the main entry point that starts the FastAPI server.
        """
        try:
            self.logger.info(f"Starting {self.name} service on {self.host}:{self.port}")

            # Run initialization
            asyncio.run(self.initialize())

            # Start uvicorn server
            config = uvicorn.Config(
                app=self.app,
                host=self.host,
                port=self.port,
                log_level="info",
                access_log=True
            )
            server = uvicorn.Server(config)

            # Run server
            asyncio.run(server.serve())

        except KeyboardInterrupt:
            self.logger.info("Keyboard interrupt received")
        except Exception as e:
            self.logger.error(f"Error running service: {e}", exc_info=True)
            sys.exit(1)
        finally:
            # Cleanup
            asyncio.run(self.cleanup())
            self.logger.info(f"{self.name} service stopped")


class GPUService(BaseService):
    """
    Base class for GPU-accelerated services.

    Provides additional GPU-specific functionality.
    """

    def __init__(self, name: str, port: int, host: str = "0.0.0.0"):
        super().__init__(name, port, host)
        self._check_gpu_availability()

    def _check_gpu_availability(self):
        """Check if GPU is available"""
        try:
            import torch
            if torch.cuda.is_available():
                gpu_count = torch.cuda.device_count()
                gpu_name = torch.cuda.get_device_name(0)
                self.logger.info(f"GPU available: {gpu_name} (count: {gpu_count})")
            else:
                self.logger.warning("No GPU available - service may run slowly")
        except ImportError:
            self.logger.warning("PyTorch not installed - cannot check GPU availability")

core/requirements.txt (new file, 15 lines)

# Core dependencies for AI service infrastructure

# FastAPI and server
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0

# HTTP client for health checks and proxying
httpx==0.25.1

# YAML configuration
pyyaml==6.0.1

# Process management
psutil==5.9.6

core/service_manager.py (new file, 301 lines)

#!/usr/bin/env python3
"""
Service Manager for AI Model Services

Manages lifecycle of model services running as Python processes:
- Start/stop services
- Health monitoring
- Auto-restart on failure
- Resource cleanup
"""
import asyncio
import logging
import os
import signal
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional

import httpx


@dataclass
class ServiceConfig:
    """Configuration for a service"""
    name: str
    script_path: Path
    port: int
    startup_timeout: int = 120
    health_check_path: str = "/health"
    auto_restart: bool = False
    env: Optional[Dict[str, str]] = None


class ServiceManager:
    """Manages multiple AI model services as subprocesses"""

    def __init__(self):
        self.logger = logging.getLogger("ServiceManager")
        self.processes: Dict[str, subprocess.Popen] = {}
        self.configs: Dict[str, ServiceConfig] = {}
        self.shutdown_event = asyncio.Event()

    def register_service(self, config: ServiceConfig):
        """Register a service configuration"""
        self.configs[config.name] = config
        self.logger.info(f"Registered service: {config.name} on port {config.port}")

    async def start_service(self, name: str) -> bool:
        """
        Start a service by name

        Args:
            name: Service name to start

        Returns:
            bool: True if service started successfully
        """
        if name not in self.configs:
            self.logger.error(f"Service {name} not registered")
            return False

        if name in self.processes:
            proc = self.processes[name]
            if proc.poll() is None:
                self.logger.info(f"Service {name} already running")
                return True

        config = self.configs[name]
        self.logger.info(f"Starting service {name}...")

        try:
            # Prepare environment
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            env.update({
                'PORT': str(config.port),
                'HOST': '0.0.0.0'
            })

            # Start process
            proc = subprocess.Popen(
                ['python3', str(config.script_path)],
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                preexec_fn=os.setsid  # Create new process group
            )
            self.processes[name] = proc
            self.logger.info(f"Process started for {name} (PID: {proc.pid})")

            # Wait for health check
            if await self._wait_for_health(name, config):
                self.logger.info(f"Service {name} is healthy and ready")
                return True
            else:
                self.logger.error(f"Service {name} failed health check")
                await self.stop_service(name)
                return False

        except Exception as e:
            self.logger.error(f"Error starting {name}: {e}", exc_info=True)
            return False

    async def _wait_for_health(self, name: str, config: ServiceConfig) -> bool:
        """
        Wait for service to become healthy

        Args:
            name: Service name
            config: Service configuration

        Returns:
            bool: True if service becomes healthy within timeout
        """
        proc = self.processes.get(name)
        if not proc:
            return False

        start_time = time.time()
        url = f"http://localhost:{config.port}{config.health_check_path}"

        while time.time() - start_time < config.startup_timeout:
            # Check if process is still running
            if proc.poll() is not None:
                self.logger.error(f"Process for {name} exited prematurely (code: {proc.returncode})")
                return False

            # Try health check
            try:
                async with httpx.AsyncClient() as client:
                    response = await client.get(url, timeout=5.0)
                    if response.status_code == 200:
                        return True
            except Exception:
                pass

            await asyncio.sleep(2)

        return False

    async def stop_service(self, name: str, timeout: int = 10):
        """
        Stop a running service

        Args:
            name: Service name
            timeout: Seconds to wait for graceful shutdown
        """
        if name not in self.processes:
            self.logger.warning(f"Service {name} not in process registry")
            return

        proc = self.processes[name]
        if proc.poll() is None:  # Still running
            self.logger.info(f"Stopping service {name}...")
            try:
                # Send SIGTERM to process group
                os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

                # Wait for graceful shutdown
                try:
                    proc.wait(timeout=timeout)
                    self.logger.info(f"Service {name} stopped gracefully")
                except subprocess.TimeoutExpired:
                    # Force kill if not terminated
                    self.logger.warning(f"Service {name} did not stop gracefully, forcing kill")
                    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
                    proc.wait()
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}", exc_info=True)

        del self.processes[name]

    async def restart_service(self, name: str) -> bool:
        """
        Restart a service

        Args:
            name: Service name

        Returns:
            bool: True if service restarted successfully
        """
        self.logger.info(f"Restarting service {name}...")
        await self.stop_service(name)
        await asyncio.sleep(2)  # Brief pause between stop and start
        return await self.start_service(name)

    async def check_health(self, name: str) -> bool:
        """
        Check if a service is healthy

        Args:
            name: Service name

        Returns:
            bool: True if service is running and healthy
        """
        if name not in self.processes:
            return False

        proc = self.processes[name]
        if proc.poll() is not None:
            return False

        config = self.configs[name]
        url = f"http://localhost:{config.port}{config.health_check_path}"

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5.0)
                return response.status_code == 200
        except Exception:
            return False

    async def monitor_services(self):
        """
        Monitor all services and auto-restart if configured

        This runs continuously until shutdown_event is set.
        """
        self.logger.info("Starting service monitor...")

        while not self.shutdown_event.is_set():
            for name, config in self.configs.items():
                if not config.auto_restart:
                    continue

                # Check if process exists and is healthy
                if name in self.processes:
                    proc = self.processes[name]
                    if proc.poll() is not None:
                        self.logger.warning(f"Service {name} died (code: {proc.returncode}), restarting...")
                        await self.restart_service(name)
                    elif not await self.check_health(name):
                        self.logger.warning(f"Service {name} unhealthy, restarting...")
                        await self.restart_service(name)

            # Wait before next check
            try:
                await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                pass

        self.logger.info("Service monitor stopped")

    async def stop_all_services(self):
        """Stop all running services"""
        self.logger.info("Stopping all services...")
        for name in list(self.processes.keys()):
            await self.stop_service(name)
        self.logger.info("All services stopped")

    def get_service_status(self, name: str) -> Dict:
        """
        Get status information for a service

        Args:
            name: Service name

        Returns:
            dict: Status information
        """
        if name not in self.configs:
            return {"status": "unknown", "error": "Service not registered"}

        if name not in self.processes:
            return {"status": "stopped"}

        proc = self.processes[name]
        if proc.poll() is not None:
            return {
                "status": "exited",
                "exit_code": proc.returncode
            }

        config = self.configs[name]
        return {
            "status": "running",
            "pid": proc.pid,
            "port": config.port
        }

    def get_all_service_status(self) -> Dict:
        """
        Get status for all registered services

        Returns:
            dict: Service name -> status mapping
        """
        return {
            name: self.get_service_status(name)
            for name in self.configs.keys()
        }
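
A hedged sketch of an orchestrator driving ServiceManager, roughly what scripts/start-all.sh is described as wrapping (the actual orchestrator is not shown in this view, so the file location, ports, and auto_restart choice are assumptions):

    #!/usr/bin/env python3
    # Hypothetical orchestrator using the ServiceManager above; paths assume this file
    # sits at the repository root and that the ports below match the model servers.
    import asyncio
    import logging
    import signal
    from pathlib import Path

    from core.service_manager import ServiceConfig, ServiceManager


    async def main():
        logging.basicConfig(level=logging.INFO)
        manager = ServiceManager()
        repo = Path(__file__).resolve().parent

        # Register the three model services named in the commit message (ports assumed).
        for name, port in (("vllm", 8001), ("flux", 8002), ("musicgen", 8003)):
            manager.register_service(ServiceConfig(
                name=name,
                script_path=repo / "models" / name / "server.py",
                port=port,
                auto_restart=True,
            ))

        # Translate SIGINT/SIGTERM into the manager's shutdown event (Unix only).
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(sig, manager.shutdown_event.set)

        for name in manager.configs:
            await manager.start_service(name)

        await manager.monitor_services()  # blocks until the shutdown event is set
        await manager.stop_all_services()


    if __name__ == "__main__":
        asyncio.run(main())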