feat: implement Ansible-based process architecture for RunPod
Major architecture overhaul to address RunPod Docker limitations:

Core Infrastructure:
- Add base_service.py: Abstract base class for all AI services
- Add service_manager.py: Process lifecycle management
- Add core/requirements.txt: Core dependencies

Model Services (Standalone Python):
- Add models/vllm/server.py: Qwen 2.5 7B text generation
- Add models/flux/server.py: Flux.1 Schnell image generation
- Add models/musicgen/server.py: MusicGen Medium music generation
- Each service inherits from GPUService base class
- OpenAI-compatible APIs
- Standalone execution support

Ansible Deployment:
- Add playbook.yml: Comprehensive deployment automation
- Add ansible.cfg: Ansible configuration
- Add inventory.yml: Localhost inventory
- Tags: base, python, dependencies, models, tailscale, validate, cleanup

Scripts:
- Add scripts/install.sh: Full installation wrapper
- Add scripts/download-models.sh: Model download wrapper
- Add scripts/start-all.sh: Start orchestrator
- Add scripts/stop-all.sh: Stop all services

Documentation:
- Update ARCHITECTURE.md: Document distributed VPS+GPU architecture

Benefits:
- No Docker: Avoids RunPod CAP_SYS_ADMIN limitations
- Fully reproducible via Ansible
- Extensible: Add models in 3 steps
- Direct Python execution (no container overhead)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
166
core/base_service.py
Normal file
166
core/base_service.py
Normal file
@@ -0,0 +1,166 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Base Service Class for AI Model Services
|
||||
|
||||
Provides common functionality for all model services:
|
||||
- Health check endpoint
|
||||
- Graceful shutdown handling
|
||||
- Logging configuration
|
||||
- Standard FastAPI setup
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI
|
||||
import uvicorn
|
||||
|
||||
|
||||
class BaseService(ABC):
    """Abstract base class for all AI model services.

    Provides the functionality every service shares:
    - Health check endpoint (GET /health)
    - Graceful shutdown handling via SIGINT/SIGTERM
    - Logging configuration
    - Standard FastAPI setup and a blocking run() entry point
    """

    def __init__(self, name: str, port: int, host: str = "0.0.0.0"):
        """
        Initialize base service

        Args:
            name: Service name (for logging and the FastAPI title)
            port: Port to run service on
            host: Host to bind to (default: 0.0.0.0)
        """
        self.name = name
        self.port = port
        self.host = host
        self.app = FastAPI(title=f"{name} Service", version="1.0.0")
        self.logger = self._setup_logging()
        self.shutdown_event = asyncio.Event()

        # Register standard endpoints
        self._register_health_endpoint()

        # Register signal handlers for graceful shutdown
        self._register_signal_handlers()

        # Allow subclasses to add custom routes.
        # NOTE(review): this runs before a subclass's own __init__ body, so
        # create_app() must not rely on attributes assigned after
        # super().__init__() returns.
        self.create_app()

    def _setup_logging(self) -> logging.Logger:
        """Configure root logging (stdout handler) and return this service's logger."""
        logging.basicConfig(
            level=logging.INFO,
            format=f'%(asctime)s - {self.name} - %(levelname)s - %(message)s',
            handlers=[
                logging.StreamHandler(sys.stdout)
            ]
        )
        return logging.getLogger(self.name)

    def _register_health_endpoint(self):
        """Register standard health check endpoint (GET /health)."""
        @self.app.get("/health")
        async def health_check():
            """Health check endpoint"""
            return {
                "status": "healthy",
                "service": self.name,
                "port": self.port
            }

    def _register_signal_handlers(self):
        """Register signal handlers for graceful shutdown.

        NOTE(review): uvicorn installs its own SIGINT/SIGTERM handlers when
        the server starts, replacing these; they only cover the window
        before serve() begins — confirm this matches the intended design.
        """
        def signal_handler(sig, frame):
            self.logger.info(f"Received signal {sig}, initiating graceful shutdown...")
            self.shutdown_event.set()

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

    @abstractmethod
    def create_app(self):
        """
        Create FastAPI routes for this service.
        Subclasses must implement this to add their specific endpoints.

        Example:
            @self.app.post("/v1/generate")
            async def generate(request: MyRequest):
                return await self.model.generate(request)
        """
        pass

    async def initialize(self):
        """
        Initialize the service (load models, etc.).
        Subclasses can override this for custom initialization.
        """
        self.logger.info(f"Initializing {self.name} service...")

    async def cleanup(self):
        """
        Cleanup resources on shutdown.
        Subclasses can override this for custom cleanup.
        """
        self.logger.info(f"Cleaning up {self.name} service...")

    async def _serve(self):
        """Initialize, serve and clean up on a single event loop."""
        try:
            await self.initialize()

            config = uvicorn.Config(
                app=self.app,
                host=self.host,
                port=self.port,
                log_level="info",
                access_log=True
            )
            server = uvicorn.Server(config)
            await server.serve()
        finally:
            # Always release resources — even when initialization or
            # serving failed — and on the same loop that acquired them.
            await self.cleanup()

    def run(self):
        """
        Run the service (blocking).
        This is the main entry point that starts the FastAPI server.

        Bug fix: initialize(), server.serve() and cleanup() previously each
        ran under their own asyncio.run(), i.e. three separate event loops.
        Any loop-bound resource created in initialize() (clients, locks,
        queues) was then unusable while serving. All three phases now share
        a single loop via _serve(), and cleanup() is guaranteed by its
        try/finally.
        """
        try:
            self.logger.info(f"Starting {self.name} service on {self.host}:{self.port}")
            asyncio.run(self._serve())
        except KeyboardInterrupt:
            self.logger.info("Keyboard interrupt received")
        except Exception as e:
            self.logger.error(f"Error running service: {e}", exc_info=True)
            sys.exit(1)
        finally:
            self.logger.info(f"{self.name} service stopped")
||||
class GPUService(BaseService):
    """
    Base class for GPU-accelerated services.

    On top of the standard BaseService behaviour, logs whether a CUDA
    device is visible to PyTorch at construction time.
    """

    def __init__(self, name: str, port: int, host: str = "0.0.0.0"):
        super().__init__(name, port, host)
        self._check_gpu_availability()

    def _check_gpu_availability(self):
        """Log GPU availability; degrade to a warning when torch or CUDA is absent."""
        try:
            import torch
        except ImportError:
            self.logger.warning("PyTorch not installed - cannot check GPU availability")
            return

        if not torch.cuda.is_available():
            self.logger.warning("No GPU available - service may run slowly")
            return

        gpu_count = torch.cuda.device_count()
        gpu_name = torch.cuda.get_device_name(0)
        self.logger.info(f"GPU available: {gpu_name} (count: {gpu_count})")
15
core/requirements.txt
Normal file
15
core/requirements.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
# Core dependencies for AI service infrastructure
# Install with: pip install -r core/requirements.txt
# Versions are pinned for reproducible deployments.

# FastAPI and server
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0

# HTTP client for health checks and proxying
httpx==0.25.1

# YAML configuration
pyyaml==6.0.1

# Process management
psutil==5.9.6
301
core/service_manager.py
Normal file
301
core/service_manager.py
Normal file
@@ -0,0 +1,301 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Service Manager for AI Model Services
|
||||
|
||||
Manages lifecycle of model services running as Python processes:
|
||||
- Start/stop services
|
||||
- Health monitoring
|
||||
- Auto-restart on failure
|
||||
- Resource cleanup
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
@dataclass
class ServiceConfig:
    """Static configuration describing how to launch and monitor one service."""

    name: str                              # unique service identifier
    script_path: Path                      # entry-point script run via python3
    port: int                              # TCP port the service listens on
    startup_timeout: int = 120             # seconds to wait for first healthy reply
    health_check_path: str = "/health"     # HTTP path polled for liveness
    auto_restart: bool = False             # restart automatically when dead/unhealthy
    env: Optional[Dict[str, str]] = None   # extra environment variables for the child
|
||||
class ServiceManager:
    """Manages multiple AI model services as subprocesses.

    Responsibilities:
    - Start/stop services
    - Health monitoring
    - Auto-restart on failure
    - Resource cleanup

    Children are launched in their own session (process group) so the
    whole tree can be signalled at once, and their output is appended to
    per-service log files rather than pipes (an undrained PIPE would
    eventually fill and deadlock the child).
    """

    def __init__(self):
        self.logger = logging.getLogger("ServiceManager")
        self.processes: Dict[str, subprocess.Popen] = {}
        self.configs: Dict[str, ServiceConfig] = {}
        self.shutdown_event = asyncio.Event()
        # Directory where child stdout/stderr is captured; override via env.
        self.log_dir = Path(os.environ.get("SERVICE_LOG_DIR", "/tmp/ai-services/logs"))

    def register_service(self, config: "ServiceConfig"):
        """Register a service configuration (keyed by its name)."""
        self.configs[config.name] = config
        self.logger.info(f"Registered service: {config.name} on port {config.port}")

    async def start_service(self, name: str) -> bool:
        """
        Start a service by name

        Args:
            name: Service name to start

        Returns:
            bool: True if service started successfully (process launched and
            answered its health check within the startup timeout)
        """
        if name not in self.configs:
            self.logger.error(f"Service {name} not registered")
            return False

        if name in self.processes:
            proc = self.processes[name]
            if proc.poll() is None:
                self.logger.info(f"Service {name} already running")
                return True

        config = self.configs[name]
        self.logger.info(f"Starting service {name}...")

        try:
            # Prepare environment: inherit ours, overlay per-service vars,
            # then force PORT/HOST so the child binds where we expect.
            env = os.environ.copy()
            if config.env:
                env.update(config.env)
            env.update({
                'PORT': str(config.port),
                'HOST': '0.0.0.0'
            })

            # Bug fix: stdout/stderr previously went to subprocess.PIPE but
            # were never read, so a chatty child would block once the OS
            # pipe buffer filled. Append to a per-service log file instead.
            self.log_dir.mkdir(parents=True, exist_ok=True)
            log_path = self.log_dir / f"{name}.log"
            with open(log_path, 'ab') as log_file:
                # start_new_session=True replaces preexec_fn=os.setsid: same
                # new-process-group effect, but safe in threaded parents.
                proc = subprocess.Popen(
                    ['python3', str(config.script_path)],
                    env=env,
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    start_new_session=True
                )
            # (the child keeps its own inherited handle to the log file)

            self.processes[name] = proc
            self.logger.info(f"Process started for {name} (PID: {proc.pid})")

            # Wait for health check
            if await self._wait_for_health(name, config):
                self.logger.info(f"Service {name} is healthy and ready")
                return True
            else:
                self.logger.error(f"Service {name} failed health check")
                await self.stop_service(name)
                return False

        except Exception as e:
            self.logger.error(f"Error starting {name}: {e}", exc_info=True)
            return False

    async def _wait_for_health(self, name: str, config: "ServiceConfig") -> bool:
        """
        Wait for service to become healthy.

        Polls the health endpoint every 2s until it answers 200, the
        process dies, or config.startup_timeout elapses.

        Args:
            name: Service name
            config: Service configuration

        Returns:
            bool: True if service becomes healthy within timeout
        """
        proc = self.processes.get(name)
        if not proc:
            return False

        start_time = time.time()
        url = f"http://localhost:{config.port}{config.health_check_path}"

        # One client (and connection pool) for the whole polling loop,
        # instead of constructing a fresh AsyncClient per attempt.
        async with httpx.AsyncClient() as client:
            while time.time() - start_time < config.startup_timeout:
                # Fail fast if the process already exited.
                if proc.poll() is not None:
                    self.logger.error(f"Process for {name} exited prematurely (code: {proc.returncode})")
                    return False

                try:
                    response = await client.get(url, timeout=5.0)
                    if response.status_code == 200:
                        return True
                except Exception:
                    pass  # not up yet; keep polling

                await asyncio.sleep(2)

        return False

    async def stop_service(self, name: str, timeout: int = 10):
        """
        Stop a running service (SIGTERM to the group, SIGKILL after `timeout`).

        Args:
            name: Service name
            timeout: Seconds to wait for graceful shutdown
        """
        if name not in self.processes:
            self.logger.warning(f"Service {name} not in process registry")
            return

        proc = self.processes[name]

        if proc.poll() is None:  # Still running
            self.logger.info(f"Stopping service {name}...")
            try:
                # Signal the whole process group so worker children die too.
                os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

                # Wait for graceful shutdown
                try:
                    proc.wait(timeout=timeout)
                    self.logger.info(f"Service {name} stopped gracefully")
                except subprocess.TimeoutExpired:
                    self.logger.warning(f"Service {name} did not stop gracefully, forcing kill")
                    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
                    proc.wait()

            except ProcessLookupError:
                # Process exited between poll() and killpg(); nothing to do.
                self.logger.info(f"Service {name} exited before signal was sent")
            except Exception as e:
                self.logger.error(f"Error stopping {name}: {e}", exc_info=True)

        del self.processes[name]

    async def restart_service(self, name: str) -> bool:
        """
        Restart a service

        Args:
            name: Service name

        Returns:
            bool: True if service restarted successfully
        """
        self.logger.info(f"Restarting service {name}...")
        await self.stop_service(name)
        await asyncio.sleep(2)  # Brief pause between stop and start
        return await self.start_service(name)

    async def check_health(self, name: str) -> bool:
        """
        Check if a service is healthy

        Args:
            name: Service name

        Returns:
            bool: True if service is running and healthy
        """
        if name not in self.processes:
            return False

        proc = self.processes[name]
        if proc.poll() is not None:
            return False

        config = self.configs[name]
        url = f"http://localhost:{config.port}{config.health_check_path}"

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(url, timeout=5.0)
                return response.status_code == 200
        except Exception:
            # Any connection/timeout error counts as unhealthy.
            return False

    async def monitor_services(self):
        """
        Monitor all services and auto-restart if configured.

        This runs continuously until shutdown_event is set, checking every
        service with auto_restart enabled roughly every 10 seconds.
        """
        self.logger.info("Starting service monitor...")

        while not self.shutdown_event.is_set():
            for name, config in self.configs.items():
                if not config.auto_restart:
                    continue

                # Restart when the process died or stopped answering /health.
                if name in self.processes:
                    proc = self.processes[name]
                    if proc.poll() is not None:
                        self.logger.warning(f"Service {name} died (code: {proc.returncode}), restarting...")
                        await self.restart_service(name)
                    elif not await self.check_health(name):
                        self.logger.warning(f"Service {name} unhealthy, restarting...")
                        await self.restart_service(name)

            # Sleep up to 10s, but wake immediately on shutdown.
            try:
                await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0)
            except asyncio.TimeoutError:
                pass

        self.logger.info("Service monitor stopped")

    async def stop_all_services(self):
        """Stop all running services"""
        self.logger.info("Stopping all services...")
        for name in list(self.processes.keys()):
            await self.stop_service(name)
        self.logger.info("All services stopped")

    def get_service_status(self, name: str) -> Dict:
        """
        Get status information for a service

        Args:
            name: Service name

        Returns:
            dict: Status information (status plus pid/port or exit_code)
        """
        if name not in self.configs:
            return {"status": "unknown", "error": "Service not registered"}

        if name not in self.processes:
            return {"status": "stopped"}

        proc = self.processes[name]
        if proc.poll() is not None:
            return {
                "status": "exited",
                "exit_code": proc.returncode
            }

        config = self.configs[name]
        return {
            "status": "running",
            "pid": proc.pid,
            "port": config.port
        }

    def get_all_service_status(self) -> Dict:
        """
        Get status for all registered services

        Returns:
            dict: Service name -> status mapping
        """
        return {
            name: self.get_service_status(name)
            for name in self.configs.keys()
        }
Reference in New Issue
Block a user