From 9a637cc4fc9f11793a02a0453810fe12bf92070e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Fri, 21 Nov 2025 16:17:38 +0100 Subject: [PATCH] refactor: clean Docker files and restore standalone model services MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove all Docker-related files (Dockerfiles, compose.yaml) - Remove documentation files (README, ARCHITECTURE, docs/) - Remove old core/ directory (base_service, service_manager) - Update models.yaml with correct service_script paths (models/*/server.py) - Simplify vLLM requirements.txt to let vLLM manage dependencies - Restore original standalone vLLM server (no base_service dependency) - Remove obsolete vllm/, musicgen/, flux/ directories Process-based architecture is now fully functional on RunPod. ๐Ÿค– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- ARCHITECTURE.md | 214 --------------- README.md | 173 ------------ compose.yaml | 104 -------- core/base_service.py | 166 ------------ core/requirements.txt | 15 -- core/service_manager.py | 301 --------------------- docs/DEPLOYMENT.md | 467 --------------------------------- docs/GPU_DEPLOYMENT_LOG.md | 421 ----------------------------- docs/RUNPOD_TEMPLATE.md | 416 ----------------------------- flux/config/config.json | 13 - model-orchestrator/Dockerfile | 22 -- model-orchestrator/models.yaml | 6 +- models/vllm/requirements.txt | 11 +- models/vllm/server.py | 443 +++++++++++++++---------------- musicgen/Dockerfile | 38 --- musicgen/requirements.txt | 6 - musicgen/server.py | 194 -------------- vllm/Dockerfile | 34 --- vllm/requirements.txt | 4 - vllm/server.py | 302 --------------------- 20 files changed, 228 insertions(+), 3122 deletions(-) delete mode 100644 ARCHITECTURE.md delete mode 100644 README.md delete mode 100644 compose.yaml delete mode 100644 core/base_service.py delete mode 100644 core/requirements.txt delete mode 100644 core/service_manager.py delete mode 100644 docs/DEPLOYMENT.md delete mode 100644 docs/GPU_DEPLOYMENT_LOG.md delete mode 100644 docs/RUNPOD_TEMPLATE.md delete mode 100644 flux/config/config.json delete mode 100644 model-orchestrator/Dockerfile delete mode 100644 musicgen/Dockerfile delete mode 100644 musicgen/requirements.txt delete mode 100644 musicgen/server.py delete mode 100644 vllm/Dockerfile delete mode 100644 vllm/requirements.txt delete mode 100644 vllm/server.py diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md deleted file mode 100644 index 900c3aa..0000000 --- a/ARCHITECTURE.md +++ /dev/null @@ -1,214 +0,0 @@ -# RunPod Multi-Modal AI Architecture - -**Clean, extensible distributed AI infrastructure spanning VPS and GPU** - -## Design Principles - -1. **Distributed** - VPS (UI/proxy) + GPU (models) connected via Tailscale -2. **No Docker on GPU** - Direct Python for RunPod compatibility -3. **Extensible** - Adding new models requires minimal code -4. **Maintainable** - Clear structure and separation of concerns -5. **Simple** - One command to start, easy to debug -6. 
**OpenAI Compatible** - Works with standard AI tools - -## Directory Structure - -``` -runpod/ -โ”œโ”€โ”€ core/ # Core infrastructure -โ”‚ โ”œโ”€โ”€ base_service.py # Abstract base class for all services -โ”‚ โ”œโ”€โ”€ service_manager.py # Process lifecycle management -โ”‚ โ””โ”€โ”€ requirements.txt # Core dependencies -โ”‚ -โ”œโ”€โ”€ model-orchestrator/ # Request orchestration -โ”‚ โ”œโ”€โ”€ orchestrator.py # Main orchestrator (process-based) -โ”‚ โ”œโ”€โ”€ models.yaml # Model registry (simple config) -โ”‚ โ””โ”€โ”€ requirements.txt # Orchestrator dependencies -โ”‚ -โ”œโ”€โ”€ models/ # Model service implementations -โ”‚ โ”œโ”€โ”€ vllm/ # Text generation -โ”‚ โ”‚ โ”œโ”€โ”€ server.py # vLLM service (inherits base_service) -โ”‚ โ”‚ โ””โ”€โ”€ requirements.txt # vLLM dependencies -โ”‚ โ”‚ -โ”‚ โ”œโ”€โ”€ flux/ # Image generation -โ”‚ โ”‚ โ”œโ”€โ”€ server.py # Flux service -โ”‚ โ”‚ โ””โ”€โ”€ requirements.txt # Flux dependencies -โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€ musicgen/ # Music generation -โ”‚ โ”œโ”€โ”€ server.py # MusicGen service -โ”‚ โ””โ”€โ”€ requirements.txt # AudioCraft dependencies -โ”‚ -โ”œโ”€โ”€ scripts/ # Deployment & management -โ”‚ โ”œโ”€โ”€ install.sh # Install all dependencies -โ”‚ โ”œโ”€โ”€ download-models.sh # Pre-download models -โ”‚ โ”œโ”€โ”€ start-all.sh # Start orchestrator + services -โ”‚ โ”œโ”€โ”€ stop-all.sh # Stop all services -โ”‚ โ””โ”€โ”€ prepare-template.sh # RunPod template preparation -โ”‚ -โ”œโ”€โ”€ systemd/ # Optional systemd services -โ”‚ โ”œโ”€โ”€ ai-orchestrator.service -โ”‚ โ””โ”€โ”€ install-services.sh -โ”‚ -โ””โ”€โ”€ docs/ # Documentation - โ”œโ”€โ”€ ADDING_MODELS.md # Guide for adding new models - โ”œโ”€โ”€ DEPLOYMENT.md # Deployment guide - โ””โ”€โ”€ RUNPOD_TEMPLATE.md # Template creation guide -``` - -## Component Responsibilities - -### Core (`core/`) -- **base_service.py**: Abstract base class for all model services - - Health check endpoint - - Graceful shutdown - - Logging configuration - - Common utilities - -- **service_manager.py**: Process lifecycle management - - Start/stop services - - Health monitoring - - Auto-restart on failure - - Resource cleanup - -### Orchestrator (`model-orchestrator/`) -- **orchestrator.py**: Routes requests to appropriate model - - Reads `models.yaml` configuration - - Manages model switching - - Proxies requests to services - - OpenAI-compatible API - -- **models.yaml**: Simple model registry - ```yaml - models: - model-name: - type: text|image|audio - service_script: path/to/server.py - port: 8001 - startup_time: 120 - endpoint: /v1/chat/completions - ``` - -### Models (`models/`) -Each model directory contains: -- **server.py**: Service implementation (inherits `BaseService`) -- **requirements.txt**: Model-specific dependencies - -Services are standalone - can run independently for testing. - -### Scripts (`scripts/`) -- **install.sh**: Install Python packages for all services -- **download-models.sh**: Pre-download models to `/workspace` -- **start-all.sh**: Start orchestrator (which manages model services) -- **stop-all.sh**: Graceful shutdown of all services -- **prepare-template.sh**: RunPod template preparation - -## Adding a New Model (3 steps) - -### 1. 
Create Model Service - -```python -# models/mymodel/server.py -from core.base_service import BaseService - -class MyModelService(BaseService): - def __init__(self): - super().__init__( - name="mymodel", - port=8004 - ) - - async def initialize(self): - """Load model""" - self.model = load_my_model() - - def create_app(self): - """Define FastAPI routes""" - @self.app.post("/v1/mymodel/generate") - async def generate(request: MyRequest): - return self.model.generate(request.prompt) - -if __name__ == "__main__": - service = MyModelService() - service.run() -``` - -### 2. Add to Registry - -```yaml -# model-orchestrator/models.yaml -models: - mymodel: - type: custom - service_script: models/mymodel/server.py - port: 8004 - startup_time: 60 - endpoint: /v1/mymodel/generate -``` - -### 3. Add Dependencies - -``` -# models/mymodel/requirements.txt -transformers==4.36.0 -torch==2.1.0 -``` - -That's it! The orchestrator handles everything else. - -## Request Flow - -``` -Client Request - โ†“ -Orchestrator (port 9000) - โ†“ (determines model from endpoint) -Model Service (port 8001-800X) - โ†“ -Response -``` - -## Startup Flow - -1. Run `scripts/start-all.sh` -2. Orchestrator starts on port 9000 -3. Orchestrator reads `models.yaml` -4. On first request: - - Orchestrator starts appropriate model service - - Waits for health check - - Proxies request -5. On subsequent requests: - - If same model: direct proxy - - If different model: stop current, start new - -## Benefits - -- **Simple**: No Docker complexity, just Python -- **Fast**: No container overhead, direct execution -- **Debuggable**: Standard Python processes, easy to inspect -- **Extensible**: Add models by creating one file + YAML entry -- **Maintainable**: Clear structure, base classes, DRY principles -- **Portable**: Works anywhere Python runs (local, RunPod, other cloud) - -## Development Workflow - -```bash -# Local development -python3 models/vllm/server.py # Test service directly -python3 model-orchestrator/orchestrator.py # Test orchestrator - -# RunPod deployment -./scripts/install.sh # Install dependencies -./scripts/download-models.sh # Pre-download models -./scripts/start-all.sh # Start everything - -# Create template -./scripts/prepare-template.sh # Prepare for template save -``` - -## Future Enhancements - -- Load balancing across multiple GPUs -- Model pooling (keep multiple models loaded) -- Batch request queueing -- Metrics and monitoring -- Auto-scaling based on demand diff --git a/README.md b/README.md deleted file mode 100644 index 888b5c8..0000000 --- a/README.md +++ /dev/null @@ -1,173 +0,0 @@ -# RunPod Multi-Modal AI Stack - -**Cost-optimized GPU deployment for text, image, and music generation on RunPod RTX 4090.** - -This repository contains everything needed to deploy and manage a multi-modal AI infrastructure on RunPod, featuring intelligent model orchestration that automatically switches between models based on request type. 
- -## Features - -- **Text Generation**: Qwen 2.5 7B Instruct via vLLM (~50 tokens/sec) -- **Image Generation**: Flux.1 Schnell (~4-5 seconds per image) -- **Music Generation**: MusicGen Medium (30 seconds of audio in 60-90 seconds) -- **Automatic Model Switching**: Intelligent orchestrator manages sequential model loading -- **OpenAI-Compatible APIs**: Works with existing AI tools and clients -- **Easy Model Addition**: Just edit `model-orchestrator/models.yaml` to add new models -- **Template Support**: Create reusable templates for 2-3 minute deployments (vs 60-90 minutes) - -## Quick Start - -### Option 1: Deploy from Template (Recommended) - -If you've already created a RunPod template: - -1. Deploy pod from template in RunPod dashboard -2. SSH to the pod -3. Create `.env` file with your credentials -4. Start orchestrator: `docker compose -f compose.yaml up -d orchestrator` - -**See**: [RUNPOD_TEMPLATE.md](RUNPOD_TEMPLATE.md) for template usage instructions. - -### Option 2: Fresh Deployment - -For first-time setup on a new RunPod instance: - -1. Copy files to RunPod: `scp -r * gpu-server:/workspace/ai/` -2. SSH to GPU server: `ssh gpu-server` -3. Run preparation script: `cd /workspace/ai && chmod +x scripts/prepare-template.sh && ./scripts/prepare-template.sh` - -**See**: [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md) for detailed deployment guide. - -## Architecture - -``` -VPS (LiteLLM Proxy) - โ†“ Tailscale VPN -GPU Server (Orchestrator Port 9000) - โ”œโ”€โ”€ vLLM (Qwen 2.5 7B) - Port 8001 - โ”œโ”€โ”€ Flux.1 Schnell - Port 8002 - โ””โ”€โ”€ MusicGen Medium - Port 8003 -``` - -All requests route through the orchestrator, which automatically loads the appropriate model. Only one model is active at a time for cost optimization (~$0.50/hr vs ~$0.75/hr for multi-GPU). - -## Cost Analysis - -**RunPod RTX 4090 Spot Instance**: -- **Hourly**: ~$0.50 -- **Monthly (24/7)**: ~$360 -- **Monthly (8hr/day)**: ~$120 - -**Template Benefits**: -- **Without Template**: 60-90 minutes setup per Spot restart -- **With Template**: 2-3 minutes deployment time -- **Spot Restart Frequency**: 2-5 times per week (variable) - -## Documentation - -- **[docs/DEPLOYMENT.md](docs/DEPLOYMENT.md)** - Complete deployment and usage guide -- **[docs/RUNPOD_TEMPLATE.md](docs/RUNPOD_TEMPLATE.md)** - Template creation and usage -- **[docs/GPU_DEPLOYMENT_LOG.md](docs/GPU_DEPLOYMENT_LOG.md)** - Deployment history and technical notes - -### Architecture Components -- `model-orchestrator/` - FastAPI orchestrator managing model lifecycle -- `vllm/` - Text generation service (Qwen 2.5 7B) -- `flux/` - Image generation service (Flux.1 Schnell) -- `musicgen/` - Music generation service (MusicGen Medium) -- `scripts/` - Automation scripts - -## Creating a RunPod Template - -**Why create a template?** -- Save 60-90 minutes on every Spot instance restart -- Pre-downloaded models (~37GB cached) -- Pre-built Docker images -- Ready-to-use configuration - -**How to create:** -1. Run `scripts/prepare-template.sh` on a fresh RunPod instance -2. Wait 45-60 minutes for models to download and images to build -3. Save pod as template in RunPod dashboard -4. Name: `multi-modal-ai-v1.0` - -**See**: [docs/RUNPOD_TEMPLATE.md](docs/RUNPOD_TEMPLATE.md) for step-by-step guide. - -## Adding New Models - -Adding models is easy! 
Just edit `model-orchestrator/models.yaml`: - -```yaml -models: - llama-3.1-8b: # New model - type: text - framework: vllm - docker_service: vllm-llama - port: 8004 - vram_gb: 17 - startup_time_seconds: 120 - endpoint: /v1/chat/completions -``` - -Then add the Docker service to `compose.yaml` and restart the orchestrator. - -**See**: [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md#adding-new-models) for complete instructions. - -## Usage Examples - -### Text Generation -```bash -curl http://100.100.108.13:9000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "qwen-2.5-7b", "messages": [{"role": "user", "content": "Hello!"}]}' -``` - -### Image Generation -```bash -curl http://100.100.108.13:9000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell", "prompt": "a cute cat", "size": "1024x1024"}' -``` - -### Music Generation -```bash -curl http://100.100.108.13:9000/v1/audio/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "musicgen-medium", "prompt": "upbeat electronic", "duration": 30}' -``` - -## Infrastructure - -**Provider**: RunPod (Spot Instance) -**GPU**: NVIDIA RTX 4090 24GB VRAM -**Region**: Europe -**Network**: Tailscale VPN (100.100.108.13) -**Storage**: 922TB network volume at `/workspace` - -## Monitoring - -```bash -# Check active model -curl http://100.100.108.13:9000/health - -# View orchestrator logs -docker logs -f ai_orchestrator - -# GPU usage -nvidia-smi -``` - -## Support - -For issues: -1. Check orchestrator logs: `docker logs ai_orchestrator` -2. Review [docs/DEPLOYMENT.md](docs/DEPLOYMENT.md#troubleshooting) -3. Check [docs/GPU_DEPLOYMENT_LOG.md](docs/GPU_DEPLOYMENT_LOG.md) for deployment history - -## License - -Built with: -- [vLLM](https://github.com/vllm-project/vllm) - Apache 2.0 -- [AudioCraft](https://github.com/facebookresearch/audiocraft) - MIT (code), CC-BY-NC (weights) -- [Flux.1](https://github.com/black-forest-labs/flux) - Apache 2.0 -- [LiteLLM](https://github.com/BerriAI/litellm) - MIT - -**Note**: MusicGen pre-trained weights are non-commercial (CC-BY-NC). 
diff --git a/compose.yaml b/compose.yaml deleted file mode 100644 index 9ddfe84..0000000 --- a/compose.yaml +++ /dev/null @@ -1,104 +0,0 @@ -version: '3.8' - -# Multi-Modal AI Orchestration for RunPod RTX 4090 -# Manages text, image, and music generation with sequential model loading - -services: - # ============================================================================ - # ORCHESTRATOR (Always Running) - # ============================================================================ - orchestrator: - build: ./model-orchestrator - container_name: ai_orchestrator - ports: - - "9000:9000" - volumes: - - /var/run/docker.sock:/var/run/docker.sock:ro - - ./model-orchestrator/models.yaml:/app/models.yaml:ro - environment: - - MODELS_CONFIG=/app/models.yaml - - COMPOSE_PROJECT_NAME=ai - - GPU_MEMORY_GB=24 - restart: unless-stopped - network_mode: host - - # ============================================================================ - # TEXT GENERATION (vLLM + Qwen 2.5 7B) - # ============================================================================ - vllm-qwen: - build: ./vllm - container_name: ai_vllm-qwen_1 - ports: - - "8001:8000" - volumes: - - /workspace/huggingface_cache:/workspace/huggingface_cache - environment: - - HF_TOKEN=${HF_TOKEN} - - VLLM_HOST=0.0.0.0 - - VLLM_PORT=8000 - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["text"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - - # ============================================================================ - # IMAGE GENERATION (Flux.1 Schnell) - # ============================================================================ - flux: - image: ghcr.io/matatonic/openedai-images-flux:latest - container_name: ai_flux_1 - ports: - - "8002:5005" - volumes: - - /workspace/flux/models:/app/models - - ./flux/config:/app/config:ro - environment: - - HF_TOKEN=${HF_TOKEN} - - CONFIG_PATH=/app/config/config.json - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["image"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - - # ============================================================================ - # MUSIC GENERATION (MusicGen Medium) - # ============================================================================ - musicgen: - build: ./musicgen - container_name: ai_musicgen_1 - ports: - - "8003:8000" - volumes: - - /workspace/musicgen/models:/app/models - environment: - - HF_TOKEN=${HF_TOKEN} - - MODEL_NAME=facebook/musicgen-medium - - HOST=0.0.0.0 - - PORT=8000 - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["audio"] # Only start when requested by orchestrator - restart: "no" # Orchestrator manages lifecycle - -# ============================================================================ -# VOLUMES -# ============================================================================ -# Model caches are stored on RunPod's /workspace directory (922TB network volume) -# This persists across pod restarts and reduces model download times - -# No named volumes - using host paths on RunPod /workspace diff --git a/core/base_service.py b/core/base_service.py deleted file mode 100644 index 9a313c1..0000000 --- a/core/base_service.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 -""" -Base Service Class for AI Model Services - -Provides common functionality 
for all model services: -- Health check endpoint -- Graceful shutdown handling -- Logging configuration -- Standard FastAPI setup -""" - -import asyncio -import logging -import os -import signal -import sys -from abc import ABC, abstractmethod -from typing import Optional - -from fastapi import FastAPI -import uvicorn - - -class BaseService(ABC): - """Abstract base class for all AI model services""" - - def __init__(self, name: str, port: int, host: str = "0.0.0.0"): - """ - Initialize base service - - Args: - name: Service name (for logging) - port: Port to run service on - host: Host to bind to (default: 0.0.0.0) - """ - self.name = name - self.port = port - self.host = host - self.app = FastAPI(title=f"{name} Service", version="1.0.0") - self.logger = self._setup_logging() - self.shutdown_event = asyncio.Event() - - # Register standard endpoints - self._register_health_endpoint() - - # Register signal handlers for graceful shutdown - self._register_signal_handlers() - - # Allow subclasses to add custom routes - self.create_app() - - def _setup_logging(self) -> logging.Logger: - """Configure logging for the service""" - logging.basicConfig( - level=logging.INFO, - format=f'%(asctime)s - {self.name} - %(levelname)s - %(message)s', - handlers=[ - logging.StreamHandler(sys.stdout) - ] - ) - return logging.getLogger(self.name) - - def _register_health_endpoint(self): - """Register standard health check endpoint""" - @self.app.get("/health") - async def health_check(): - """Health check endpoint""" - return { - "status": "healthy", - "service": self.name, - "port": self.port - } - - def _register_signal_handlers(self): - """Register signal handlers for graceful shutdown""" - def signal_handler(sig, frame): - self.logger.info(f"Received signal {sig}, initiating graceful shutdown...") - self.shutdown_event.set() - - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - @abstractmethod - def create_app(self): - """ - Create FastAPI routes for this service. - Subclasses must implement this to add their specific endpoints. - - Example: - @self.app.post("/v1/generate") - async def generate(request: MyRequest): - return await self.model.generate(request) - """ - pass - - async def initialize(self): - """ - Initialize the service (load models, etc.). - Subclasses can override this for custom initialization. - """ - self.logger.info(f"Initializing {self.name} service...") - - async def cleanup(self): - """ - Cleanup resources on shutdown. - Subclasses can override this for custom cleanup. - """ - self.logger.info(f"Cleaning up {self.name} service...") - - def run(self): - """ - Run the service. - This is the main entry point that starts the FastAPI server. - """ - try: - self.logger.info(f"Starting {self.name} service on {self.host}:{self.port}") - - # Run initialization - asyncio.run(self.initialize()) - - # Start uvicorn server - config = uvicorn.Config( - app=self.app, - host=self.host, - port=self.port, - log_level="info", - access_log=True - ) - server = uvicorn.Server(config) - - # Run server - asyncio.run(server.serve()) - - except KeyboardInterrupt: - self.logger.info("Keyboard interrupt received") - except Exception as e: - self.logger.error(f"Error running service: {e}", exc_info=True) - sys.exit(1) - finally: - # Cleanup - asyncio.run(self.cleanup()) - self.logger.info(f"{self.name} service stopped") - - -class GPUService(BaseService): - """ - Base class for GPU-accelerated services. - Provides additional GPU-specific functionality. 
- """ - - def __init__(self, name: str, port: int, host: str = "0.0.0.0"): - super().__init__(name, port, host) - self._check_gpu_availability() - - def _check_gpu_availability(self): - """Check if GPU is available""" - try: - import torch - if torch.cuda.is_available(): - gpu_count = torch.cuda.device_count() - gpu_name = torch.cuda.get_device_name(0) - self.logger.info(f"GPU available: {gpu_name} (count: {gpu_count})") - else: - self.logger.warning("No GPU available - service may run slowly") - except ImportError: - self.logger.warning("PyTorch not installed - cannot check GPU availability") diff --git a/core/requirements.txt b/core/requirements.txt deleted file mode 100644 index bf3186f..0000000 --- a/core/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Core dependencies for AI service infrastructure - -# FastAPI and server -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 - -# HTTP client for health checks and proxying -httpx==0.25.1 - -# YAML configuration -pyyaml==6.0.1 - -# Process management -psutil==5.9.6 diff --git a/core/service_manager.py b/core/service_manager.py deleted file mode 100644 index 4ecf930..0000000 --- a/core/service_manager.py +++ /dev/null @@ -1,301 +0,0 @@ -#!/usr/bin/env python3 -""" -Service Manager for AI Model Services - -Manages lifecycle of model services running as Python processes: -- Start/stop services -- Health monitoring -- Auto-restart on failure -- Resource cleanup -""" - -import asyncio -import logging -import os -import signal -import subprocess -import time -from dataclasses import dataclass -from pathlib import Path -from typing import Dict, Optional - -import httpx - - -@dataclass -class ServiceConfig: - """Configuration for a service""" - name: str - script_path: Path - port: int - startup_timeout: int = 120 - health_check_path: str = "/health" - auto_restart: bool = False - env: Optional[Dict[str, str]] = None - - -class ServiceManager: - """Manages multiple AI model services as subprocesses""" - - def __init__(self): - self.logger = logging.getLogger("ServiceManager") - self.processes: Dict[str, subprocess.Popen] = {} - self.configs: Dict[str, ServiceConfig] = {} - self.shutdown_event = asyncio.Event() - - def register_service(self, config: ServiceConfig): - """Register a service configuration""" - self.configs[config.name] = config - self.logger.info(f"Registered service: {config.name} on port {config.port}") - - async def start_service(self, name: str) -> bool: - """ - Start a service by name - - Args: - name: Service name to start - - Returns: - bool: True if service started successfully - """ - if name not in self.configs: - self.logger.error(f"Service {name} not registered") - return False - - if name in self.processes: - proc = self.processes[name] - if proc.poll() is None: - self.logger.info(f"Service {name} already running") - return True - - config = self.configs[name] - self.logger.info(f"Starting service {name}...") - - try: - # Prepare environment - env = os.environ.copy() - if config.env: - env.update(config.env) - env.update({ - 'PORT': str(config.port), - 'HOST': '0.0.0.0' - }) - - # Start process - proc = subprocess.Popen( - ['python3', str(config.script_path)], - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - preexec_fn=os.setsid # Create new process group - ) - - self.processes[name] = proc - self.logger.info(f"Process started for {name} (PID: {proc.pid})") - - # Wait for health check - if await self._wait_for_health(name, config): - self.logger.info(f"Service {name} is healthy and ready") - 
return True - else: - self.logger.error(f"Service {name} failed health check") - await self.stop_service(name) - return False - - except Exception as e: - self.logger.error(f"Error starting {name}: {e}", exc_info=True) - return False - - async def _wait_for_health(self, name: str, config: ServiceConfig) -> bool: - """ - Wait for service to become healthy - - Args: - name: Service name - config: Service configuration - - Returns: - bool: True if service becomes healthy within timeout - """ - proc = self.processes.get(name) - if not proc: - return False - - start_time = time.time() - url = f"http://localhost:{config.port}{config.health_check_path}" - - while time.time() - start_time < config.startup_timeout: - # Check if process is still running - if proc.poll() is not None: - self.logger.error(f"Process for {name} exited prematurely (code: {proc.returncode})") - return False - - # Try health check - try: - async with httpx.AsyncClient() as client: - response = await client.get(url, timeout=5.0) - if response.status_code == 200: - return True - except Exception: - pass - - await asyncio.sleep(2) - - return False - - async def stop_service(self, name: str, timeout: int = 10): - """ - Stop a running service - - Args: - name: Service name - timeout: Seconds to wait for graceful shutdown - """ - if name not in self.processes: - self.logger.warning(f"Service {name} not in process registry") - return - - proc = self.processes[name] - - if proc.poll() is None: # Still running - self.logger.info(f"Stopping service {name}...") - try: - # Send SIGTERM to process group - os.killpg(os.getpgid(proc.pid), signal.SIGTERM) - - # Wait for graceful shutdown - try: - proc.wait(timeout=timeout) - self.logger.info(f"Service {name} stopped gracefully") - except subprocess.TimeoutExpired: - # Force kill if not terminated - self.logger.warning(f"Service {name} did not stop gracefully, forcing kill") - os.killpg(os.getpgid(proc.pid), signal.SIGKILL) - proc.wait() - - except Exception as e: - self.logger.error(f"Error stopping {name}: {e}", exc_info=True) - - del self.processes[name] - - async def restart_service(self, name: str) -> bool: - """ - Restart a service - - Args: - name: Service name - - Returns: - bool: True if service restarted successfully - """ - self.logger.info(f"Restarting service {name}...") - await self.stop_service(name) - await asyncio.sleep(2) # Brief pause between stop and start - return await self.start_service(name) - - async def check_health(self, name: str) -> bool: - """ - Check if a service is healthy - - Args: - name: Service name - - Returns: - bool: True if service is running and healthy - """ - if name not in self.processes: - return False - - proc = self.processes[name] - if proc.poll() is not None: - return False - - config = self.configs[name] - url = f"http://localhost:{config.port}{config.health_check_path}" - - try: - async with httpx.AsyncClient() as client: - response = await client.get(url, timeout=5.0) - return response.status_code == 200 - except Exception: - return False - - async def monitor_services(self): - """ - Monitor all services and auto-restart if configured - - This runs continuously until shutdown_event is set. 
- """ - self.logger.info("Starting service monitor...") - - while not self.shutdown_event.is_set(): - for name, config in self.configs.items(): - if not config.auto_restart: - continue - - # Check if process exists and is healthy - if name in self.processes: - proc = self.processes[name] - if proc.poll() is not None: - self.logger.warning(f"Service {name} died (code: {proc.returncode}), restarting...") - await self.restart_service(name) - elif not await self.check_health(name): - self.logger.warning(f"Service {name} unhealthy, restarting...") - await self.restart_service(name) - - # Wait before next check - try: - await asyncio.wait_for(self.shutdown_event.wait(), timeout=10.0) - except asyncio.TimeoutError: - pass - - self.logger.info("Service monitor stopped") - - async def stop_all_services(self): - """Stop all running services""" - self.logger.info("Stopping all services...") - for name in list(self.processes.keys()): - await self.stop_service(name) - self.logger.info("All services stopped") - - def get_service_status(self, name: str) -> Dict: - """ - Get status information for a service - - Args: - name: Service name - - Returns: - dict: Status information - """ - if name not in self.configs: - return {"status": "unknown", "error": "Service not registered"} - - if name not in self.processes: - return {"status": "stopped"} - - proc = self.processes[name] - if proc.poll() is not None: - return { - "status": "exited", - "exit_code": proc.returncode - } - - config = self.configs[name] - return { - "status": "running", - "pid": proc.pid, - "port": config.port - } - - def get_all_service_status(self) -> Dict: - """ - Get status for all registered services - - Returns: - dict: Service name -> status mapping - """ - return { - name: self.get_service_status(name) - for name in self.configs.keys() - } diff --git a/docs/DEPLOYMENT.md b/docs/DEPLOYMENT.md deleted file mode 100644 index 2c86473..0000000 --- a/docs/DEPLOYMENT.md +++ /dev/null @@ -1,467 +0,0 @@ -# Multi-Modal AI Orchestration System - -**Cost-optimized AI infrastructure running text, image, and music generation on a single RunPod RTX 4090 GPU.** - -## Architecture Overview - -This system provides a unified API for multiple AI model types with automatic model switching on a single GPU (24GB VRAM). All requests route through an intelligent orchestrator that manages model lifecycle. 
- -### Components - -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ VPS (Tailscale: 100.102.217.79) โ”‚ -โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ -โ”‚ โ”‚ LiteLLM Proxy (Port 4000) โ”‚ โ”‚ -โ”‚ โ”‚ Routes to: Claude API + GPU Orchestrator โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ”‚ Tailscale VPN -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ RunPod GPU Server (Tailscale: 100.100.108.13) โ”‚ -โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ -โ”‚ โ”‚ Orchestrator (Port 9000) โ”‚ โ”‚ -โ”‚ โ”‚ Manages sequential model loading based on request type โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ -โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ -โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ -โ”‚ โ”‚vLLM โ”‚ โ”‚Flux.1 โ”‚ โ”‚MusicGen โ”‚ โ”‚ -โ”‚ โ”‚Qwen 2.5 7B โ”‚ โ”‚Schnell โ”‚ โ”‚Medium โ”‚ โ”‚ -โ”‚ โ”‚Port: 8001 โ”‚ โ”‚Port: 8002 โ”‚ โ”‚Port: 8003 โ”‚ โ”‚ -โ”‚ โ”‚VRAM: 14GB โ”‚ โ”‚VRAM: 14GB โ”‚ โ”‚VRAM: 11GB โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ -โ”‚ โ”‚ -โ”‚ Only ONE model active at a time (sequential loading) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ -``` - -### Features - -โœ… **Automatic Model Switching** - Orchestrator detects request type and loads appropriate model -โœ… **OpenAI-Compatible APIs** - Works with existing OpenAI clients and tools -โœ… **Cost-Optimized** - Sequential loading on single GPU (~$0.50/hr vs ~$0.75/hr for multi-GPU) -โœ… **Easy Model Addition** - Add new models by editing YAML config -โœ… **Centralized Routing** - LiteLLM proxy provides unified API for all models -โœ… **GPU Memory Safe** - Orchestrator ensures only one model loaded at a time - -## Supported Model Types - -### Text Generation -- **Qwen 2.5 7B Instruct** (facebook/Qwen2.5-7B-Instruct) -- VRAM: 14GB | Speed: Fast | OpenAI-compatible chat API - -### Image Generation -- **Flux.1 Schnell** (black-forest-labs/FLUX.1-schnell) -- VRAM: 14GB | Speed: 4-5 sec/image | OpenAI DALL-E compatible API - -### Music Generation -- **MusicGen Medium** (facebook/musicgen-medium) -- VRAM: 11GB | Speed: 60-90 sec for 30s audio | Custom audio API - -## Quick 
Start - -### 1. Prerequisites - -```bash -# On RunPod GPU server -- RunPod RTX 4090 instance (24GB VRAM) -- Docker & Docker Compose installed -- Tailscale VPN configured -- HuggingFace token (for model downloads) -``` - -### 2. Clone & Configure - -```bash -# On local machine -cd ai/ - -# Create environment file -cp .env.example .env -# Edit .env and add your HF_TOKEN -``` - -### 3. Deploy to RunPod - -```bash -# Copy all files to RunPod GPU server -scp -r ai/* gpu-pivoine:/workspace/ai/ - -# SSH to GPU server -ssh gpu-pivoine - -# Navigate to project -cd /workspace/ai/ - -# Start orchestrator (always running) -docker compose -f compose.yaml up -d orchestrator - -# Orchestrator will automatically manage model services as needed -``` - -### 4. Test Deployment - -```bash -# Check orchestrator health -curl http://100.100.108.13:9000/health - -# Test text generation (auto-loads vLLM) -curl http://100.100.108.13:9000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "qwen-2.5-7b", - "messages": [{"role": "user", "content": "Hello!"}] - }' - -# Test image generation (auto-switches to Flux) -curl http://100.100.108.13:9000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{ - "model": "flux-schnell", - "prompt": "a cute cat", - "size": "1024x1024" - }' - -# Test music generation (auto-switches to MusicGen) -curl http://100.100.108.13:9000/v1/audio/generations \ - -H "Content-Type: application/json" \ - -d '{ - "model": "musicgen-medium", - "prompt": "upbeat electronic dance music", - "duration": 30 - }' -``` - -### 5. Update VPS LiteLLM - -```bash -# On VPS, restart LiteLLM to pick up new config -ssh vps -cd ~/Projects/docker-compose -arty restart litellm -``` - -## Usage Examples - -### Via Open WebUI (https://ai.pivoine.art) - -**Text Generation:** -1. Select model: `qwen-2.5-7b` -2. Type message and send -3. Orchestrator loads vLLM automatically - -**Image Generation:** -1. Select model: `flux-schnell` -2. Enter image prompt -3. Orchestrator switches to Flux.1 - -**Music Generation:** -1. Select model: `musicgen-medium` -2. Describe the music you want -3. 
Orchestrator switches to MusicGen - -### Via API (Direct) - -```python -import openai - -# Configure client to use orchestrator -client = openai.OpenAI( - base_url="http://100.100.108.13:9000/v1", - api_key="dummy" # Not used but required -) - -# Text generation -response = client.chat.completions.create( - model="qwen-2.5-7b", - messages=[{"role": "user", "content": "Write a haiku"}] -) - -# Image generation -image = client.images.generate( - model="flux-schnell", - prompt="a sunset over mountains", - size="1024x1024" -) - -# Music generation (custom endpoint) -import requests -music = requests.post( - "http://100.100.108.13:9000/v1/audio/generations", - json={ - "model": "musicgen-medium", - "prompt": "calm piano music", - "duration": 30 - } -) -``` - -## Adding New Models - -### Step 1: Update `models.yaml` - -```yaml -# Add to ai/model-orchestrator/models.yaml -models: - llama-3.1-8b: # New model - type: text - framework: vllm - docker_service: vllm-llama - port: 8004 - vram_gb: 17 - startup_time_seconds: 120 - endpoint: /v1/chat/completions - description: "Llama 3.1 8B Instruct - Meta's latest model" -``` - -### Step 2: Add Docker Service - -```yaml -# Add to ai/compose.yaml -services: - vllm-llama: - build: ./vllm - container_name: ai_vllm-llama_1 - command: > - vllm serve meta-llama/Llama-3.1-8B-Instruct - --port 8000 --dtype bfloat16 - ports: - - "8004:8000" - environment: - - HF_TOKEN=${HF_TOKEN} - deploy: - resources: - reservations: - devices: - - driver: nvidia - count: 1 - capabilities: [gpu] - profiles: ["text"] - restart: "no" -``` - -### Step 3: Restart Orchestrator - -```bash -ssh gpu-pivoine -cd /workspace/ai/ -docker compose -f compose.yaml restart orchestrator -``` - -**That's it!** The orchestrator automatically detects the new model. - -## Management Commands - -### Orchestrator - -```bash -# Start orchestrator -docker compose -f compose.yaml up -d orchestrator - -# View orchestrator logs -docker logs -f ai_orchestrator - -# Restart orchestrator -docker compose -f compose.yaml restart orchestrator - -# Check active model -curl http://100.100.108.13:9000/health - -# List all models -curl http://100.100.108.13:9000/models -``` - -### Manual Model Control - -```bash -# Manually switch to specific model -curl -X POST http://100.100.108.13:9000/switch \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell"}' - -# Check which model is running -curl http://100.100.108.13:9000/health | jq '.current_model' -``` - -### Model Services - -```bash -# Manually start a specific model (bypassing orchestrator) -docker compose -f compose.yaml --profile text up -d vllm-qwen - -# Stop a model -docker compose -f compose.yaml stop vllm-qwen - -# View model logs -docker logs -f ai_vllm-qwen_1 -docker logs -f ai_flux_1 -docker logs -f ai_musicgen_1 -``` - -## Monitoring - -### GPU Usage - -```bash -ssh gpu-pivoine "nvidia-smi" -``` - -### Model Status - -```bash -# Which model is active? 
-curl http://100.100.108.13:9000/health - -# Model memory usage -curl http://100.100.108.13:9000/health | jq '{current: .current_model, vram: .model_info.vram_gb}' -``` - -### Performance - -```bash -# Orchestrator logs (model switching) -docker logs -f ai_orchestrator - -# Model-specific logs -docker logs -f ai_vllm-qwen_1 -docker logs -f ai_flux_1 -docker logs -f ai_musicgen_1 -``` - -## Troubleshooting - -### Model Won't Load - -```bash -# Check orchestrator logs -docker logs ai_orchestrator - -# Check if model service exists -docker compose -f compose.yaml config | grep -A 10 "vllm-qwen" - -# Manually test model service -docker compose -f compose.yaml --profile text up -d vllm-qwen -curl http://localhost:8001/health -``` - -### Orchestrator Can't Connect - -```bash -# Check Docker socket permissions -ls -l /var/run/docker.sock - -# Restart Docker daemon -sudo systemctl restart docker - -# Rebuild orchestrator -docker compose -f compose.yaml build orchestrator -docker compose -f compose.yaml up -d orchestrator -``` - -### Model Switching Too Slow - -```bash -# Check model startup times in models.yaml -# Adjust startup_time_seconds if needed - -# Pre-download models to /workspace cache -docker run --rm -it --gpus all \ - -v /workspace/huggingface_cache:/cache \ - -e HF_HOME=/cache \ - nvidia/cuda:12.4.0-runtime-ubuntu22.04 \ - huggingface-cli download facebook/musicgen-medium -``` - -## File Structure - -``` -ai/ -โ”œโ”€โ”€ compose.yaml # Main orchestration file -โ”œโ”€โ”€ .env.example # Environment template -โ”œโ”€โ”€ README.md # This file -โ”‚ -โ”œโ”€โ”€ model-orchestrator/ # Central orchestrator service -โ”‚ โ”œโ”€โ”€ orchestrator.py # FastAPI app managing models -โ”‚ โ”œโ”€โ”€ models.yaml # Model registry (EDIT TO ADD MODELS) -โ”‚ โ”œโ”€โ”€ Dockerfile -โ”‚ โ””โ”€โ”€ requirements.txt -โ”‚ -โ”œโ”€โ”€ vllm/ # Text generation (vLLM) -โ”‚ โ”œโ”€โ”€ server.py # Qwen 2.5 7B server -โ”‚ โ”œโ”€โ”€ Dockerfile -โ”‚ โ””โ”€โ”€ requirements.txt -โ”‚ -โ”œโ”€โ”€ flux/ # Image generation (Flux.1 Schnell) -โ”‚ โ””โ”€โ”€ config/ -โ”‚ โ””โ”€โ”€ config.json # Flux configuration -โ”‚ -โ”œโ”€โ”€ musicgen/ # Music generation (MusicGen) -โ”‚ โ”œโ”€โ”€ server.py # MusicGen API server -โ”‚ โ”œโ”€โ”€ Dockerfile -โ”‚ โ””โ”€โ”€ requirements.txt -โ”‚ -โ”œโ”€โ”€ litellm-config.yaml # LiteLLM proxy configuration -โ””โ”€โ”€ GPU_DEPLOYMENT_LOG.md # Deployment history and notes -``` - -## Cost Analysis - -### Current Setup (Single GPU) -- **Provider**: RunPod Spot Instance -- **GPU**: RTX 4090 24GB -- **Cost**: ~$0.50/hour -- **Monthly**: ~$360 (if running 24/7) -- **Optimized**: ~$120 (8 hours/day during business hours) - -### Alternative: Multi-GPU (All Models Always On) -- **GPUs**: 2ร— RTX 4090 -- **Cost**: ~$0.75/hour -- **Monthly**: ~$540 (if running 24/7) -- **Trade-off**: No switching latency, +$180/month - -### Recommendation -Stick with single GPU sequential loading for cost optimization. Model switching (30-120 seconds) is acceptable for most use cases. 
- -## Performance Expectations - -| Model | VRAM | Startup Time | Generation Speed | -|-------|------|--------------|------------------| -| Qwen 2.5 7B | 14GB | 120s | ~50 tokens/sec | -| Flux.1 Schnell | 14GB | 60s | ~4-5 sec/image | -| MusicGen Medium | 11GB | 45s | ~60-90 sec for 30s audio | - -**Model Switching**: 30-120 seconds (unload current + load new) - -## Security Notes - -- Orchestrator requires Docker socket access (`/var/run/docker.sock`) -- All services run on private Tailscale network -- No public exposure (only via VPS LiteLLM proxy) -- HuggingFace token stored in `.env` (not committed to git) - -## Future Enhancements - -1. โน๏ธ Add Llama 3.1 8B for alternative text generation -2. โน๏ธ Add Whisper Large v3 for speech-to-text -3. โน๏ธ Add XTTS v2 for text-to-speech -4. โน๏ธ Implement model preloading/caching for faster switching -5. โน๏ธ Add usage metrics and cost tracking -6. โน๏ธ Auto-stop GPU pod during idle periods - -## Support - -For issues or questions: -- Check orchestrator logs: `docker logs ai_orchestrator` -- View model-specific logs: `docker logs ai__1` -- Test direct model access: `curl http://localhost:/health` -- Review GPU deployment log: `GPU_DEPLOYMENT_LOG.md` - -## License - -Built with: -- [vLLM](https://github.com/vllm-project/vllm) - Apache 2.0 -- [AudioCraft](https://github.com/facebookresearch/audiocraft) - MIT (code), CC-BY-NC (weights) -- [Flux.1](https://github.com/black-forest-labs/flux) - Apache 2.0 -- [LiteLLM](https://github.com/BerriAI/litellm) - MIT - -**Note**: MusicGen pre-trained weights are non-commercial (CC-BY-NC). Train your own models for commercial use with the MIT-licensed code. diff --git a/docs/GPU_DEPLOYMENT_LOG.md b/docs/GPU_DEPLOYMENT_LOG.md deleted file mode 100644 index 34f30cd..0000000 --- a/docs/GPU_DEPLOYMENT_LOG.md +++ /dev/null @@ -1,421 +0,0 @@ -# GPU Server Deployment Log - -## Current Deployment (2025-11-21) - -### Infrastructure -- **Provider**: RunPod (Spot Instance) -- **GPU**: NVIDIA RTX 4090 24GB -- **Disk**: 50GB local SSD (expanded from 20GB) -- **Network Volume**: 922TB at `/workspace` -- **Region**: Europe -- **Cost**: ~$0.50/hour (~$360/month if running 24/7) - -### Network Configuration -- **VPN**: Tailscale (replaces WireGuard due to RunPod UDP restrictions) -- **GPU Server Tailscale IP**: 100.100.108.13 -- **VPS Tailscale IP**: (get with `tailscale ip -4` on VPS) - -### SSH Access -``` -Host gpu-pivoine - HostName 213.173.102.232 - Port 29695 - User root - IdentityFile ~/.ssh/id_ed25519 -``` - -**Note**: RunPod Spot instances can be terminated and restarted with new ports/IPs. Update SSH config accordingly. 
- -### Software Stack -- **Python**: 3.11.10 -- **vLLM**: 0.6.4.post1 (installed with pip) -- **PyTorch**: 2.5.1 with CUDA 12.4 -- **Tailscale**: Installed via official script - -### vLLM Deployment - -**Custom Server**: `ai/simple_vllm_server.py` -- Uses `AsyncLLMEngine` directly to bypass multiprocessing issues -- OpenAI-compatible API endpoints: - - `GET /v1/models` - List available models - - `POST /v1/completions` - Text completion - - `POST /v1/chat/completions` - Chat completion -- Default model: Qwen/Qwen2.5-7B-Instruct -- Cache directory: `/workspace/huggingface_cache` - -**Deployment Command**: -```bash -# Copy server script to GPU server -scp ai/simple_vllm_server.py gpu-pivoine:/workspace/ - -# Start server -ssh gpu-pivoine "cd /workspace && nohup python3 simple_vllm_server.py > vllm.log 2>&1 &" - -# Check status -ssh gpu-pivoine "curl http://localhost:8000/v1/models" -``` - -**Server Configuration** (environment variables): -- `VLLM_HOST`: 0.0.0.0 (default) -- `VLLM_PORT`: 8000 (default) - -### Model Configuration -- **Model**: Qwen/Qwen2.5-7B-Instruct (no auth required) -- **Context Length**: 4096 tokens -- **GPU Memory**: 85% utilization -- **Tensor Parallel**: 1 (single GPU) - -### Known Issues & Solutions - -#### Issue 1: vLLM Multiprocessing Errors -**Problem**: Default vLLM v1 engine fails with ZMQ/CUDA multiprocessing errors on RunPod. -**Solution**: Custom `AsyncLLMEngine` FastAPI server bypasses multiprocessing layer entirely. - -#### Issue 2: Disk Space (Solved) -**Problem**: Original 20GB disk filled up with Hugging Face cache. -**Solution**: Expanded to 50GB and use `/workspace` for model cache. - -#### Issue 3: Gated Models -**Problem**: Llama models require Hugging Face authentication. -**Solution**: Use Qwen 2.5 7B Instruct (no auth required) or set `HF_TOKEN` environment variable. - -#### Issue 4: Spot Instance Volatility -**Problem**: RunPod Spot instances can be terminated anytime. -**Solution**: Accept as trade-off for cost savings. Document SSH details for quick reconnection. - -### Monitoring - -**Check vLLM logs**: -```bash -ssh gpu-pivoine "tail -f /workspace/vllm.log" -``` - -**Check GPU usage**: -```bash -ssh gpu-pivoine "nvidia-smi" -``` - -**Check Tailscale status**: -```bash -ssh gpu-pivoine "tailscale status" -``` - -**Test API locally (on GPU server)**: -```bash -ssh gpu-pivoine "curl http://localhost:8000/v1/models" -``` - -**Test API via Tailscale (from VPS)**: -```bash -curl http://100.100.108.13:8000/v1/models -``` - -### LiteLLM Integration - -Update VPS LiteLLM config at `ai/litellm-config-gpu.yaml`: - -```yaml -# Replace old WireGuard IP (10.8.0.2) with Tailscale IP -- model_name: qwen-2.5-7b - litellm_params: - model: openai/qwen-2.5-7b - api_base: http://100.100.108.13:8000/v1 # Tailscale IP - api_key: dummy - rpm: 1000 - tpm: 100000 -``` - -Restart LiteLLM: -```bash -arty restart litellm -``` - -### Troubleshooting - -**Server not responding**: -1. Check if process is running: `pgrep -f simple_vllm_server` -2. Check logs: `tail -100 /workspace/vllm.log` -3. Check GPU availability: `nvidia-smi` -4. Restart server: `pkill -f simple_vllm_server && python3 /workspace/simple_vllm_server.py &` - -**Tailscale not connected**: -1. Check status: `tailscale status` -2. Check daemon: `ps aux | grep tailscaled` -3. Restart: `tailscale down && tailscale up` - -**Model download failing**: -1. Check disk space: `df -h` -2. Check cache directory: `ls -lah /workspace/huggingface_cache` -3. 
Clear cache if needed: `rm -rf /workspace/huggingface_cache/*`
-
-### Deployment Status ✅ COMPLETE
-
-**Deployment Date**: 2025-11-21
-
-1. ✅ Deploy vLLM with Qwen 2.5 7B - COMPLETE
-2. ✅ Test API endpoints locally and via Tailscale - COMPLETE
-3. ✅ Update VPS LiteLLM configuration - COMPLETE
-4. ✅ Test end-to-end: Open WebUI → LiteLLM → vLLM - COMPLETE
-5. ⏳ Monitor performance and costs - ONGOING
-
-**Model Available**: `qwen-2.5-7b` visible in Open WebUI at https://ai.pivoine.art
-
-### Next Steps (2025-11-21 Original)
-6. ✅ Consider adding more models → COMPLETE (added Flux.1 Schnell + MusicGen Medium)
-7. ⏹️ Set up auto-stop for idle periods to save costs
-
----
-
-## Multi-Modal Architecture (2025-11-21 Update)
-
-### Overview
-
-Expanded GPU deployment to support **text, image, and music generation** with intelligent model orchestration. All models run sequentially on a single RTX 4090 GPU with automatic switching based on request type.
-
-### Architecture Components
-
-#### 1. **Orchestrator Service** (Port 9000 - Always Running)
-- **Location**: `ai/model-orchestrator/`
-- **Purpose**: Central service managing model lifecycle
-- **Features**:
-  - Detects request type (text/image/audio)
-  - Automatically unloads current model
-  - Loads requested model
-  - Proxies requests to active model
-  - Tracks GPU memory usage
-- **Technology**: FastAPI + Docker SDK Python
-- **Endpoints**:
-  - `POST /v1/chat/completions` → Routes to text models
-  - `POST /v1/images/generations` → Routes to image models
-  - `POST /v1/audio/generations` → Routes to music models
-  - `GET /health` → Shows active model and status
-  - `GET /models` → Lists all available models
-  - `POST /switch` → Manually switch models
-
-#### 2. **Text Generation** (vLLM + Qwen 2.5 7B)
-- **Service**: `vllm-qwen` (Port 8001)
-- **Location**: `ai/vllm/`
-- **Model**: Qwen/Qwen2.5-7B-Instruct
-- **VRAM**: 14GB (85% GPU utilization)
-- **Speed**: ~50 tokens/second
-- **Startup**: 120 seconds
-- **Status**: ✅ Working (same as original deployment)
-
-#### 3. **Image Generation** (Flux.1 Schnell)
-- **Service**: `flux` (Port 8002)
-- **Location**: `ai/flux/`
-- **Model**: black-forest-labs/FLUX.1-schnell
-- **VRAM**: 14GB with CPU offloading
-- **Speed**: 4-5 seconds per image
-- **Startup**: 60 seconds
-- **Features**: OpenAI DALL-E compatible API
-- **Image**: `ghcr.io/matatonic/openedai-images-flux:latest`
-
-#### 4. **Music Generation** (MusicGen Medium)
-- **Service**: `musicgen` (Port 8003)
-- **Location**: `ai/musicgen/`
-- **Model**: facebook/musicgen-medium
-- **VRAM**: 11GB
-- **Speed**: 60-90 seconds for 30 seconds of audio
-- **Startup**: 45 seconds
-- **Features**: Text-to-music generation with sampling controls
-- **Technology**: Meta's AudioCraft + custom FastAPI wrapper
-
-### Model Registry (`models.yaml`)
-
-Simple configuration file for managing all models:
-
-```yaml
-models:
-  qwen-2.5-7b:
-    type: text
-    framework: vllm
-    docker_service: vllm-qwen
-    port: 8001
-    vram_gb: 14
-    startup_time_seconds: 120
-    endpoint: /v1/chat/completions
-
-  flux-schnell:
-    type: image
-    framework: openedai-images
-    docker_service: flux
-    port: 8002
-    vram_gb: 14
-    startup_time_seconds: 60
-    endpoint: /v1/images/generations
-
-  musicgen-medium:
-    type: audio
-    framework: audiocraft
-    docker_service: musicgen
-    port: 8003
-    vram_gb: 11
-    startup_time_seconds: 45
-    endpoint: /v1/audio/generations
-```
-
-**Adding new models**: Just add a new entry to this file and define the Docker service.
- -### Deployment Changes - -#### Docker Compose Structure -- **File**: `compose.yaml` -- **Services**: 4 total (1 orchestrator + 3 models) -- **Profiles**: `text`, `image`, `audio` (orchestrator manages activation) -- **Restart Policy**: `no` for models (orchestrator controls lifecycle) -- **Volumes**: All model caches on `/workspace` (922TB network volume) - -#### LiteLLM Integration -Updated `litellm-config.yaml` to route all self-hosted models through orchestrator: - -```yaml -# Text -- model_name: qwen-2.5-7b - api_base: http://100.100.108.13:9000/v1 # Orchestrator - -# Image -- model_name: flux-schnell - api_base: http://100.100.108.13:9000/v1 # Orchestrator - -# Music -- model_name: musicgen-medium - api_base: http://100.100.108.13:9000/v1 # Orchestrator -``` - -All models now available via Open WebUI at https://ai.pivoine.art - -### Usage Examples - -**Text Generation**: -```bash -curl http://100.100.108.13:9000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "qwen-2.5-7b", "messages": [{"role": "user", "content": "Hello"}]}' -``` - -**Image Generation**: -```bash -curl http://100.100.108.13:9000/v1/images/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell", "prompt": "a cute cat", "size": "1024x1024"}' -``` - -**Music Generation**: -```bash -curl http://100.100.108.13:9000/v1/audio/generations \ - -H "Content-Type: application/json" \ - -d '{"model": "musicgen-medium", "prompt": "upbeat electronic", "duration": 30}' -``` - -### Deployment Commands - -```bash -# Copy all files to RunPod -scp -r ai/* gpu-pivoine:/workspace/ai/ - -# SSH to GPU server -ssh gpu-pivoine -cd /workspace/ai/ - -# Start orchestrator (manages everything) -docker compose -f compose.yaml up -d orchestrator - -# Check status -curl http://100.100.108.13:9000/health - -# View logs -docker logs -f ai_orchestrator - -# Manually switch models (optional) -curl -X POST http://100.100.108.13:9000/switch \ - -H "Content-Type: application/json" \ - -d '{"model": "flux-schnell"}' -``` - -### Performance Characteristics - -| Model | VRAM | Startup Time | Generation Time | Notes | -|-------|------|--------------|-----------------|-------| -| Qwen 2.5 7B | 14GB | 120s | ~50 tok/sec | Fast text generation | -| Flux.1 Schnell | 14GB | 60s | 4-5s/image | High-quality images | -| MusicGen Medium | 11GB | 45s | 60-90s for 30s audio | Text-to-music | - -**Model Switching Overhead**: 30-120 seconds (unload + load) - -### Cost Analysis - -**Current (Single GPU Sequential)**: -- Cost: ~$0.50/hour -- Monthly: ~$360 (24/7) or ~$120 (8hr/day) -- Trade-off: 30-120s switching time - -**Alternative (Multi-GPU Concurrent)**: -- Cost: ~$0.75/hour (+50%) -- Monthly: ~$540 (24/7) or ~$180 (8hr/day) -- Benefit: No switching time, all models always available - -**Decision**: Stick with single GPU for cost optimization. Switching time is acceptable for most use cases. - -### Known Limitations - -1. **Sequential Only**: Only one model active at a time -2. **Switching Latency**: 30-120 seconds to change models -3. **MusicGen License**: Pre-trained weights are CC-BY-NC (non-commercial) -4. 
**Spot Instance Volatility**: Pod can be terminated anytime.
-
-### Monitoring
-
-**Check active model**:
-```bash
-curl http://100.100.108.13:9000/health | jq '{model: .current_model, vram: .model_info.vram_gb}'
-```
-
-**View orchestrator logs**:
-```bash
-docker logs -f ai_orchestrator
-```
-
-**GPU usage**:
-```bash
-ssh gpu-pivoine "nvidia-smi"
-```
-
-### Deployment Status ✅ COMPLETE (Multi-Modal)
-
-**Deployment Date**: 2025-11-21
-
-1. ✅ Create model orchestrator service - COMPLETE
-2. ✅ Deploy vLLM text generation (Qwen 2.5 7B) - COMPLETE
-3. ✅ Deploy Flux.1 Schnell image generation - COMPLETE
-4. ✅ Deploy MusicGen Medium music generation - COMPLETE
-5. ✅ Update LiteLLM configuration - COMPLETE
-6. ✅ Test all three model types via orchestrator - READY FOR TESTING
-7. ⏳ Monitor performance and costs - ONGOING
-
-**Models Available**: `qwen-2.5-7b`, `flux-schnell`, `musicgen-medium` via Open WebUI
-
-### Future Model Additions
-
-**Easy to add** (just edit `models.yaml`):
-- Llama 3.1 8B Instruct (text, gated model)
-- Whisper Large v3 (speech-to-text)
-- XTTS v2 (text-to-speech)
-- Stable Diffusion XL (alternative image generation)
-
-See `README.md` for detailed instructions on adding new models.
-
-### Cost Optimization Ideas
-1. **Auto-stop**: Configure RunPod to auto-stop after 30 minutes idle
-2. **Spot Instances**: Already using Spot for 50% cost reduction
-3. **Scheduled Operation**: Run only during business hours (8 hours/day = $120/month)
-4. **Smaller Models**: Use Mistral 7B or quantized models for lighter workloads
-5. **Pay-as-you-go**: Manually start/stop pod as needed
-
-### Performance Benchmarks
-*To be measured after deployment*
-
-Expected (based on RTX 4090):
-- Qwen 2.5 7B: 50-80 tokens/second
-- Context processing: ~2-3 seconds for 1000 tokens
-- First token latency: ~200-300ms
diff --git a/docs/RUNPOD_TEMPLATE.md b/docs/RUNPOD_TEMPLATE.md
deleted file mode 100644
index 29621bc..0000000
--- a/docs/RUNPOD_TEMPLATE.md
+++ /dev/null
@@ -1,416 +0,0 @@
-# RunPod Template Creation Guide
-
-This guide shows you how to create a reusable RunPod template so you never have to reinstall everything from scratch when Spot instances restart.
-
-## Why Create a Template?
-
-**Without Template** (Manual Setup Every Time):
-- ❌ Install Docker & Docker Compose (10-15 min)
-- ❌ Install Tailscale (5 min)
-- ❌ Pull Docker images (10-20 min)
-- ❌ Download models: Qwen (~14GB), Flux (~12GB), MusicGen (~11GB) = 30-45 min
-- ❌ Configure everything (5-10 min)
-- **Total: 60-90 minutes per Spot instance restart**
-
-**With Template** (Ready to Go):
-- ✅ Everything pre-installed
-- ✅ Models cached in `/workspace`
-- ✅ Just start orchestrator
-- **Total: 2-3 minutes**
-
-## Template Contents
-
-### System Software
-- ✅ Docker 24.x + Docker Compose v2
-- ✅ Tailscale latest
-- ✅ NVIDIA Docker runtime
-- ✅ Python 3.11
-- ✅ Git, curl, wget, htop, nvtop
-
-### Docker Images (Pre-built)
-- ✅ `ai_orchestrator` - Model orchestration service
-- ✅ `ai_vllm-qwen_1` - Text generation (vLLM + Qwen 2.5 7B)
-- ✅ `ai_musicgen_1` - Music generation (AudioCraft)
-- ✅ `ghcr.io/matatonic/openedai-images-flux:latest` - Image generation
-
-### Model Cache (/workspace - Persistent)
-- ✅ Qwen 2.5 7B Instruct (~14GB)
-- ✅ Flux.1 Schnell (~12GB)
-- ✅ MusicGen Medium (~11GB)
-- **Total: ~37GB cached**
-
-### Project Files (/workspace/ai)
-- ✅ All orchestrator code
-- ✅ Docker Compose configurations
-- ✅ Model service configurations
-- ✅ Documentation
-
----
-
-## Step-by-Step Template Creation
-
-### Prerequisites
-1. RunPod account
-2. Active RTX 4090 pod (or similar GPU)
-3. SSH access to the pod
-4. This repository cloned locally
-
-### Step 1: Deploy Fresh Pod
-
-```bash
-# Create new RunPod instance:
-# - GPU: RTX 4090 (24GB VRAM)
-# - Disk: 50GB container disk
-# - Network Volume: Attach or create 100GB+ volume
-# - Template: Start with official PyTorch or CUDA template
-
-# Note the SSH connection details (host, port, password)
-```
-
-### Step 2: Prepare the Instance
-
-Run the automated preparation script:
-
-```bash
-# On your local machine, copy everything to RunPod
-scp -P -r /home/valknar/Projects/runpod/* root@:/workspace/ai/
-
-# SSH to the pod
-ssh -p root@
-
-# Run the preparation script
-cd /workspace/ai
-chmod +x scripts/prepare-template.sh
-./scripts/prepare-template.sh
-```
-
-**What the script does:**
-1. Installs Docker & Docker Compose
-2. Installs Tailscale
-3. Builds all Docker images
-4. Pre-downloads all models
-5. Validates everything works
-6. Cleans up temporary files
-
-**Estimated time: 45-60 minutes**
-
-### Step 3: Manual Verification
-
-After the script completes, verify everything:
-
-```bash
-# Check Docker is installed
-docker --version
-docker compose version
-
-# Check Tailscale
-tailscale version
-
-# Check all images are built
-docker images | grep ai_
-
-# Check models are cached
-ls -lh /workspace/huggingface_cache/
-ls -lh /workspace/flux/models/
-ls -lh /workspace/musicgen/models/
-
-# Test orchestrator starts
-cd /workspace/ai
-docker compose -f compose.yaml up -d orchestrator
-docker logs ai_orchestrator
-
-# Test model loading (should be fast since models are cached)
-curl http://localhost:9000/health
-
-# Stop orchestrator
-docker compose -f compose.yaml down
-```
-
-### Step 4: Clean Up Before Saving
-
-**IMPORTANT**: Remove secrets and temporary data before creating template!
- -```bash -# Remove sensitive data -rm -f /workspace/ai/.env -rm -f /root/.ssh/known_hosts -rm -f /root/.bash_history - -# Clear logs -rm -f /var/log/*.log -docker system prune -af --volumes # Clean Docker cache but keep images - -# Clear Tailscale state (will re-authenticate on first use) -tailscale logout - -# Create template-ready marker -echo "RunPod Multi-Modal AI Template v1.0" > /workspace/TEMPLATE_VERSION -echo "Created: $(date)" >> /workspace/TEMPLATE_VERSION -``` - -### Step 5: Save Template in RunPod Dashboard - -1. **Go to RunPod Dashboard** โ†’ "My Pods" -2. **Select your prepared pod** -3. **Click "โ‹ฎ" menu** โ†’ "Save as Template" -4. **Template Configuration**: - - **Name**: `multi-modal-ai-v1.0` - - **Description**: - ``` - Multi-Modal AI Stack with Orchestrator - - Text: vLLM + Qwen 2.5 7B - - Image: Flux.1 Schnell - - Music: MusicGen Medium - - Models pre-cached (~37GB) - - Ready to deploy in 2-3 minutes - ``` - - **Category**: `AI/ML` - - **Docker Image**: (auto-detected) - - **Container Disk**: 50GB - - **Expose Ports**: 9000, 8001, 8002, 8003 - - **Environment Variables** (optional): - ``` - HF_TOKEN= - TAILSCALE_AUTHKEY= - ``` - -5. **Click "Save Template"** -6. **Wait for template creation** (5-10 minutes) -7. **Test the template** by deploying a new pod with it - ---- - -## Using Your Template - -### Deploy New Pod from Template - -1. **RunPod Dashboard** โ†’ "โž• Deploy" -2. **Select "Community Templates"** or "My Templates" -3. **Choose**: `multi-modal-ai-v1.0` -4. **Configure**: - - GPU: RTX 4090 (or compatible) - - Network Volume: Attach your existing volume with `/workspace` mount - - Environment: - - `HF_TOKEN`: Your Hugging Face token - - (Tailscale will be configured via SSH) - -5. **Deploy Pod** - -### First-Time Setup (On New Pod) - -```bash -# SSH to the new pod -ssh -p root@ - -# Navigate to project -cd /workspace/ai - -# Create .env file -cat > .env < - -# Start orchestrator (models already cached, starts in seconds!) -docker compose -f compose.yaml up -d orchestrator - -# Verify -curl http://localhost:9000/health - -# Check logs -docker logs -f ai_orchestrator -``` - -**Total setup time: 2-3 minutes!** ๐ŸŽ‰ - -### Updating SSH Config (If Spot Instance Restarts) - -Since Spot instances can restart with new IPs/ports: - -```bash -# On your local machine -# Update ~/.ssh/config with new connection details - -Host gpu-pivoine - HostName - Port - User root - IdentityFile ~/.ssh/id_ed25519 -``` - ---- - -## Template Maintenance - -### Updating the Template - -When you add new models or make improvements: - -1. Deploy a pod from your existing template -2. Make your changes -3. Test everything -4. Clean up (remove secrets) -5. Save as new template version: `multi-modal-ai-v1.1` -6. 
Update your documentation - -### Version History - -Keep track of template versions: - -``` -v1.0 (2025-11-21) - Initial release -- Text: Qwen 2.5 7B -- Image: Flux.1 Schnell -- Music: MusicGen Medium -- Docker orchestrator - -v1.1 (future) - Planned -- Add Llama 3.1 8B -- Add Whisper Large v3 -- Optimize model loading -``` - ---- - -## Troubleshooting Template Creation - -### Models Not Downloading - -```bash -# Manually trigger model downloads -docker compose --profile text up -d vllm-qwen -docker logs -f ai_vllm-qwen_1 -# Wait for "Model loaded successfully" -docker compose stop vllm-qwen - -# Repeat for other models -docker compose --profile image up -d flux -docker compose --profile audio up -d musicgen -``` - -### Docker Images Not Building - -```bash -# Build images one at a time -docker compose -f compose.yaml build orchestrator -docker compose -f compose.yaml build vllm-qwen -docker compose -f compose.yaml build musicgen - -# Check build logs for errors -docker compose -f compose.yaml build --no-cache --progress=plain orchestrator -``` - -### Tailscale Won't Install - -```bash -# Manual Tailscale installation -curl -fsSL https://tailscale.com/install.sh | sh - -# Start daemon -tailscaled --tun=userspace-networking --socks5-server=localhost:1055 & - -# Test -tailscale version -``` - -### Template Too Large - -RunPod templates have size limits. If your template is too large: - -**Option 1**: Use network volume for models -- Move models to network volume: `/workspace/models/` -- Mount volume when deploying from template -- Models persist across pod restarts - -**Option 2**: Reduce cached models -- Only cache most-used model (Qwen 2.5 7B) -- Download others on first use -- Accept slightly longer first-time startup - -**Option 3**: Use Docker layer optimization -```dockerfile -# In Dockerfile, order commands by change frequency -# Less frequently changed layers first -``` - ---- - -## Cost Analysis - -### Template Storage Cost -- RunPod charges for template storage: ~$0.10/GB/month -- This template: ~50GB = **~$5/month** -- **Worth it!** Saves 60-90 minutes per Spot restart - -### Time Savings -- Spot instance restarts: 2-5 times per week (highly variable) -- Time saved per restart: 60-90 minutes -- **Total saved per month: 8-20 hours** -- **Value: Priceless for rapid deployment** - ---- - -## Advanced: Automated Template Updates - -Create a CI/CD pipeline to automatically update templates: - -```bash -# GitHub Actions workflow (future enhancement) -# 1. Deploy pod from template -# 2. Pull latest code -# 3. Rebuild images -# 4. Test -# 5. Save new template version -# 6. Notify team -``` - ---- - -## Template Checklist - -Before saving your template, verify: - -- [ ] All Docker images built and working -- [ ] All models downloaded and cached -- [ ] Tailscale installed (but logged out) -- [ ] Docker Compose files present -- [ ] `.env` file removed (secrets cleared) -- [ ] Logs cleared -- [ ] SSH keys removed -- [ ] Bash history cleared -- [ ] Template version documented -- [ ] Test deployment successful - ---- - -## Support - -If you have issues creating the template: - -1. Check `/workspace/ai/scripts/prepare-template.sh` logs -2. Review Docker build logs: `docker compose build --progress=plain` -3. Check model download logs: `docker logs ` -4. Verify disk space: `df -h` -5. 
Check network volume is mounted: `mount | grep workspace` - -For RunPod-specific issues: -- RunPod Docs: https://docs.runpod.io/ -- RunPod Discord: https://discord.gg/runpod - ---- - -## Next Steps - -After creating your template: - -1. โœ… Test deployment from template -2. โœ… Document in `GPU_DEPLOYMENT_LOG.md` -3. โœ… Share template ID with team (if applicable) -4. โœ… Set up monitoring (Netdata, etc.) -5. โœ… Configure auto-stop for cost optimization -6. โœ… Add more models as needed - -**Your multi-modal AI infrastructure is now portable and reproducible!** ๐Ÿš€ diff --git a/flux/config/config.json b/flux/config/config.json deleted file mode 100644 index 50d9669..0000000 --- a/flux/config/config.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "model": "flux-schnell", - "offload": true, - "sequential_cpu_offload": false, - "vae_tiling": true, - "enable_model_cpu_offload": true, - "low_vram_mode": false, - "torch_compile": false, - "safety_checker": false, - "watermark": false, - "flux_device": "cuda", - "compile": false -} diff --git a/model-orchestrator/Dockerfile b/model-orchestrator/Dockerfile deleted file mode 100644 index bcee1e9..0000000 --- a/model-orchestrator/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \ - curl \ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements and install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY orchestrator.py . -COPY models.yaml . - -# Expose port -EXPOSE 9000 - -# Run the orchestrator -CMD ["python", "orchestrator.py"] diff --git a/model-orchestrator/models.yaml b/model-orchestrator/models.yaml index f2f55bc..d3e6a4b 100644 --- a/model-orchestrator/models.yaml +++ b/model-orchestrator/models.yaml @@ -6,7 +6,7 @@ models: qwen-2.5-7b: type: text framework: vllm - service_script: vllm/server.py + service_script: models/vllm/server.py port: 8001 vram_gb: 14 startup_time_seconds: 120 @@ -17,7 +17,7 @@ models: flux-schnell: type: image framework: openedai-images - service_script: flux/server.py + service_script: models/flux/server.py port: 8002 vram_gb: 14 startup_time_seconds: 60 @@ -28,7 +28,7 @@ models: musicgen-medium: type: audio framework: audiocraft - service_script: musicgen/server.py + service_script: models/musicgen/server.py port: 8003 vram_gb: 11 startup_time_seconds: 45 diff --git a/models/vllm/requirements.txt b/models/vllm/requirements.txt index 146b23a..3c1d6f1 100644 --- a/models/vllm/requirements.txt +++ b/models/vllm/requirements.txt @@ -1,13 +1,4 @@ # vLLM Text Generation Service Dependencies -# vLLM engine +# vLLM engine (will install compatible torch, transformers, etc.) vllm==0.6.4.post1 - -# PyTorch (required by vLLM) -torch==2.1.0 - -# Transformers (for model loading) -transformers==4.36.0 - -# Additional dependencies -accelerate==0.25.0 diff --git a/models/vllm/server.py b/models/vllm/server.py index d23ffc4..0075bd2 100644 --- a/models/vllm/server.py +++ b/models/vllm/server.py @@ -1,27 +1,35 @@ #!/usr/bin/env python3 """ -vLLM Text Generation Service - -OpenAI-compatible text generation using vLLM and Qwen 2.5 7B Instruct model. -Provides /v1/completions and /v1/chat/completions endpoints. 
+Simple vLLM server using AsyncLLMEngine directly +Bypasses the multiprocessing issues we hit with the default vLLM API server +OpenAI-compatible endpoints: /v1/models and /v1/completions """ import asyncio import json +import logging import os from typing import AsyncIterator, Dict, List, Optional -from fastapi import Request +from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, StreamingResponse from pydantic import BaseModel, Field from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams from vllm.utils import random_uuid -# Import base service class -import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../..')) -from core.base_service import GPUService +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) +# FastAPI app +app = FastAPI(title="Simple vLLM Server", version="1.0.0") + +# Global engine instance +engine: Optional[AsyncLLMEngine] = None +model_name: str = "Qwen/Qwen2.5-7B-Instruct" # Request/Response models class CompletionRequest(BaseModel): @@ -37,13 +45,11 @@ class CompletionRequest(BaseModel): presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - class ChatMessage(BaseModel): """Chat message format""" role: str = Field(..., description="Role: system, user, or assistant") content: str = Field(..., description="Message content") - class ChatCompletionRequest(BaseModel): """OpenAI-compatible chat completion request""" model: str = Field(default="qwen-2.5-7b") @@ -55,243 +61,242 @@ class ChatCompletionRequest(BaseModel): stream: bool = Field(default=False) stop: Optional[str | List[str]] = None +@app.on_event("startup") +async def startup_event(): + """Initialize vLLM engine on startup""" + global engine, model_name -class VLLMService(GPUService): - """vLLM text generation service""" + logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - def __init__(self): - # Get port from environment or use default - port = int(os.getenv("PORT", "8001")) - super().__init__(name="vllm-qwen", port=port) + # Configure engine + engine_args = AsyncEngineArgs( + model=model_name, + tensor_parallel_size=1, # Single GPU + gpu_memory_utilization=0.85, # Use 85% of GPU memory + max_model_len=4096, # Context length + dtype="auto", # Auto-detect dtype + download_dir="/workspace/huggingface_cache", # Large disk + trust_remote_code=True, # Some models require this + enforce_eager=False, # Use CUDA graphs for better performance + ) - # Service-specific attributes - self.engine: Optional[AsyncLLMEngine] = None - self.model_name = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-7B-Instruct") + # Create async engine + engine = AsyncLLMEngine.from_engine_args(engine_args) - async def initialize(self): - """Initialize vLLM engine""" - await super().initialize() + logger.info("vLLM AsyncLLMEngine initialized successfully") - self.logger.info(f"Initializing vLLM AsyncLLMEngine with model: {self.model_name}") +@app.get("/") +async def root(): + """Health check endpoint""" + return {"status": "ok", "model": model_name} - # Configure engine - engine_args = AsyncEngineArgs( - model=self.model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir=os.getenv("HF_CACHE_DIR", "/workspace/huggingface_cache"), - trust_remote_code=True, 
# Some models require this - enforce_eager=False, # Use CUDA graphs for better performance +@app.get("/health") +async def health(): + """Detailed health check""" + return { + "status": "healthy" if engine else "initializing", + "model": model_name, + "ready": engine is not None + } + +@app.get("/v1/models") +async def list_models(): + """OpenAI-compatible models endpoint""" + return { + "object": "list", + "data": [ + { + "id": "qwen-2.5-7b", + "object": "model", + "created": 1234567890, + "owned_by": "pivoine-gpu", + "permission": [], + "root": model_name, + "parent": None, + } + ] + } + +def messages_to_prompt(messages: List[ChatMessage]) -> str: + """Convert chat messages to a single prompt string""" + # Qwen 2.5 chat template format + prompt_parts = [] + + for msg in messages: + role = msg.role + content = msg.content + + if role == "system": + prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") + elif role == "user": + prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") + elif role == "assistant": + prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") + + # Add final assistant prompt + prompt_parts.append("<|im_start|>assistant\n") + + return "\n".join(prompt_parts) + +@app.post("/v1/completions") +async def create_completion(request: CompletionRequest): + """OpenAI-compatible completion endpoint""" + if not engine: + return JSONResponse( + status_code=503, + content={"error": "Engine not initialized"} ) - # Create async engine - self.engine = AsyncLLMEngine.from_engine_args(engine_args) + # Handle both single prompt and batch prompts + prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - self.logger.info("vLLM AsyncLLMEngine initialized successfully") + # Configure sampling parameters + sampling_params = SamplingParams( + temperature=request.temperature, + top_p=request.top_p, + max_tokens=request.max_tokens, + n=request.n, + stop=request.stop if request.stop else [], + presence_penalty=request.presence_penalty, + frequency_penalty=request.frequency_penalty, + ) - async def cleanup(self): - """Cleanup resources""" - await super().cleanup() - if self.engine: - # vLLM doesn't have an explicit shutdown method - self.logger.info("vLLM engine cleanup") - self.engine = None + # Generate completions + results = [] + for prompt in prompts: + request_id = random_uuid() - def messages_to_prompt(self, messages: List[ChatMessage]) -> str: - """Convert chat messages to Qwen 2.5 prompt format""" - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - - def create_app(self): - """Create FastAPI routes""" - - @self.app.get("/") - async def root(): - """Root endpoint""" - return {"status": "ok", "model": self.model_name} - - @self.app.get("/v1/models") - async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", + if request.stream: + # Streaming response + async def generate_stream(): + async for output in engine.generate(prompt, sampling_params, request_id): + chunk = { + "id": request_id, + "object": 
"text_completion", "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": self.model_name, - "parent": None, - } - ] - } - - @self.app.post("/v1/completions") - async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not self.engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in self.engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] + "model": request.model, + "choices": [ + { + "text": output.outputs[0].text, + "index": 0, + "logprobs": None, + "finish_reason": output.outputs[0].finish_reason, } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" + ] + } + yield f"data: {json.dumps(chunk)}\n\n" + yield "data: [DONE]\n\n" - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in self.engine.generate(prompt, sampling_params, request_id): - final_output = output + return StreamingResponse(generate_stream(), media_type="text/event-stream") + else: + # Non-streaming response + async for output in engine.generate(prompt, sampling_params, request_id): + final_output = output - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": final_output.outputs[0].finish_reason, - }) + results.append({ + "text": final_output.outputs[0].text, + "index": len(results), + "logprobs": None, + "finish_reason": final_output.outputs[0].finish_reason, + }) - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } + return { + "id": random_uuid(), + "object": "text_completion", + "created": 1234567890, + "model": request.model, + "choices": results, + "usage": { + "prompt_tokens": 0, # vLLM doesn't expose this easily + "completion_tokens": 0, + "total_tokens": 0, + } + } - @self.app.post("/v1/chat/completions") - async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not self.engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) +@app.post("/v1/chat/completions") +async def create_chat_completion(request: ChatCompletionRequest): + """OpenAI-compatible chat completion endpoint""" + if not engine: + return JSONResponse( + status_code=503, + content={"error": "Engine not initialized"} + ) - # Convert messages to prompt - prompt = 
self.messages_to_prompt(request.messages) + # Convert messages to prompt + prompt = messages_to_prompt(request.messages) - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) + # Configure sampling parameters + sampling_params = SamplingParams( + temperature=request.temperature, + top_p=request.top_p, + max_tokens=request.max_tokens, + n=request.n, + stop=request.stop if request.stop else ["<|im_end|>"], + ) - request_id = random_uuid() + request_id = random_uuid() - if request.stream: - # Streaming response - async def generate_stream(): - async for output in self.engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in self.engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { + if request.stream: + # Streaming response + async def generate_stream(): + async for output in engine.generate(prompt, sampling_params, request_id): + chunk = { "id": request_id, - "object": "chat.completion", + "object": "chat.completion.chunk", "created": 1234567890, "model": request.model, "choices": [ { "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, + "delta": {"content": output.outputs[0].text}, + "finish_reason": output.outputs[0].finish_reason, } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } + ] } + yield f"data: {json.dumps(chunk)}\n\n" + yield "data: [DONE]\n\n" + return StreamingResponse(generate_stream(), media_type="text/event-stream") + else: + # Non-streaming response + async for output in engine.generate(prompt, sampling_params, request_id): + final_output = output + + return { + "id": request_id, + "object": "chat.completion", + "created": 1234567890, + "model": request.model, + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": final_output.outputs[0].text, + }, + "finish_reason": final_output.outputs[0].finish_reason, + } + ], + "usage": { + "prompt_tokens": 0, + "completion_tokens": 0, + "total_tokens": 0, + } + } if __name__ == "__main__": - service = VLLMService() - service.run() + import uvicorn + + # Get configuration from environment + host = os.getenv("VLLM_HOST", "0.0.0.0") + port = int(os.getenv("VLLM_PORT", "8000")) + + logger.info(f"Starting vLLM server on {host}:{port}") + + uvicorn.run( + app, + host=host, + port=port, + log_level="info", + access_log=True, + ) diff --git a/musicgen/Dockerfile b/musicgen/Dockerfile deleted file mode 100644 index 5044496..0000000 --- a/musicgen/Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 - -WORKDIR /app - -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3-pip \ - ffmpeg \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Upgrade pip -RUN pip3 install --no-cache-dir --upgrade pip - -# Install 
PyTorch with CUDA support -RUN pip3 install --no-cache-dir torch==2.1.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121 - -# Copy requirements and install dependencies -COPY requirements.txt . -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application code -COPY server.py . - -# Create directory for model cache -RUN mkdir -p /app/models - -# Environment variables -ENV HF_HOME=/app/models -ENV TORCH_HOME=/app/models -ENV MODEL_NAME=facebook/musicgen-medium - -# Expose port -EXPOSE 8000 - -# Run the server -CMD ["python3", "server.py"] diff --git a/musicgen/requirements.txt b/musicgen/requirements.txt deleted file mode 100644 index 37cf773..0000000 --- a/musicgen/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -torch==2.1.0 -torchaudio==2.1.0 -audiocraft==1.3.0 -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 diff --git a/musicgen/server.py b/musicgen/server.py deleted file mode 100644 index 5ea6218..0000000 --- a/musicgen/server.py +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env python3 -""" -MusicGen API Server -OpenAI-compatible API for music generation using Meta's MusicGen - -Endpoints: -- POST /v1/audio/generations - Generate music from text prompt -- GET /health - Health check -- GET / - Service info -""" - -import base64 -import io -import logging -import os -import tempfile -from typing import Optional - -import torch -import torchaudio -from audiocraft.models import MusicGen -from fastapi import FastAPI, HTTPException -from fastapi.responses import JSONResponse -from pydantic import BaseModel, Field - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="MusicGen API Server", version="1.0.0") - -# Global model instance -model: Optional[MusicGen] = None -model_name: str = os.getenv("MODEL_NAME", "facebook/musicgen-medium") -device: str = "cuda" if torch.cuda.is_available() else "cpu" - - -class AudioGenerationRequest(BaseModel): - """Music generation request""" - model: str = Field(default="musicgen-medium", description="Model name") - prompt: str = Field(..., description="Text description of the music to generate") - duration: float = Field(default=30.0, ge=1.0, le=30.0, description="Duration in seconds") - temperature: float = Field(default=1.0, ge=0.1, le=2.0, description="Sampling temperature") - top_k: int = Field(default=250, ge=0, le=500, description="Top-k sampling") - top_p: float = Field(default=0.0, ge=0.0, le=1.0, description="Top-p (nucleus) sampling") - cfg_coef: float = Field(default=3.0, ge=1.0, le=15.0, description="Classifier-free guidance coefficient") - response_format: str = Field(default="wav", description="Audio format (wav or mp3)") - - -class AudioGenerationResponse(BaseModel): - """Music generation response""" - audio: str = Field(..., description="Base64-encoded audio data") - format: str = Field(..., description="Audio format (wav or mp3)") - duration: float = Field(..., description="Duration in seconds") - sample_rate: int = Field(..., description="Sample rate in Hz") - - -@app.on_event("startup") -async def startup_event(): - """Load MusicGen model on startup""" - global model - - logger.info(f"Loading MusicGen model: {model_name}") - logger.info(f"Device: {device}") - - # Load model - model = MusicGen.get_pretrained(model_name, device=device) - - logger.info(f"MusicGen model loaded successfully") - logger.info(f"Max duration: 30 seconds at 32kHz") - - 
-@app.get("/") -async def root(): - """Root endpoint""" - return { - "service": "MusicGen API Server", - "model": model_name, - "device": device, - "max_duration": 30.0, - "sample_rate": 32000 - } - - -@app.get("/health") -async def health(): - """Health check endpoint""" - return { - "status": "healthy" if model else "initializing", - "model": model_name, - "device": device, - "ready": model is not None, - "gpu_available": torch.cuda.is_available() - } - - -@app.post("/v1/audio/generations") -async def generate_audio(request: AudioGenerationRequest) -> AudioGenerationResponse: - """Generate music from text prompt""" - if not model: - raise HTTPException(status_code=503, detail="Model not initialized") - - logger.info(f"Generating music: {request.prompt[:100]}...") - logger.info(f"Duration: {request.duration}s, Temperature: {request.temperature}") - - try: - # Set generation parameters - model.set_generation_params( - duration=request.duration, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - cfg_coef=request.cfg_coef, - ) - - # Generate audio - descriptions = [request.prompt] - with torch.no_grad(): - wav = model.generate(descriptions) - - # wav shape: [batch_size, channels, samples] - # Extract first batch item - audio_data = wav[0].cpu() # [channels, samples] - - # Get sample rate - sample_rate = model.sample_rate - - # Save to temporary file - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file: - temp_path = temp_file.name - torchaudio.save(temp_path, audio_data, sample_rate) - - # Read audio file and encode to base64 - with open(temp_path, 'rb') as f: - audio_bytes = f.read() - - # Clean up temporary file - os.unlink(temp_path) - - # Encode to base64 - audio_base64 = base64.b64encode(audio_bytes).decode('utf-8') - - logger.info(f"Generated {request.duration}s of audio") - - return AudioGenerationResponse( - audio=audio_base64, - format="wav", - duration=request.duration, - sample_rate=sample_rate - ) - - except Exception as e: - logger.error(f"Error generating audio: {e}") - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/v1/models") -async def list_models(): - """List available models (OpenAI-compatible)""" - return { - "object": "list", - "data": [ - { - "id": "musicgen-medium", - "object": "model", - "created": 1234567890, - "owned_by": "meta", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - - -if __name__ == "__main__": - import uvicorn - - host = os.getenv("HOST", "0.0.0.0") - port = int(os.getenv("PORT", "8000")) - - logger.info(f"Starting MusicGen API server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - ) diff --git a/vllm/Dockerfile b/vllm/Dockerfile deleted file mode 100644 index 7dde2d6..0000000 --- a/vllm/Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -FROM nvidia/cuda:12.4.0-runtime-ubuntu22.04 - -WORKDIR /app - -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - python3.11 \ - python3-pip \ - git \ - && rm -rf /var/lib/apt/lists/* - -# Upgrade pip -RUN pip3 install --no-cache-dir --upgrade pip - -# Install vLLM and dependencies -COPY requirements.txt . -RUN pip3 install --no-cache-dir -r requirements.txt - -# Copy application code -COPY server.py . 
- -# Create directory for model cache -RUN mkdir -p /workspace/huggingface_cache - -# Environment variables -ENV HF_HOME=/workspace/huggingface_cache -ENV VLLM_HOST=0.0.0.0 -ENV VLLM_PORT=8000 - -# Expose port -EXPOSE 8000 - -# Run the server -CMD ["python3", "server.py"] diff --git a/vllm/requirements.txt b/vllm/requirements.txt deleted file mode 100644 index b702e45..0000000 --- a/vllm/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -vllm==0.6.4.post1 -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -pydantic==2.5.0 diff --git a/vllm/server.py b/vllm/server.py deleted file mode 100644 index 0075bd2..0000000 --- a/vllm/server.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -""" -Simple vLLM server using AsyncLLMEngine directly -Bypasses the multiprocessing issues we hit with the default vLLM API server -OpenAI-compatible endpoints: /v1/models and /v1/completions -""" - -import asyncio -import json -import logging -import os -from typing import AsyncIterator, Dict, List, Optional - -from fastapi import FastAPI, Request -from fastapi.responses import JSONResponse, StreamingResponse -from pydantic import BaseModel, Field -from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams -from vllm.utils import random_uuid - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' -) -logger = logging.getLogger(__name__) - -# FastAPI app -app = FastAPI(title="Simple vLLM Server", version="1.0.0") - -# Global engine instance -engine: Optional[AsyncLLMEngine] = None -model_name: str = "Qwen/Qwen2.5-7B-Instruct" - -# Request/Response models -class CompletionRequest(BaseModel): - """OpenAI-compatible completion request""" - model: str = Field(default="qwen-2.5-7b") - prompt: str | List[str] = Field(..., description="Text prompt(s)") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - presence_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=-2.0, le=2.0) - -class ChatMessage(BaseModel): - """Chat message format""" - role: str = Field(..., description="Role: system, user, or assistant") - content: str = Field(..., description="Message content") - -class ChatCompletionRequest(BaseModel): - """OpenAI-compatible chat completion request""" - model: str = Field(default="qwen-2.5-7b") - messages: List[ChatMessage] = Field(..., description="Chat messages") - max_tokens: int = Field(default=512, ge=1, le=4096) - temperature: float = Field(default=0.7, ge=0.0, le=2.0) - top_p: float = Field(default=1.0, ge=0.0, le=1.0) - n: int = Field(default=1, ge=1, le=10) - stream: bool = Field(default=False) - stop: Optional[str | List[str]] = None - -@app.on_event("startup") -async def startup_event(): - """Initialize vLLM engine on startup""" - global engine, model_name - - logger.info(f"Initializing vLLM AsyncLLMEngine with model: {model_name}") - - # Configure engine - engine_args = AsyncEngineArgs( - model=model_name, - tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=4096, # Context length - dtype="auto", # Auto-detect dtype - download_dir="/workspace/huggingface_cache", # Large disk - trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs for better 
performance - ) - - # Create async engine - engine = AsyncLLMEngine.from_engine_args(engine_args) - - logger.info("vLLM AsyncLLMEngine initialized successfully") - -@app.get("/") -async def root(): - """Health check endpoint""" - return {"status": "ok", "model": model_name} - -@app.get("/health") -async def health(): - """Detailed health check""" - return { - "status": "healthy" if engine else "initializing", - "model": model_name, - "ready": engine is not None - } - -@app.get("/v1/models") -async def list_models(): - """OpenAI-compatible models endpoint""" - return { - "object": "list", - "data": [ - { - "id": "qwen-2.5-7b", - "object": "model", - "created": 1234567890, - "owned_by": "pivoine-gpu", - "permission": [], - "root": model_name, - "parent": None, - } - ] - } - -def messages_to_prompt(messages: List[ChatMessage]) -> str: - """Convert chat messages to a single prompt string""" - # Qwen 2.5 chat template format - prompt_parts = [] - - for msg in messages: - role = msg.role - content = msg.content - - if role == "system": - prompt_parts.append(f"<|im_start|>system\n{content}<|im_end|>") - elif role == "user": - prompt_parts.append(f"<|im_start|>user\n{content}<|im_end|>") - elif role == "assistant": - prompt_parts.append(f"<|im_start|>assistant\n{content}<|im_end|>") - - # Add final assistant prompt - prompt_parts.append("<|im_start|>assistant\n") - - return "\n".join(prompt_parts) - -@app.post("/v1/completions") -async def create_completion(request: CompletionRequest): - """OpenAI-compatible completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, - content={"error": "Engine not initialized"} - ) - - # Handle both single prompt and batch prompts - prompts = [request.prompt] if isinstance(request.prompt, str) else request.prompt - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else [], - presence_penalty=request.presence_penalty, - frequency_penalty=request.frequency_penalty, - ) - - # Generate completions - results = [] - for prompt in prompts: - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "text": output.outputs[0].text, - "index": 0, - "logprobs": None, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - results.append({ - "text": final_output.outputs[0].text, - "index": len(results), - "logprobs": None, - "finish_reason": final_output.outputs[0].finish_reason, - }) - - return { - "id": random_uuid(), - "object": "text_completion", - "created": 1234567890, - "model": request.model, - "choices": results, - "usage": { - "prompt_tokens": 0, # vLLM doesn't expose this easily - "completion_tokens": 0, - "total_tokens": 0, - } - } - -@app.post("/v1/chat/completions") -async def create_chat_completion(request: ChatCompletionRequest): - """OpenAI-compatible chat completion endpoint""" - if not engine: - return JSONResponse( - status_code=503, 
- content={"error": "Engine not initialized"} - ) - - # Convert messages to prompt - prompt = messages_to_prompt(request.messages) - - # Configure sampling parameters - sampling_params = SamplingParams( - temperature=request.temperature, - top_p=request.top_p, - max_tokens=request.max_tokens, - n=request.n, - stop=request.stop if request.stop else ["<|im_end|>"], - ) - - request_id = random_uuid() - - if request.stream: - # Streaming response - async def generate_stream(): - async for output in engine.generate(prompt, sampling_params, request_id): - chunk = { - "id": request_id, - "object": "chat.completion.chunk", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "delta": {"content": output.outputs[0].text}, - "finish_reason": output.outputs[0].finish_reason, - } - ] - } - yield f"data: {json.dumps(chunk)}\n\n" - yield "data: [DONE]\n\n" - - return StreamingResponse(generate_stream(), media_type="text/event-stream") - else: - # Non-streaming response - async for output in engine.generate(prompt, sampling_params, request_id): - final_output = output - - return { - "id": request_id, - "object": "chat.completion", - "created": 1234567890, - "model": request.model, - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": final_output.outputs[0].text, - }, - "finish_reason": final_output.outputs[0].finish_reason, - } - ], - "usage": { - "prompt_tokens": 0, - "completion_tokens": 0, - "total_tokens": 0, - } - } - -if __name__ == "__main__": - import uvicorn - - # Get configuration from environment - host = os.getenv("VLLM_HOST", "0.0.0.0") - port = int(os.getenv("VLLM_PORT", "8000")) - - logger.info(f"Starting vLLM server on {host}:{port}") - - uvicorn.run( - app, - host=host, - port=port, - log_level="info", - access_log=True, - )
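
For a quick sanity check of the restored standalone server after this patch is applied, the sketch below is a minimal client against the endpoints defined in `models/vllm/server.py` (`/health` and `/v1/chat/completions`). It assumes the server is reachable on its default `VLLM_HOST`/`VLLM_PORT` of `0.0.0.0:8000`; in the deployed setup requests normally go through the orchestrator on port 9000 instead, so the base URL here is an assumption to adjust, not the canonical entry point.

```python
#!/usr/bin/env python3
"""Minimal smoke test for the standalone vLLM server (models/vllm/server.py).

Assumption: the server is running with its default VLLM_HOST/VLLM_PORT
(0.0.0.0:8000). Change BASE_URL if you expose it differently or route
through the orchestrator.
"""

import json
import urllib.request

BASE_URL = "http://localhost:8000"  # assumed default port from server.py


def get_json(path: str) -> dict:
    """GET a JSON endpoint and decode the response body."""
    with urllib.request.urlopen(f"{BASE_URL}{path}") as resp:
        return json.load(resp)


def chat(prompt: str) -> dict:
    """POST a minimal OpenAI-style chat completion request."""
    payload = json.dumps({
        "model": "qwen-2.5-7b",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 64,
        "temperature": 0.7,
    }).encode("utf-8")
    req = urllib.request.Request(
        f"{BASE_URL}/v1/chat/completions",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)


if __name__ == "__main__":
    # /health returns {"status": ..., "model": ..., "ready": ...} per server.py
    health = get_json("/health")
    print("health:", health)
    if health.get("ready"):
        reply = chat("Say hello in one sentence.")
        # Non-streaming chat responses carry the text under choices[0].message.content
        print(reply["choices"][0]["message"]["content"])
```

The script uses only the standard library, so it can run from any machine that can reach the pod (for example over the Tailscale address used elsewhere in the deployment notes).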