Started redesigning architecture to run services directly without Docker:

**Completed:**
- Created new process-based orchestrator (orchestrator_subprocess.py)
- Uses subprocess instead of Docker SDK for process management
- Updated models.yaml to reference service_script paths
- vLLM server already standalone-ready

**Still needed:**
- Create/update Flux and MusicGen standalone servers
- Create systemd service files or startup scripts
- Update prepare-template script for Python deployment
- Remove Docker/Compose dependencies
- Test full stack on RunPod
- Update documentation

Reason for change: RunPod's containerized environment doesn't support Docker-in-Docker (requires CAP_SYS_ADMIN). Direct Python execution is simpler, faster, and more reliable for RunPod.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
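The key mechanic here — "uses subprocess instead of Docker SDK" — is easy to picture with a minimal sketch. Everything below is an assumption rather than the actual orchestrator_subprocess.py API: the `launch_service` name, the `--port` flag, and the `/health` endpoint are illustrative only.

```python
import subprocess
import sys
import time
import urllib.request


def launch_service(script_path: str, port: int, timeout_s: int) -> subprocess.Popen:
    """Start a model server as a plain child process and wait until it responds.

    Hypothetical sketch: assumes each server script accepts --port and
    exposes a /health endpoint once it is ready to serve.
    """
    proc = subprocess.Popen(
        [sys.executable, script_path, "--port", str(port)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    deadline = time.monotonic() + timeout_s
    url = f"http://127.0.0.1:{port}/health"
    while time.monotonic() < deadline:
        if proc.poll() is not None:  # child already exited -> startup failed
            raise RuntimeError(f"{script_path} exited with code {proc.returncode}")
        try:
            with urllib.request.urlopen(url, timeout=2):
                return proc  # server answered; hand the handle to the caller
        except OSError:
            time.sleep(1)  # not listening yet; keep polling
    proc.terminate()
    raise TimeoutError(f"{script_path} not healthy within {timeout_s}s")
```

No CAP_SYS_ADMIN and no Docker socket needed: the child is an ordinary process the RunPod container can spawn. The registry the orchestrator reads, models.yaml, follows: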
```yaml
# Model Registry for AI Orchestrator
# Add new models by appending to this file

models:
  # Text Generation Models
  qwen-2.5-7b:
    type: text
    framework: vllm
    service_script: vllm/server.py
    port: 8001
    vram_gb: 14
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required"

  # Image Generation Models
  flux-schnell:
    type: image
    framework: openedai-images
    service_script: flux/server.py
    port: 8002
    vram_gb: 14
    startup_time_seconds: 60
    endpoint: /v1/images/generations
    description: "Flux.1 Schnell - Fast high-quality image generation (4-5 sec/image)"

  # Music Generation Models
  musicgen-medium:
    type: audio
    framework: audiocraft
    service_script: musicgen/server.py
    port: 8003
    vram_gb: 11
    startup_time_seconds: 45
    endpoint: /v1/audio/generations
    description: "MusicGen Medium - Text-to-music generation (60-90 sec for 30s audio)"

  # Example: Add more models easily by uncommenting and customizing below

  # Future Text Models:
  # llama-3.1-8b:
  #   type: text
  #   framework: vllm
  #   docker_service: vllm-llama
  #   port: 8004
  #   vram_gb: 17
  #   startup_time_seconds: 120
  #   endpoint: /v1/chat/completions
  #   description: "Llama 3.1 8B Instruct - Meta's latest model"

  # Future Image Models:
  # sdxl:
  #   type: image
  #   framework: openedai-images
  #   docker_service: sdxl
  #   port: 8005
  #   vram_gb: 10
  #   startup_time_seconds: 45
  #   endpoint: /v1/images/generations
  #   description: "Stable Diffusion XL - High quality image generation"

  # Future Audio Models:
  # whisper-large:
  #   type: audio
  #   framework: faster-whisper
  #   docker_service: whisper
  #   port: 8006
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/transcriptions
  #   description: "Whisper Large v3 - Speech-to-text transcription"
  #
  # xtts-v2:
  #   type: audio
  #   framework: openedai-speech
  #   docker_service: tts
  #   port: 8007
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/speech
  #   description: "XTTS v2 - High-quality text-to-speech with voice cloning"

# Configuration
config:
  gpu_memory_total_gb: 24
  allow_concurrent_loading: false  # Sequential loading only
  model_switch_timeout_seconds: 300  # 5 minutes max for model switching
  health_check_interval_seconds: 10
  default_model: qwen-2.5-7b
```
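Consuming the registry is then a small amount of glue. The sketch below is a minimal example, assuming PyYAML and hypothetical helper names (`load_registry` and `select_model` are not from the source): it resolves one entry and checks it against the config section's GPU budget before handing it to a launcher like the one sketched above.

```python
import yaml  # PyYAML: pip install pyyaml


def load_registry(path: str = "models.yaml") -> dict:
    """Parse the registry file into plain dicts."""
    with open(path) as f:
        return yaml.safe_load(f)


def select_model(registry: dict, name: str) -> dict:
    """Resolve one model entry and sanity-check it against the GPU budget."""
    cfg = registry["config"]
    entry = registry["models"].get(name)
    if entry is None:
        raise KeyError(f"unknown model {name!r}; default is {cfg['default_model']!r}")
    if entry["vram_gb"] > cfg["gpu_memory_total_gb"]:
        raise ValueError(
            f"{name} needs {entry['vram_gb']} GB VRAM, "
            f"only {cfg['gpu_memory_total_gb']} GB available"
        )
    return entry


registry = load_registry()
entry = select_model(registry, registry["config"]["default_model"])
# entry["service_script"], entry["port"], and entry["startup_time_seconds"]
# map directly onto the subprocess launcher's arguments.
```

Because `allow_concurrent_loading` is false, a model switch under this scheme amounts to terminating the current child process and launching the new entry, with `model_switch_timeout_seconds` as the overall budget.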