Initial commit: RunPod multi-modal AI orchestration stack
- Multi-modal AI infrastructure for RunPod RTX 4090
- Automatic model orchestration (text, image, music)
- Text: vLLM + Qwen 2.5 7B Instruct
- Image: Flux.1 Schnell via OpenEDAI
- Music: MusicGen Medium via AudioCraft
- Cost-optimized sequential loading on single GPU
- Template preparation scripts for rapid deployment
- Comprehensive documentation (README, DEPLOYMENT, TEMPLATE)
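The services listed above expose OpenAI-style HTTP endpoints, registered in model-orchestrator/models.yaml below. As a rough illustration only, assuming the vllm-qwen service is the currently loaded model and is reachable on port 8001 of the pod (per models.yaml), a text request could look like this; the host, served model name, and prompt are placeholders, not part of this commit:

```python
# Hypothetical client call against the vLLM text service from models.yaml.
# Assumes the pod is reachable at localhost and that vllm-qwen is the model
# currently loaded (sequential loading means only one model is live at a time).
import requests

resp = requests.post(
    "http://localhost:8001/v1/chat/completions",  # port/endpoint from models.yaml
    json={
        "model": "Qwen/Qwen2.5-7B-Instruct",      # assumed served model name
        "messages": [{"role": "user", "content": "Write a haiku about GPUs."}],
        "max_tokens": 64,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

The image and music services follow the same pattern against their own ports and endpoints listed in the registry.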
model-orchestrator/models.yaml (Normal file, 89 lines added)
@@ -0,0 +1,89 @@
# Model Registry for AI Orchestrator
# Add new models by appending to this file

models:
  # Text Generation Models
  qwen-2.5-7b:
    type: text
    framework: vllm
    docker_service: vllm-qwen
    port: 8001
    vram_gb: 14
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required"

  # Image Generation Models
  flux-schnell:
    type: image
    framework: openedai-images
    docker_service: flux
    port: 8002
    vram_gb: 14
    startup_time_seconds: 60
    endpoint: /v1/images/generations
    description: "Flux.1 Schnell - Fast high-quality image generation (4-5 sec/image)"

  # Music Generation Models
  musicgen-medium:
    type: audio
    framework: audiocraft
    docker_service: musicgen
    port: 8003
    vram_gb: 11
    startup_time_seconds: 45
    endpoint: /v1/audio/generations
    description: "MusicGen Medium - Text-to-music generation (60-90 sec for 30s audio)"

  # Example: Add more models easily by uncommenting and customizing below

  # Future Text Models:
  # llama-3.1-8b:
  #   type: text
  #   framework: vllm
  #   docker_service: vllm-llama
  #   port: 8004
  #   vram_gb: 17
  #   startup_time_seconds: 120
  #   endpoint: /v1/chat/completions
  #   description: "Llama 3.1 8B Instruct - Meta's latest model"

  # Future Image Models:
  # sdxl:
  #   type: image
  #   framework: openedai-images
  #   docker_service: sdxl
  #   port: 8005
  #   vram_gb: 10
  #   startup_time_seconds: 45
  #   endpoint: /v1/images/generations
  #   description: "Stable Diffusion XL - High quality image generation"

  # Future Audio Models:
  # whisper-large:
  #   type: audio
  #   framework: faster-whisper
  #   docker_service: whisper
  #   port: 8006
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/transcriptions
  #   description: "Whisper Large v3 - Speech-to-text transcription"
  #
  # xtts-v2:
  #   type: audio
  #   framework: openedai-speech
  #   docker_service: tts
  #   port: 8007
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/speech
  #   description: "XTTS v2 - High-quality text-to-speech with voice cloning"

# Configuration
config:
  gpu_memory_total_gb: 24
  allow_concurrent_loading: false  # Sequential loading only
  model_switch_timeout_seconds: 300  # 5 minutes max for model switching
  health_check_interval_seconds: 10
  default_model: qwen-2.5-7b
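The orchestrator implementation itself is not part of this file; the registry above is simply meant to be machine-readable. As a minimal sketch only, assuming PyYAML is available and that each docker_service name maps to a service in the stack's docker compose file, sequential model switching within the 24 GB budget could look roughly like this:

```python
# Minimal sketch of consuming models.yaml for sequential model switching.
# Assumptions: PyYAML installed, `docker compose` manages the services named
# in docker_service, and only one model is resident on the GPU at a time.
import subprocess
import yaml

with open("model-orchestrator/models.yaml") as f:
    registry = yaml.safe_load(f)

MODELS = registry["models"]
CONFIG = registry["config"]

def switch_model(current: str | None, target: str) -> None:
    spec = MODELS[target]
    if spec["vram_gb"] > CONFIG["gpu_memory_total_gb"]:
        raise ValueError(f"{target} does not fit in GPU memory")
    # Sequential loading: stop whatever is running before starting the target.
    if current and not CONFIG["allow_concurrent_loading"]:
        subprocess.run(
            ["docker", "compose", "stop", MODELS[current]["docker_service"]],
            check=True,
        )
    subprocess.run(
        ["docker", "compose", "up", "-d", spec["docker_service"]],
        check=True,
        timeout=CONFIG["model_switch_timeout_seconds"],
    )

# Example: start from the default text model, then switch to image generation.
switch_model(None, CONFIG["default_model"])
switch_model(CONFIG["default_model"], "flux-schnell")
```

A real orchestrator would also poll each service's health endpoint every health_check_interval_seconds and wait up to startup_time_seconds before routing traffic; that logic is omitted here.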