runpod/model-orchestrator/models.yaml
# Model Registry for AI Orchestrator
# Add new models by appending to this file
models:
  # Text Generation Models
  qwen-2.5-7b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8000
    vram_gb: 14
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required"

  llama-3.1-8b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8001
    vram_gb: 17
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Llama 3.1 8B Instruct - Meta's latest model"
  # Example: Add more models easily by uncommenting and customizing below
  #
  # Future Text Models:
  # llama-3.1-8b:
  #   type: text
  #   framework: vllm
  #   docker_service: vllm-llama
  #   port: 8004
  #   vram_gb: 17
  #   startup_time_seconds: 120
  #   endpoint: /v1/chat/completions
  #   description: "Llama 3.1 8B Instruct - Meta's latest model"
  #
  # Future Image Models:
  # sdxl:
  #   type: image
  #   framework: openedai-images
  #   docker_service: sdxl
  #   port: 8005
  #   vram_gb: 10
  #   startup_time_seconds: 45
  #   endpoint: /v1/images/generations
  #   description: "Stable Diffusion XL - High quality image generation"
  #
  # Future Audio Models:
  # whisper-large:
  #   type: audio
  #   framework: faster-whisper
  #   docker_service: whisper
  #   port: 8006
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/transcriptions
  #   description: "Whisper Large v3 - Speech-to-text transcription"
  #
  # xtts-v2:
  #   type: audio
  #   framework: openedai-speech
  #   docker_service: tts
  #   port: 8007
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/speech
  #   description: "XTTS v2 - High-quality text-to-speech with voice cloning"
# Configuration
config:
  gpu_memory_total_gb: 24
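  # 14 GB (qwen-2.5-7b) + 17 GB (llama-3.1-8b) = 31 GB, which exceeds the
  # 24 GB GPU, so only one model can be resident at a time.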
  allow_concurrent_loading: false   # Sequential loading only
  model_switch_timeout_seconds: 300 # 5 minutes max for model switching
  health_check_interval_seconds: 10
  default_model: qwen-2.5-7b
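
For illustration, a minimal sketch of how an orchestrator process might consume this registry. It assumes PyYAML is installed and reads only the keys defined above; the file path and the function names (load_registry, pick_model) are illustrative, not taken from the repo.

# registry_sketch.py - illustrative consumer of models.yaml (names are assumptions)
from pathlib import Path

import yaml


def load_registry(path: str = "models.yaml") -> dict:
    """Parse the registry file and return its contents as a dict."""
    with Path(path).open() as fh:
        return yaml.safe_load(fh)


def pick_model(registry: dict, name: str | None = None) -> tuple[str, dict]:
    """Resolve a model entry, falling back to config.default_model."""
    cfg = registry["config"]
    name = name or cfg["default_model"]
    entry = registry["models"][name]
    # Sequential loading: a single model must fit within the whole GPU.
    if entry["vram_gb"] > cfg["gpu_memory_total_gb"]:
        raise ValueError(
            f"{name} needs {entry['vram_gb']} GB, "
            f"but only {cfg['gpu_memory_total_gb']} GB is available"
        )
    return name, entry


if __name__ == "__main__":
    registry = load_registry()
    name, entry = pick_model(registry)
    print(f"Would launch {entry['service_script']} on port {entry['port']} for {name}")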