---
# Model Registry for AI Orchestrator
# Add new models by appending to this file

models:
  # Text Generation Models
  qwen-2.5-7b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8000
    vram_gb: 14
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required"

  llama-3.1-8b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8001
    vram_gb: 17
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Llama 3.1 8B Instruct - Meta's latest model"

  # Example: add more models easily by uncommenting and customizing below.
  # NOTE: pick a model name not already defined above — duplicate keys are
  # invalid YAML and most parsers silently keep only the last value.

  # Future Text Models:
  # mistral-7b:
  #   type: text
  #   framework: vllm
  #   service_script: models/vllm/server.py
  #   port: 8004
  #   vram_gb: 17
  #   startup_time_seconds: 120
  #   endpoint: /v1/chat/completions
  #   description: "Mistral 7B Instruct - Example placeholder entry"

  # Future Image Models:
  # sdxl:
  #   type: image
  #   framework: openedai-images
  #   service_script: models/sdxl/server.py
  #   port: 8005
  #   vram_gb: 10
  #   startup_time_seconds: 45
  #   endpoint: /v1/images/generations
  #   description: "Stable Diffusion XL - High quality image generation"

  # Future Audio Models:
  # whisper-large:
  #   type: audio
  #   framework: faster-whisper
  #   service_script: models/whisper/server.py
  #   port: 8006
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/transcriptions
  #   description: "Whisper Large v3 - Speech-to-text transcription"
  #
  # xtts-v2:
  #   type: audio
  #   framework: openedai-speech
  #   service_script: models/tts/server.py
  #   port: 8007
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/speech
  #   description: "XTTS v2 - High-quality text-to-speech with voice cloning"

# Orchestrator-wide configuration (applies to all models above)
config:
  gpu_memory_total_gb: 24
  allow_concurrent_loading: false  # Sequential loading only
  model_switch_timeout_seconds: 300  # 5 minutes max for model switching
  health_check_interval_seconds: 10
  default_model: qwen-2.5-7b  # must match a key under `models`