---
# Model Registry for AI Orchestrator
# Add new models by appending to this file.
#
# Schema per model entry:
#   type                  - model modality (text | image | audio)
#   framework             - serving framework (e.g. vllm)
#   service_script        - path to the launcher script for this model
#   port                  - local port the model server listens on
#   vram_gb               - GPU memory the model needs when loaded
#   startup_time_seconds  - worst-case time to become healthy after launch
#   endpoint              - OpenAI-compatible HTTP path served by the model
#   description           - human-readable summary

models:
  # Text Generation Models
  qwen-2.5-7b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8000
    vram_gb: 14
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required"

  llama-3.1-8b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8001
    vram_gb: 17
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Llama 3.1 8B Instruct - Meta's latest model"

  # Example: Add more models easily by uncommenting and customizing below.
  # NOTE(review): these examples use `docker_service` instead of the
  # `service_script` key used by the live entries above — confirm which key
  # the orchestrator actually reads before uncommenting.

  # Future Text Models:
  # (key renamed from llama-3.1-8b, which is already a live model above —
  #  uncommenting a duplicate key would silently shadow the live entry)
  # mistral-7b:
  #   type: text
  #   framework: vllm
  #   docker_service: vllm-mistral
  #   port: 8004
  #   vram_gb: 17
  #   startup_time_seconds: 120
  #   endpoint: /v1/chat/completions
  #   description: "Mistral 7B Instruct - Example future text model"

  # Future Image Models:
  # sdxl:
  #   type: image
  #   framework: openedai-images
  #   docker_service: sdxl
  #   port: 8005
  #   vram_gb: 10
  #   startup_time_seconds: 45
  #   endpoint: /v1/images/generations
  #   description: "Stable Diffusion XL - High quality image generation"

  # Future Audio Models:
  # whisper-large:
  #   type: audio
  #   framework: faster-whisper
  #   docker_service: whisper
  #   port: 8006
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/transcriptions
  #   description: "Whisper Large v3 - Speech-to-text transcription"
  #
  # xtts-v2:
  #   type: audio
  #   framework: openedai-speech
  #   docker_service: tts
  #   port: 8007
  #   vram_gb: 3
  #   startup_time_seconds: 30
  #   endpoint: /v1/audio/speech
  #   description: "XTTS v2 - High-quality text-to-speech with voice cloning"

# Orchestrator-wide configuration
config:
  gpu_memory_total_gb: 24
  allow_concurrent_loading: false  # Sequential loading only
  model_switch_timeout_seconds: 300  # 5 minutes max for model switching
  health_check_interval_seconds: 10
  # Must match one of the keys under `models:` above
  default_model: qwen-2.5-7b