# Changelog:
# - Updated qwen-2.5-7b port from 8001 to 8000 in models.yaml
#   - Matches actual vLLM server default port configuration
#   - Tested and verified: orchestrator successfully loaded model and
#     generated a response
---
# Model Registry for AI Orchestrator
# Add new models by appending to this file

models:
  # Text Generation Models
  qwen-2.5-7b:
    type: text
    framework: vllm
    service_script: models/vllm/server.py
    port: 8000
    vram_gb: 14
    startup_time_seconds: 120
    endpoint: /v1/chat/completions
    description: "Qwen 2.5 7B Instruct - Fast text generation, no authentication required"

  # Image Generation Models
  flux-schnell:
    type: image
    framework: openedai-images
    service_script: models/flux/server.py
    port: 8002
    vram_gb: 14
    startup_time_seconds: 60
    endpoint: /v1/images/generations
    description: "Flux.1 Schnell - Fast high-quality image generation (4-5 sec/image)"

  # Music Generation Models
  musicgen-medium:
    type: audio
    framework: audiocraft
    service_script: models/musicgen/server.py
    port: 8003
    vram_gb: 11
    startup_time_seconds: 45
    endpoint: /v1/audio/generations
    description: "MusicGen Medium - Text-to-music generation (60-90 sec for 30s audio)"

# Example: Add more models easily by uncommenting and customizing below

# Future Text Models:
#  llama-3.1-8b:
#    type: text
#    framework: vllm
#    docker_service: vllm-llama
#    port: 8004
#    vram_gb: 17
#    startup_time_seconds: 120
#    endpoint: /v1/chat/completions
#    description: "Llama 3.1 8B Instruct - Meta's latest model"

# Future Image Models:
#  sdxl:
#    type: image
#    framework: openedai-images
#    docker_service: sdxl
#    port: 8005
#    vram_gb: 10
#    startup_time_seconds: 45
#    endpoint: /v1/images/generations
#    description: "Stable Diffusion XL - High quality image generation"

# Future Audio Models:
#  whisper-large:
#    type: audio
#    framework: faster-whisper
#    docker_service: whisper
#    port: 8006
#    vram_gb: 3
#    startup_time_seconds: 30
#    endpoint: /v1/audio/transcriptions
#    description: "Whisper Large v3 - Speech-to-text transcription"
#
#  xtts-v2:
#    type: audio
#    framework: openedai-speech
#    docker_service: tts
#    port: 8007
#    vram_gb: 3
#    startup_time_seconds: 30
#    endpoint: /v1/audio/speech
#    description: "XTTS v2 - High-quality text-to-speech with voice cloning"

# Configuration
config:
  gpu_memory_total_gb: 24
  allow_concurrent_loading: false  # Sequential loading only
  model_switch_timeout_seconds: 300  # 5 minutes max for model switching
  health_check_interval_seconds: 10
  default_model: qwen-2.5-7b