Initial commit: RunPod multi-modal AI orchestration stack
- Multi-modal AI infrastructure for RunPod RTX 4090
- Automatic model orchestration (text, image, music)
- Text: vLLM + Qwen 2.5 7B Instruct
- Image: Flux.1 Schnell via OpenEDAI
- Music: MusicGen Medium via AudioCraft
- Cost-optimized sequential loading on single GPU
- Template preparation scripts for rapid deployment
- Comprehensive documentation (README, DEPLOYMENT, TEMPLATE)
litellm-config-gpu.yaml (new file, 199 lines)
@@ -0,0 +1,199 @@
# LiteLLM Configuration with GPU Server Integration
# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server)

model_list:
  # =============================================================================
  # Anthropic Claude Models (API-based, for complex reasoning)
  # =============================================================================

  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # =============================================================================
  # Self-Hosted Models (vLLM on GPU server via WireGuard VPN)
  # =============================================================================

  # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks
  - model_name: llama-3.1-8b
    litellm_params:
      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
      api_base: http://10.8.0.2:8000/v1
      api_key: dummy   # vLLM doesn't require auth
      rpm: 1000        # Rate limit: requests per minute
      tpm: 100000      # Rate limit: tokens per minute
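  # Example client call through the proxy (a minimal sketch: it assumes the
  # LiteLLM proxy is reachable on localhost:4000, its default port, and that the
  # LITELLM_MASTER_KEY value is passed as the API key; the model name comes from
  # this file):
  #
  #   from openai import OpenAI
  #
  #   client = OpenAI(base_url="http://localhost:4000/v1",
  #                   api_key="<LITELLM_MASTER_KEY value>")
  #   resp = client.chat.completions.create(
  #       model="llama-3.1-8b",  # or "gpt-3.5-turbo" via model_name_map below
  #       messages=[{"role": "user", "content": "Summarize: ..."}],
  #   )
  #   print(resp.choices[0].message.content)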

  # Alternative models (uncomment and configure on GPU server as needed)

  # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning
  # - model_name: qwen-2.5-14b
  #   litellm_params:
  #     model: openai/Qwen/Qwen2.5-14B-Instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 800
  #     tpm: 80000

  # Mistral 7B Instruct - Very fast, lightweight
  # - model_name: mistral-7b
  #   litellm_params:
  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1200
  #     tpm: 120000

  # DeepSeek Coder 6.7B - Code generation specialist
  # - model_name: deepseek-coder-6.7b
  #   litellm_params:
  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000

# =============================================================================
# Router Settings - Intelligent Model Selection
# =============================================================================

# Model aliases for easy switching in Open WebUI
model_name_map:
  # Default model (self-hosted, fast)
  gpt-3.5-turbo: llama-3.1-8b

  # Power users can use Claude for complex tasks
  gpt-4: claude-sonnet-4.5
  gpt-4-turbo: claude-sonnet-4.5
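  # e.g. an Open WebUI request addressed to "gpt-4" is transparently served by
  # claude-sonnet-4.5; clients keep using the familiar OpenAI model names.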

# LiteLLM Settings
litellm_settings:
  drop_params: true
  set_verbose: false   # Disable verbose logging for better performance

  # Enable caching with Redis for better performance
  cache: true
  cache_params:
    type: redis
    host: redis
    port: 6379
    ttl: 3600   # Cache for 1 hour
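    # Intended effect: repeated identical requests arriving within the 1-hour
    # TTL are answered from Redis instead of re-hitting the GPU server or the
    # Anthropic API.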

  # Cool down a deployment after any failure
  allowed_fails: 0

  # Modify params before sending to provider
  modify_params: true

  # Keep logging hooks off to minimize overhead
  success_callback: []   # Disable all success callbacks to reduce DB writes
  failure_callback: []   # Disable all failure callbacks

# Router Settings
router_settings:
  allowed_fails: 0

  # Routing strategy: try self-hosted first, fall back to Claude on failure
  routing_strategy: simple-shuffle

  # Cooldown for failed models
  cooldown_time: 30   # seconds

  # Drop unsupported parameters
  default_litellm_params:
    drop_params: true

# General Settings
general_settings:
  disable_responses_id_security: true

  # Spend tracking (kept enabled to track API vs GPU costs)
  disable_spend_logs: false

  # Disable tag tracking
  disable_tag_tracking: true

  # Daily spend updates (kept enabled for cost analysis)
  disable_daily_spend_logs: false

  # Master key for authentication (set via env var)
  master_key: os.environ/LITELLM_MASTER_KEY

  # Database for logging (optional but recommended for cost tracking)
  database_url: os.environ/DATABASE_URL

  # Enable OpenAPI docs
  docs_url: /docs

# =============================================================================
# Usage Guidelines (for Open WebUI users)
# =============================================================================
#
# Model Selection Guide:
#
# Use llama-3.1-8b for:
#   - General chat and Q&A
#   - Simple code generation
#   - Data extraction
#   - Summarization
#   - Translation
#   - Most routine tasks
#   Cost:  ~$0/month (self-hosted)
#   Speed: ~50-80 tokens/second
#
# Use qwen-2.5-14b for:
#   - Complex reasoning
#   - Multi-step problems
#   - Advanced code generation
#   - Multilingual tasks
#   Cost:  ~$0/month (self-hosted)
#   Speed: ~30-50 tokens/second
#
# Use claude-sonnet-4.5 for:
#   - Very complex reasoning
#   - Long documents (200K context)
#   - Production-critical code
#   - When quality matters most
#   Cost:  ~$3/million input tokens, ~$15/million output tokens
#          (worked example below)
#   Speed: ~30-40 tokens/second
#
# Use claude-3-haiku for:
#   - API fallback (if self-hosted is down)
#   - Very fast responses
#   Cost:  ~$0.25/million input tokens, ~$1.25/million output tokens
#   Speed: ~60-80 tokens/second
#
# =============================================================================
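#
# Worked cost example (a sketch using the rates above): a claude-sonnet-4.5
# call with 2,000 input tokens and 500 output tokens costs roughly
#   2,000 x $3/1M + 500 x $15/1M = $0.006 + $0.0075 = $0.0135
# The same call on llama-3.1-8b incurs no per-token API cost.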

# Health Check Configuration
health_check:
  # Check vLLM health endpoint
  enabled: true
  interval: 30   # seconds
  timeout: 5     # seconds
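  # Manual liveness probe against the vLLM server (an illustrative sketch;
  # vLLM's OpenAI-compatible server exposes a GET /health endpoint):
  #
  #   import requests
  #   assert requests.get("http://10.8.0.2:8000/health", timeout=5).ok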

# Fallback Configuration
# If the GPU server is down, automatically use Claude
fallback:
  - ["llama-3.1-8b", "claude-3-haiku"]
  - ["qwen-2.5-14b", "claude-sonnet-4.5"]