- Add setup guides (SETUP_GUIDE, TAILSCALE_SETUP, DOCKER_GPU_SETUP, etc.)
- Add deployment configurations (litellm-config-gpu.yaml, gpu-server-compose.yaml)
- Add GPU_DEPLOYMENT_LOG.md with current infrastructure details
- Add GPU_EXPANSION_PLAN.md with complete provider comparison
- Add deploy-gpu-stack.sh automation script

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
# LiteLLM Configuration with GPU Server Integration
# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server)

model_list:
  # =============================================================================
  # Anthropic Claude Models (API-based, for complex reasoning)
  # =============================================================================

  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # =============================================================================
  # Self-Hosted Models (vLLM on GPU server via WireGuard VPN)
  # =============================================================================

  # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks
  - model_name: llama-3.1-8b
    litellm_params:
      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
      api_base: http://10.8.0.2:8000/v1
      api_key: dummy    # vLLM doesn't require auth
      rpm: 1000         # Rate limit: requests per minute
      tpm: 100000       # Rate limit: tokens per minute
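
  # Illustrative sketch only (not part of the current deployment): LiteLLM
  # load-balances across entries that share a model_name, so a second vLLM host
  # could be registered as another llama-3.1-8b deployment. The 10.8.0.3 address
  # below is hypothetical.
  # - model_name: llama-3.1-8b
  #   litellm_params:
  #     model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
  #     api_base: http://10.8.0.3:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000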

  # Alternative models (uncomment and configure on GPU server as needed)

  # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning
  # - model_name: qwen-2.5-14b
  #   litellm_params:
  #     model: openai/Qwen/Qwen2.5-14B-Instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 800
  #     tpm: 80000

  # Mistral 7B Instruct - Very fast, lightweight
  # - model_name: mistral-7b
  #   litellm_params:
  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1200
  #     tpm: 120000

  # DeepSeek Coder 6.7B - Code generation specialist
  # - model_name: deepseek-coder-6.7b
  #   litellm_params:
  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000
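
  # Illustrative sketch of serving one of these models with vLLM on the GPU
  # server (assumed command; exact flags depend on your vLLM version and GPU memory):
  #   vllm serve Qwen/Qwen2.5-14B-Instruct --host 0.0.0.0 --port 8000
  # or, on older vLLM releases:
  #   python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-14B-Instruct --port 8000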

# =============================================================================
# Router Settings - Intelligent Model Selection
# =============================================================================

# Model aliases for easy switching in Open WebUI
model_name_map:
  # Default model (self-hosted, fast)
  gpt-3.5-turbo: llama-3.1-8b

  # Power users can use Claude for complex tasks
  gpt-4: claude-sonnet-4.5
  gpt-4-turbo: claude-sonnet-4.5

# LiteLLM Settings
litellm_settings:
  drop_params: true
  set_verbose: false  # Disable verbose logging for better performance

  # Enable caching with Redis for better performance
  cache: true
  cache_params:
    type: redis
    host: redis
    port: 6379
    ttl: 3600  # Cache for 1 hour
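
  # The "redis" hostname assumes a Redis container on the same Docker network.
  # Illustrative sketch of such a service (not the actual gpu-server-compose.yaml):
  #   redis:
  #     image: redis:7-alpine
  #     restart: unless-stopped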

  # Failures tolerated before a deployment is cooled down
  allowed_fails: 0

  # Modify params before sending to provider
  modify_params: true

  # Disable success/failure callbacks to minimize logging overhead and DB writes
  success_callback: []
  failure_callback: []

# Router Settings
router_settings:
  allowed_fails: 0

  # Routing strategy for picking between deployments of the same model name;
  # failover from self-hosted models to Claude is handled by the fallback list below
  routing_strategy: simple-shuffle

  # Cooldown for failed models
  cooldown_time: 30  # seconds

  # Drop unsupported parameters
  default_litellm_params:
    drop_params: true

# General Settings
general_settings:
  disable_responses_id_security: true

  # Spend tracking kept enabled to compare API vs GPU costs
  disable_spend_logs: false

  # Disable tag tracking
  disable_tag_tracking: true

  # Daily spend logs kept enabled for cost analysis
  disable_daily_spend_logs: false

  # Master key for authentication (set via env var)
  master_key: os.environ/LITELLM_MASTER_KEY

  # Database for logging (optional but recommended for cost tracking)
  database_url: os.environ/DATABASE_URL

  # Enable OpenAPI docs
  docs_url: /docs

# =============================================================================
# Usage Guidelines (for Open WebUI users)
# =============================================================================
#
# Model Selection Guide:
#
# Use llama-3.1-8b for:
#   - General chat and Q&A
#   - Simple code generation
#   - Data extraction
#   - Summarization
#   - Translation
#   - Most routine tasks
#   Cost: ~$0/month (self-hosted)
#   Speed: ~50-80 tokens/second
#
# Use qwen-2.5-14b for:
#   - Complex reasoning
#   - Multi-step problems
#   - Advanced code generation
#   - Multilingual tasks
#   Cost: ~$0/month (self-hosted)
#   Speed: ~30-50 tokens/second
#
# Use claude-sonnet-4.5 for:
#   - Very complex reasoning
#   - Long documents (200K context)
#   - Production-critical code
#   - When quality matters most
#   Cost: ~$3/million input tokens, ~$15/million output tokens
#   Speed: ~30-40 tokens/second
#
# Use claude-3-haiku for:
#   - API fallback (if self-hosted is down)
#   - When very fast responses are needed
#   Cost: ~$0.25/million input tokens, ~$1.25/million output tokens
#   Speed: ~60-80 tokens/second
#
# =============================================================================
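
# Illustrative example (assumed defaults, not part of the config): any
# OpenAI-compatible client can call these models through the LiteLLM proxy,
# here assumed to listen on port 4000 with the master key used as the API key:
#
#   curl http://localhost:4000/v1/chat/completions \
#     -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
#     -H "Content-Type: application/json" \
#     -d '{"model": "llama-3.1-8b", "messages": [{"role": "user", "content": "Hello"}]}'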

# Health Check Configuration
health_check:
  # Check vLLM health endpoint
  enabled: true
  interval: 30  # seconds
  timeout: 5    # seconds
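
  # The vLLM OpenAI-compatible server exposes a /health endpoint the check can
  # target, e.g. (illustrative): curl http://10.8.0.2:8000/health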

# Fallback Configuration
# If the GPU server is down, automatically use Claude
fallback:
  - ["llama-3.1-8b", "claude-3-haiku"]
  - ["qwen-2.5-14b", "claude-sonnet-4.5"]