# docker-compose/ai/litellm-config-gpu.yaml
# LiteLLM Configuration with GPU Server Integration
# This config includes both Anthropic Claude models (API) and self-hosted models
# (vLLM on the GPU server).
model_list:
  # ===========================================================================
  # Anthropic Claude Models (API-based, for complex reasoning)
  # ===========================================================================
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY
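
  # Note: `os.environ/ANTHROPIC_API_KEY` tells LiteLLM to read the key from the
  # proxy container's environment at startup. A minimal sketch of how that might be
  # wired in the compose file (the `litellm` service name and .env layout are
  # assumptions, not the repository's actual compose definition):
  #
  #   services:
  #     litellm:
  #       env_file: .env    # .env contains ANTHROPIC_API_KEY=sk-ant-...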
  # ===========================================================================
  # Self-Hosted Models (vLLM on GPU server via WireGuard VPN)
  # ===========================================================================
  # Llama 3.1 8B Instruct - fast, general-purpose, good for routine tasks
  - model_name: llama-3.1-8b
    litellm_params:
      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
      api_base: http://10.8.0.2:8000/v1
      api_key: dummy    # vLLM doesn't require auth
      rpm: 1000         # Rate limit: requests per minute
      tpm: 100000       # Rate limit: tokens per minute
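
  # The entry above assumes a vLLM server exposing the OpenAI-compatible API on the
  # GPU host's WireGuard address (10.8.0.2:8000). A minimal sketch of such a service,
  # assuming the official vllm/vllm-openai image; the repository's
  # gpu-server-compose.yaml is the authoritative definition, and the image tag, flags
  # and GPU reservation below are illustrative assumptions:
  #
  #   services:
  #     vllm:
  #       image: vllm/vllm-openai:latest
  #       command: ["--model", "meta-llama/Meta-Llama-3.1-8B-Instruct",
  #                 "--gpu-memory-utilization", "0.90"]
  #       environment:
  #         - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}   # Llama weights are gated
  #       ports: ["8000:8000"]
  #       deploy:
  #         resources:
  #           reservations:
  #             devices:
  #               - driver: nvidia
  #                 count: all
  #                 capabilities: [gpu]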
  # Alternative models (uncomment and configure on the GPU server as needed)
  #
  # Qwen 2.5 14B Instruct - excellent multilingual support, stronger reasoning
  # - model_name: qwen-2.5-14b
  #   litellm_params:
  #     model: openai/Qwen/Qwen2.5-14B-Instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 800
  #     tpm: 80000
  #
  # Mistral 7B Instruct - very fast, lightweight
  # - model_name: mistral-7b
  #   litellm_params:
  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1200
  #     tpm: 120000
  #
  # DeepSeek Coder 6.7B - code generation specialist
  # - model_name: deepseek-coder-6.7b
  #   litellm_params:
  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000
# =============================================================================
# Router Settings - Intelligent Model Selection
# =============================================================================
# Model aliases for easy switching in Open WebUI
model_name_map:
  # Default model (self-hosted, fast)
  gpt-3.5-turbo: llama-3.1-8b
  # Power users can use Claude for complex tasks
  gpt-4: claude-sonnet-4.5
  gpt-4-turbo: claude-sonnet-4.5
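
# With these aliases, a client that requests "gpt-4" is actually served by
# claude-sonnet-4.5. A quick check through the OpenAI-compatible API; the proxy
# hostname and port below are assumptions for this deployment:
#
#   import os
#   from openai import OpenAI
#   client = OpenAI(base_url="http://litellm:4000/v1",
#                   api_key=os.environ["LITELLM_MASTER_KEY"])
#   resp = client.chat.completions.create(
#       model="gpt-4",   # alias above -> claude-sonnet-4.5
#       messages=[{"role": "user", "content": "ping"}])
#   print(resp.model)    # shows which underlying model answered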
# LiteLLM Settings
litellm_settings:
  drop_params: true
  set_verbose: false    # Disable verbose logging for better performance
  # Enable caching with Redis for better performance
  cache: true
  cache_params:
    type: redis
    host: redis
    port: 6379
    ttl: 3600           # Cache for 1 hour
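  # `host: redis` assumes a Redis service reachable under that hostname on the same
  # Docker network as the proxy; a minimal sketch of such a service (image tag is an
  # assumption):
  #
  #   services:
  #     redis:
  #       image: redis:7-alpine
  #       restart: unless-stopped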
  # Failures allowed before a deployment is cooled down
  allowed_fails: 0
  # Modify params before sending to the provider
  modify_params: true
  # Keep logging callbacks empty to minimize overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks
# Router Settings
router_settings:
  allowed_fails: 0
  # Routing strategy: try self-hosted first, fall back to Claude on failure
  routing_strategy: simple-shuffle
  # Cooldown for failed models
  cooldown_time: 30     # seconds
  # Drop unsupported parameters
  default_litellm_params:
    drop_params: true
# General Settings
general_settings:
  disable_responses_id_security: true
  # Spend tracking (disabling it would reduce database overhead)
  disable_spend_logs: false         # Keep enabled to track API vs GPU costs
  # Disable tag tracking
  disable_tag_tracking: true
  # Daily spend aggregation
  disable_daily_spend_logs: false   # Keep enabled for cost analysis
  # Master key for authentication (set via env var)
  master_key: os.environ/LITELLM_MASTER_KEY
  # Database for logging (optional but recommended for cost tracking)
  database_url: os.environ/DATABASE_URL
  # Enable OpenAPI docs
  docs_url: /docs
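
# Every request to the proxy must carry the master key (or a virtual key issued
# from it). A quick authenticated smoke test, listing the model names and aliases
# clients may use (hostname and port are assumptions):
#
#   import os
#   from openai import OpenAI
#   client = OpenAI(base_url="http://litellm:4000/v1",
#                   api_key=os.environ["LITELLM_MASTER_KEY"])
#   for m in client.models.list():
#       print(m.id)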
# =============================================================================
# Usage Guidelines (for Open WebUI users)
# =============================================================================
#
# Model Selection Guide:
#
# Use llama-3.1-8b for:
# - General chat and Q&A
# - Simple code generation
# - Data extraction
# - Summarization
# - Translation
# - Most routine tasks
# Cost: ~$0/month (self-hosted)
# Speed: ~50-80 tokens/second
#
# Use qwen-2.5-14b for:
# - Complex reasoning
# - Multi-step problems
# - Advanced code generation
# - Multilingual tasks
# Cost: ~$0/month (self-hosted)
# Speed: ~30-50 tokens/second
#
# Use claude-sonnet-4.5 for:
# - Very complex reasoning
# - Long documents (200K context)
# - Production-critical code
# - When quality matters most
# Cost: ~$3/million input tokens, ~$15/million output tokens
# Speed: ~30-40 tokens/second
#
# Use claude-3-haiku for:
# - API fallback (if the self-hosted server is down)
# - Very fast responses needed
# Cost: ~$0.25/million input tokens, ~$1.25/million output tokens
# Speed: ~60-80 tokens/second
#
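# Because the proxy speaks the OpenAI API, model selection from scripts is just the
# `model` parameter. A minimal sketch with the OpenAI Python client (base URL and
# key handling are assumptions for this deployment):
#
#   import os
#   from openai import OpenAI
#   client = OpenAI(base_url="http://litellm:4000/v1",
#                   api_key=os.environ["LITELLM_MASTER_KEY"])
#   resp = client.chat.completions.create(
#       model="llama-3.1-8b",          # routine task -> self-hosted
#       # model="claude-sonnet-4.5",   # complex or production-critical -> API
#       messages=[{"role": "user", "content": "Summarize the following changelog: ..."}])
#   print(resp.choices[0].message.content)
#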
# =============================================================================
# Health Check Configuration
health_check:
  # Check the vLLM health endpoint
  enabled: true
  interval: 30    # seconds
  timeout: 5      # seconds
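
# The vLLM OpenAI server also exposes /health directly; it can be probed manually
# from the proxy host over the VPN (a minimal sketch):
#
#   import requests
#   print(requests.get("http://10.8.0.2:8000/health", timeout=5).status_code)  # 200 = healthy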
# Fallback Configuration
# If the GPU server is down, automatically use Claude
fallback:
  - ["llama-3.1-8b", "claude-3-haiku"]
  - ["qwen-2.5-14b", "claude-sonnet-4.5"]