# LiteLLM Configuration with GPU Server Integration
# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server)

model_list:
  # =============================================================================
  # Anthropic Claude Models (API-based, for complex reasoning)
  # =============================================================================
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # =============================================================================
  # Self-Hosted Models (vLLM on GPU server via WireGuard VPN)
  # =============================================================================
  # An example launch command for the vLLM backend appears after the router
  # settings below.

  # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks
  - model_name: llama-3.1-8b
    litellm_params:
      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
      api_base: http://10.8.0.2:8000/v1
      api_key: dummy   # vLLM doesn't require auth
      rpm: 1000        # Rate limit: requests per minute
      tpm: 100000      # Rate limit: tokens per minute

  # Alternative models (uncomment and configure on GPU server as needed)

  # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning
  # - model_name: qwen-2.5-14b
  #   litellm_params:
  #     model: openai/Qwen/Qwen2.5-14B-Instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 800
  #     tpm: 80000

  # Mistral 7B Instruct - Very fast, lightweight
  # - model_name: mistral-7b
  #   litellm_params:
  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1200
  #     tpm: 120000

  # DeepSeek Coder 6.7B - Code generation specialist
  # - model_name: deepseek-coder-6.7b
  #   litellm_params:
  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000

# =============================================================================
# Router Settings - Intelligent Model Selection
# =============================================================================

# Model aliases for easy switching in Open WebUI
model_name_map:
  # Default model (self-hosted, fast)
  gpt-3.5-turbo: llama-3.1-8b
  # Power users can use Claude for complex tasks
  gpt-4: claude-sonnet-4.5
  gpt-4-turbo: claude-sonnet-4.5

# LiteLLM Settings
litellm_settings:
  drop_params: true
  set_verbose: false  # Disable verbose logging for better performance

  # Enable caching with Redis for better performance
  cache: true
  cache_params:
    type: redis
    host: redis
    port: 6379
    ttl: 3600  # Cache for 1 hour

  # Number of failures tolerated before a deployment is cooled down
  allowed_fails: 0

  # Modify params before sending to provider
  modify_params: true

  # Disable success/failure callbacks to minimize logging overhead
  success_callback: []  # No success callbacks, reducing DB writes
  failure_callback: []  # No failure callbacks

# Router Settings
router_settings:
  allowed_fails: 0

  # Routing strategy; failover to Claude is handled by the fallback pairs
  # at the end of this file
  routing_strategy: simple-shuffle

  # Cooldown for failed models
  cooldown_time: 30  # seconds

  # Drop unsupported parameters
  default_litellm_params:
    drop_params: true
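# Illustrative sketch (not part of this config): the self-hosted entries above
# assume an OpenAI-compatible vLLM server is already listening on the GPU
# box's WireGuard address (10.8.0.2:8000). One way to start it, assuming vLLM
# is installed on that host, is:
#
#   vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 8000
#
# The exact flags (GPU memory utilization, max context length, etc.) depend on
# the server's hardware and are not specified by this config.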
# General Settings
general_settings:
  disable_responses_id_security: true

  # Spend tracking: keep enabled to track API vs GPU costs
  disable_spend_logs: false

  # Disable tag tracking
  disable_tag_tracking: true

  # Daily spend updates: keep enabled for cost analysis
  disable_daily_spend_logs: false

  # Master key for authentication (set via env var)
  master_key: os.environ/LITELLM_MASTER_KEY

  # Database for logging (optional but recommended for cost tracking)
  database_url: os.environ/DATABASE_URL

  # Enable OpenAPI docs
  docs_url: /docs

# =============================================================================
# Usage Guidelines (for Open WebUI users)
# =============================================================================
#
# Model Selection Guide:
#
# Use llama-3.1-8b for:
#   - General chat and Q&A
#   - Simple code generation
#   - Data extraction
#   - Summarization
#   - Translation
#   - Most routine tasks
#   Cost: ~$0/month (self-hosted)
#   Speed: ~50-80 tokens/second
#
# Use qwen-2.5-14b for:
#   - Complex reasoning
#   - Multi-step problems
#   - Advanced code generation
#   - Multilingual tasks
#   Cost: ~$0/month (self-hosted)
#   Speed: ~30-50 tokens/second
#
# Use claude-sonnet-4.5 for:
#   - Very complex reasoning
#   - Long documents (200K context)
#   - Production-critical code
#   - When quality matters most
#   Cost: ~$3/million input tokens, ~$15/million output tokens
#   Speed: ~30-40 tokens/second
#
# Use claude-3-haiku for:
#   - API fallback (if self-hosted is down)
#   - Very fast responses
#   Cost: ~$0.25/million input tokens, ~$1.25/million output tokens
#   Speed: ~60-80 tokens/second
#
# =============================================================================

# Health Check Configuration
health_check:
  # Check vLLM health endpoint
  enabled: true
  interval: 30  # seconds
  timeout: 5    # seconds

# Fallback Configuration
# If the GPU server is down, automatically use Claude
fallback:
  - ["llama-3.1-8b", "claude-3-haiku"]
  - ["qwen-2.5-14b", "claude-sonnet-4.5"]
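
# Illustrative sketch (not part of this config): once the proxy is running,
# any OpenAI-compatible client can talk to it. Assuming the proxy listens on
# LiteLLM's default port 4000 on the same host, a quick smoke test is:
#
#   curl http://localhost:4000/v1/chat/completions \
#     -H "Authorization: Bearer $LITELLM_MASTER_KEY" \
#     -H "Content-Type: application/json" \
#     -d '{"model": "llama-3.1-8b",
#          "messages": [{"role": "user", "content": "Say hello"}]}'
#
# Swapping "llama-3.1-8b" for "claude-sonnet-4.5" (or one of the gpt-* aliases
# defined above) exercises the API-backed models instead of the GPU server.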