# docker-compose/ai/litellm-config-gpu.yaml
# LiteLLM Configuration with GPU Server Integration
# This config includes both Anthropic Claude models (API) and self-hosted models
# (vLLM on the GPU server).
model_list:
  # ===========================================================================
  # Anthropic Claude Models (API-based, for complex reasoning)
  # ===========================================================================
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY
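
  # Note: `os.environ/ANTHROPIC_API_KEY` tells LiteLLM to read the key from the
  # proxy container's environment at startup. A minimal sketch of how that might be
  # wired in the compose file (the `litellm` service name and .env layout are
  # assumptions, not the repository's actual compose definition):
  #
  #   services:
  #     litellm:
  #       env_file: .env    # .env contains ANTHROPIC_API_KEY=sk-ant-...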
  # ===========================================================================
  # Self-Hosted Models (vLLM on GPU server via WireGuard VPN)
  # ===========================================================================
  # Llama 3.1 8B Instruct - fast, general-purpose, good for routine tasks
  - model_name: llama-3.1-8b
    litellm_params:
      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
      api_base: http://10.8.0.2:8000/v1
      api_key: dummy    # vLLM doesn't require auth
      rpm: 1000         # Rate limit: requests per minute
      tpm: 100000       # Rate limit: tokens per minute
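
  # The entry above assumes a vLLM server exposing the OpenAI-compatible API on the
  # GPU host's WireGuard address (10.8.0.2:8000). A minimal sketch of such a service,
  # assuming the official vllm/vllm-openai image; the repository's
  # gpu-server-compose.yaml is the authoritative definition, and the image tag, flags
  # and GPU reservation below are illustrative assumptions:
  #
  #   services:
  #     vllm:
  #       image: vllm/vllm-openai:latest
  #       command: ["--model", "meta-llama/Meta-Llama-3.1-8B-Instruct",
  #                 "--gpu-memory-utilization", "0.90"]
  #       environment:
  #         - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}   # Llama weights are gated
  #       ports: ["8000:8000"]
  #       deploy:
  #         resources:
  #           reservations:
  #             devices:
  #               - driver: nvidia
  #                 count: all
  #                 capabilities: [gpu]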
  # Alternative models (uncomment and configure on the GPU server as needed)
  #
  # Qwen 2.5 14B Instruct - excellent multilingual support, stronger reasoning
  # - model_name: qwen-2.5-14b
  #   litellm_params:
  #     model: openai/Qwen/Qwen2.5-14B-Instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 800
  #     tpm: 80000
  #
  # Mistral 7B Instruct - very fast, lightweight
  # - model_name: mistral-7b
  #   litellm_params:
  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1200
  #     tpm: 120000
  #
  # DeepSeek Coder 6.7B - code generation specialist
  # - model_name: deepseek-coder-6.7b
  #   litellm_params:
  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000
# =============================================================================
# Router Settings - Intelligent Model Selection
# =============================================================================
# Model aliases for easy switching in Open WebUI
model_name_map:
  # Default model (self-hosted, fast)
  gpt-3.5-turbo: llama-3.1-8b
  # Power users can use Claude for complex tasks
  gpt-4: claude-sonnet-4.5
  gpt-4-turbo: claude-sonnet-4.5
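
# With these aliases, a client that requests "gpt-4" is actually served by
# claude-sonnet-4.5. A quick check through the OpenAI-compatible API; the proxy
# hostname and port below are assumptions for this deployment:
#
#   import os
#   from openai import OpenAI
#   client = OpenAI(base_url="http://litellm:4000/v1",
#                   api_key=os.environ["LITELLM_MASTER_KEY"])
#   resp = client.chat.completions.create(
#       model="gpt-4",   # alias above -> claude-sonnet-4.5
#       messages=[{"role": "user", "content": "ping"}])
#   print(resp.model)    # shows which underlying model answered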
# LiteLLM Settings
litellm_settings:
  drop_params: true
  set_verbose: false    # Disable verbose logging for better performance
  # Enable caching with Redis for better performance
  cache: true
  cache_params:
    type: redis
    host: redis
    port: 6379
    ttl: 3600           # Cache for 1 hour
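  # `host: redis` assumes a Redis service reachable under that hostname on the same
  # Docker network as the proxy; a minimal sketch of such a service (image tag is an
  # assumption):
  #
  #   services:
  #     redis:
  #       image: redis:7-alpine
  #       restart: unless-stopped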
  # Failures allowed before a deployment is cooled down
  allowed_fails: 0
  # Modify params before sending to the provider
  modify_params: true
  # Keep logging callbacks empty to minimize overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks
# Router Settings
router_settings:
  allowed_fails: 0
  # Routing strategy: try self-hosted first, fall back to Claude on failure
  routing_strategy: simple-shuffle
  # Cooldown for failed models
  cooldown_time: 30     # seconds
  # Drop unsupported parameters
  default_litellm_params:
    drop_params: true
# General Settings
general_settings:
  disable_responses_id_security: true
  # Spend tracking (disabling it would reduce database overhead)
  disable_spend_logs: false         # Keep enabled to track API vs GPU costs
  # Disable tag tracking
  disable_tag_tracking: true
  # Daily spend aggregation
  disable_daily_spend_logs: false   # Keep enabled for cost analysis
  # Master key for authentication (set via env var)
  master_key: os.environ/LITELLM_MASTER_KEY
  # Database for logging (optional but recommended for cost tracking)
  database_url: os.environ/DATABASE_URL
  # Enable OpenAPI docs
  docs_url: /docs
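
# Every request to the proxy must carry the master key (or a virtual key issued
# from it). A quick authenticated smoke test, listing the model names and aliases
# clients may use (hostname and port are assumptions):
#
#   import os
#   from openai import OpenAI
#   client = OpenAI(base_url="http://litellm:4000/v1",
#                   api_key=os.environ["LITELLM_MASTER_KEY"])
#   for m in client.models.list():
#       print(m.id)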
# =============================================================================
# Usage Guidelines (for Open WebUI users)
# =============================================================================
#
# Model Selection Guide:
#
# Use llama-3.1-8b for:
# - General chat and Q&A
# - Simple code generation
# - Data extraction
# - Summarization
# - Translation
# - Most routine tasks
# Cost: ~$0/month (self-hosted)
# Speed: ~50-80 tokens/second
#
# Use qwen-2.5-14b for:
# - Complex reasoning
# - Multi-step problems
# - Advanced code generation
# - Multilingual tasks
# Cost: ~$0/month (self-hosted)
# Speed: ~30-50 tokens/second
#
# Use claude-sonnet-4.5 for:
# - Very complex reasoning
# - Long documents (200K context)
# - Production-critical code
# - When quality matters most
# Cost: ~$3/million input tokens, ~$15/million output tokens
# Speed: ~30-40 tokens/second
#
# Use claude-3-haiku for:
# - API fallback (if the self-hosted server is down)
# - Very fast responses needed
# Cost: ~$0.25/million input tokens, ~$1.25/million output tokens
# Speed: ~60-80 tokens/second
#
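# Because the proxy speaks the OpenAI API, model selection from scripts is just the
# `model` parameter. A minimal sketch with the OpenAI Python client (base URL and
# key handling are assumptions for this deployment):
#
#   import os
#   from openai import OpenAI
#   client = OpenAI(base_url="http://litellm:4000/v1",
#                   api_key=os.environ["LITELLM_MASTER_KEY"])
#   resp = client.chat.completions.create(
#       model="llama-3.1-8b",          # routine task -> self-hosted
#       # model="claude-sonnet-4.5",   # complex or production-critical -> API
#       messages=[{"role": "user", "content": "Summarize the following changelog: ..."}])
#   print(resp.choices[0].message.content)
#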
# =============================================================================
# Health Check Configuration
health_check:
  # Check the vLLM health endpoint
  enabled: true
  interval: 30    # seconds
  timeout: 5      # seconds
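
# The vLLM OpenAI server also exposes /health directly; it can be probed manually
# from the proxy host over the VPN (a minimal sketch):
#
#   import requests
#   print(requests.get("http://10.8.0.2:8000/health", timeout=5).status_code)  # 200 = healthy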
# Fallback Configuration
# If the GPU server is down, automatically use Claude
fallback:
  - ["llama-3.1-8b", "claude-3-haiku"]
  - ["qwen-2.5-14b", "claude-sonnet-4.5"]