# docker-compose/ai/litellm-config.yaml

model_list:
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY
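
  # Example client call (sketch, not part of the config itself): clients send the
  # model_name values above to the proxy's OpenAI-compatible endpoint, e.g. assuming
  # the proxy listens on LiteLLM's default port 4000 with a proxy-issued virtual key:
  #   curl http://<litellm-host>:4000/v1/chat/completions \
  #     -H "Authorization: Bearer <litellm-virtual-key>" \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "claude-sonnet-4.5", "messages": [{"role": "user", "content": "Hello"}]}'
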
  # ===========================================================================
  # SELF-HOSTED MODELS - DIRECT vLLM SERVERS (GPU Server via Tailscale VPN)
  # ===========================================================================
  # Direct connections to dedicated vLLM servers (no orchestrator)

  # Text Generation - Qwen 2.5 7B (Port 8000)
  - model_name: qwen-2.5-7b
    litellm_params:
      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ prefix for proper streaming
      api_base: os.environ/GPU_VLLM_QWEN_URL  # Direct to vLLM Qwen server
      api_key: "EMPTY"  # vLLM doesn't validate API keys
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: true  # Qwen supports system messages
      stream: true  # Enable streaming by default

  # Text Generation - Llama 3.1 8B (Port 8001)
  - model_name: llama-3.1-8b
    litellm_params:
      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ prefix for proper streaming
      api_base: os.environ/GPU_VLLM_LLAMA_URL  # Direct to vLLM Llama server
      api_key: "EMPTY"  # vLLM doesn't validate API keys
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: true  # Llama supports system messages
      stream: true  # Enable streaming by default
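
  # The api_base values above are resolved from environment variables at startup.
  # A minimal sketch of the expected values (assumption - the actual host is the GPU
  # server's Tailscale address; ports follow the comments above, and the trailing /v1
  # depends on how the vLLM servers expose their OpenAI-compatible routes):
  #   GPU_VLLM_QWEN_URL=http://<gpu-tailscale-ip>:8000/v1
  #   GPU_VLLM_LLAMA_URL=http://<gpu-tailscale-ip>:8001/v1
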
litellm_settings:
  drop_params: false  # DISABLED: Was breaking streaming
  set_verbose: true  # Enable verbose logging for debugging streaming issues

  # Enable caching now that streaming is fixed
  cache: true
  cache_params:
    type: redis
    host: core_redis
    port: 6379
    ttl: 3600  # Cache for 1 hour
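
  # Note (assumption): "core_redis" is the Redis service name on the shared Docker
  # network from the surrounding compose stack; repeated identical requests are
  # served from cache for up to 1 hour instead of calling the provider again.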

  # Failures tolerated before a deployment is cooled down
  allowed_fails: 0

  # Modify params before sending to provider
  modify_params: false  # DISABLED: Was breaking streaming

  # Disable success and failure callbacks to minimize logging overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks

router_settings:
  allowed_fails: 0
  # Drop unsupported parameters
  default_litellm_params:
    drop_params: false  # DISABLED: Was breaking streaming

general_settings:
  disable_responses_id_security: true
  # Disable spend tracking to reduce database overhead
  disable_spend_logs: true
  # Disable tag tracking
  disable_tag_tracking: true
  # Disable daily spend updates
  disable_daily_spend_logs: true
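
# How this file is typically consumed (sketch - the exact mount path and command live
# in the docker-compose service definition, not in this file):
#   litellm --config /path/to/litellm-config.yaml --port 4000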