# docker-compose/ai/litellm-config.yaml

model_list:
  # ===========================================================================
  # ANTHROPIC MODELS
  # ===========================================================================
  # Each entry maps a short alias (model_name) to a dated Anthropic model ID.
  # Every entry references the same ANTHROPIC_API_KEY environment variable.

  # Claude Sonnet 4
  - model_name: 'claude-sonnet-4'
    litellm_params:
      model: 'anthropic/claude-sonnet-4-20250514'
      api_key: 'os.environ/ANTHROPIC_API_KEY'

  # Claude Sonnet 4.5
  - model_name: 'claude-sonnet-4.5'
    litellm_params:
      model: 'anthropic/claude-sonnet-4-5-20250929'
      api_key: 'os.environ/ANTHROPIC_API_KEY'

  # Claude 3.5 Sonnet
  - model_name: 'claude-3-5-sonnet'
    litellm_params:
      model: 'anthropic/claude-3-5-sonnet-20241022'
      api_key: 'os.environ/ANTHROPIC_API_KEY'

  # Claude 3 Opus
  - model_name: 'claude-3-opus'
    litellm_params:
      model: 'anthropic/claude-3-opus-20240229'
      api_key: 'os.environ/ANTHROPIC_API_KEY'

  # Claude 3 Haiku
  - model_name: 'claude-3-haiku'
    litellm_params:
      model: 'anthropic/claude-3-haiku-20240307'
      api_key: 'os.environ/ANTHROPIC_API_KEY'

  # ===========================================================================
  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
  # ===========================================================================
  # All requests route through orchestrator (port 9000) which manages model loading

  # Text Generation
  # Qwen 2.5 7B served by vLLM behind the orchestrator.
  - model_name: qwen-2.5-7b
    litellm_params:
      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
      # NOTE(review): LiteLLM expands `os.environ/NAME` only when it is the
      # entire value. Embedded in a URL like this it is presumably sent as the
      # literal host "os.environ" unless a deploy step substitutes it first.
      # Confirm, and if not substituted, export the full URL in one variable
      # and reference it as `api_base: os.environ/<VAR>`.
      api_base: http://os.environ/GPU_TAILSCALE_IP:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy  # Placeholder; presumably not checked by the orchestrator (confirm)
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      # The LiteLLM option is `supports_system_message` (singular). The plural
      # spelling used previously is not a recognised option, so the intended
      # system-message handling never took effect.
      supports_system_message: false  # vLLM handles system messages differently
      stream: true  # Enable streaming by default

  # Llama 3.1 8B served by vLLM behind the orchestrator.
  - model_name: llama-3.1-8b
    litellm_params:
      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ for vLLM via orchestrator
      # NOTE(review): LiteLLM expands `os.environ/NAME` only when it is the
      # entire value. Embedded in a URL like this it is presumably sent as the
      # literal host "os.environ" unless a deploy step substitutes it first.
      # Confirm, and if not substituted, export the full URL in one variable
      # and reference it as `api_base: os.environ/<VAR>`.
      api_base: http://os.environ/GPU_TAILSCALE_IP:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy  # Placeholder; presumably not checked by the orchestrator (confirm)
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      # The LiteLLM option is `supports_system_message` (singular); the plural
      # spelling used previously is not a recognised option.
      supports_system_message: true  # Llama supports system messages
      stream: true  # Enable streaming by default

# Library-level LiteLLM options: parameter handling, logging, response caching
# and callbacks.
litellm_settings:
  drop_params: false  # DISABLED: Was breaking streaming
  # NOTE(review): verbose logging may include request contents; consider
  # turning it off once the streaming investigation is finished.
  set_verbose: true  # Enable verbose logging for debugging streaming issues
  # Enable caching now that streaming is fixed
  cache: true
  # Redis-backed cache. `core_redis` is presumably the Redis service name on
  # the compose network (confirm).
  cache_params:
    type: redis
    host: core_redis
    port: 6379
    ttl: 3600  # Cache for 1 hour
  # Zero tolerated failures.
  # NOTE(review): this key does not strip parameters; presumably it is the
  # number of failures tolerated before a deployment is cooled down. Confirm
  # against the LiteLLM docs.
  allowed_fails: 0
  # Modify params before sending to provider
  modify_params: false  # DISABLED: Was breaking streaming
  # No success or failure callbacks are registered (both lists are empty).
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks

# Router-level settings.
router_settings:
  # Zero tolerated failures; same value as litellm_settings.allowed_fails.
  allowed_fails: 0

# Default per-request parameters. Parameter dropping is kept OFF here
# (drop_params is false), matching litellm_settings.drop_params.
# NOTE(review): confirm that LiteLLM reads `default_litellm_params` as a
# top-level key; it may need to live under router_settings to take effect.
default_litellm_params:
  drop_params: false  # DISABLED: Was breaking streaming

# Proxy-wide settings; every flag in this section turns a feature off.
general_settings:
  # NOTE(review): presumably relaxes the ownership check on Responses API
  # response IDs - confirm against the LiteLLM docs before relying on it.
  disable_responses_id_security: true
  # Disable spend tracking to reduce database overhead
  disable_spend_logs: true
  # Disable tag tracking
  # NOTE(review): verify the deployed LiteLLM version recognises this key.
  disable_tag_tracking: true
  # Disable daily spend updates
  # NOTE(review): verify the deployed LiteLLM version recognises this key.
  disable_daily_spend_logs: true