# ===========================================================================
# LiteLLM proxy configuration
# File: docker-compose/ai/litellm-config.yaml
# ===========================================================================
# Models exposed by the proxy. Each entry maps a public model_name to the
# provider-specific litellm_params used to serve it.
model_list:
  # -------------------------------------------------------------------------
  # Anthropic hosted models (API key read from the environment by LiteLLM's
  # os.environ/ indirection)
  # -------------------------------------------------------------------------
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # ===========================================================================
  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
  # ===========================================================================
  # All requests route through orchestrator (port 9000) which manages model loading

  # Text Generation
  - model_name: qwen-2.5-7b
    litellm_params:
      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy  # backend does not authenticate; LiteLLM requires a value
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: false  # vLLM handles system messages differently
      stream: true  # Enable streaming by default

  # Image Generation
  - model_name: flux-schnell
    litellm_params:
      model: openai/dall-e-3  # OpenAI-compatible mapping
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy
      rpm: 100
      max_parallel_requests: 3

  # Music Generation
  - model_name: musicgen-medium
    litellm_params:
      model: openai/musicgen-medium
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy
      rpm: 50
      max_parallel_requests: 1  # heavy model: serialize requests
# Global LiteLLM behavior. Several features are deliberately disabled because
# they interfered with streaming responses.
litellm_settings:
  drop_params: false  # DISABLED: Was breaking streaming
  set_verbose: true  # Enable verbose logging for debugging streaming issues
  # Disable caching - it breaks streaming responses
  cache: false
  # cache_params:
  #   type: redis
  #   host: redis
  #   port: 6379
  #   ttl: 3600  # Cache for 1 hour
  # Number of failures tolerated before a deployment is cooled down
  # (0 = cool down on the first failure). NOTE(review): previous comment
  # ("force strip specific parameters") did not describe this key.
  allowed_fails: 0
  # Modify params before sending to provider
  modify_params: false  # DISABLED: Was breaking streaming
  # Enable success and failure logging but minimize overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks
# Router-level defaults applied to every deployment.
router_settings:
  allowed_fails: 0  # cool down a deployment on its first failure
  # Default litellm_params merged into each model entry
  default_litellm_params:
    drop_params: false  # DISABLED: Was breaking streaming
# Proxy-server settings: tracking/logging features turned off to minimize
# database overhead.
general_settings:
  disable_responses_id_security: true
  # Disable spend tracking to reduce database overhead
  disable_spend_logs: true
  # Disable tag tracking
  disable_tag_tracking: true
  # Disable daily spend updates
  disable_daily_spend_logs: true