# ===========================================================================
# LiteLLM proxy configuration
# File: docker-compose/ai/litellm-config.yaml
# ===========================================================================
# Models exposed by the proxy. Each entry maps a public model_name to the
# provider-specific litellm_params used to serve it.
model_list:
  # -------------------------------------------------------------------------
  # Anthropic hosted models (API key read from the environment by LiteLLM's
  # os.environ/ indirection)
  # -------------------------------------------------------------------------
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # ===========================================================================
  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
  # ===========================================================================
  # All requests route through orchestrator (port 9000) which manages model loading

  # Text Generation
  - model_name: qwen-2.5-7b
    litellm_params:
      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy  # backend does not authenticate; LiteLLM requires a value
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: false  # vLLM handles system messages differently
      stream: true  # Enable streaming by default

  # Image Generation
  - model_name: flux-schnell
    litellm_params:
      model: openai/dall-e-3  # OpenAI-compatible mapping
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy
      rpm: 100
      max_parallel_requests: 3

  # Music Generation
  - model_name: musicgen-medium
    litellm_params:
      model: openai/musicgen-medium
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy
      rpm: 50
      max_parallel_requests: 1  # heavy model: serialize requests
# Global LiteLLM behavior. Several features are deliberately disabled because
# they interfered with streaming responses.
litellm_settings:
  drop_params: false  # DISABLED: Was breaking streaming
  set_verbose: true  # Enable verbose logging for debugging streaming issues
  # Disable caching - it breaks streaming responses
  cache: false
  # cache_params:
  #   type: redis
  #   host: redis
  #   port: 6379
  #   ttl: 3600  # Cache for 1 hour
  # Number of failures tolerated before a deployment is cooled down
  # (0 = cool down on the first failure). NOTE(review): previous comment
  # ("force strip specific parameters") did not describe this key.
  allowed_fails: 0
  # Modify params before sending to provider
  modify_params: false  # DISABLED: Was breaking streaming
  # Enable success and failure logging but minimize overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks
# Router-level defaults applied to every deployment.
router_settings:
  allowed_fails: 0  # cool down a deployment on its first failure
  # Default litellm_params merged into each model entry
  default_litellm_params:
    drop_params: false  # DISABLED: Was breaking streaming
# Proxy-server settings: tracking/logging features turned off to minimize
# database overhead.
general_settings:
  disable_responses_id_security: true
  # Disable spend tracking to reduce database overhead
  disable_spend_logs: true
  # Disable tag tracking
  disable_tag_tracking: true
  # Disable daily spend updates
  disable_daily_spend_logs: true