Files
docker-compose/ai/litellm-config.yaml

90 lines
3.0 KiB
YAML

model_list:
  # ===========================================================================
  # ANTHROPIC API MODELS (hosted by Anthropic, keyed via env var)
  # ===========================================================================
  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY
  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # ===========================================================================
  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
  # ===========================================================================
  # All requests route through orchestrator (port 9000) which manages model loading

  # Text Generation
  - model_name: qwen-2.5-7b
    litellm_params:
      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy  # placeholder — backend presumably does not enforce auth; verify
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: false  # vLLM handles system messages differently
      stream: true  # Enable streaming by default
  - model_name: llama-3.1-8b
    litellm_params:
      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ for vLLM via orchestrator
      api_base: http://100.121.199.88:9000/v1  # RunPod GPU via Tailscale
      api_key: dummy  # placeholder — backend presumably does not enforce auth; verify
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: true  # Llama supports system messages
      stream: true  # Enable streaming by default
litellm_settings:
  drop_params: false  # DISABLED: Was breaking streaming
  set_verbose: true  # Enable verbose logging for debugging streaming issues
  # Enable caching now that streaming is fixed
  cache: true
  cache_params:
    type: redis
    host: core_redis  # Redis service name on the shared Docker network
    port: 6379
    ttl: 3600  # Cache for 1 hour
  # NOTE(review): original comment said "force strip specific parameters globally",
  # but allowed_fails governs failure tolerance, not parameter stripping — verify intent
  allowed_fails: 0
  # Modify params before sending to provider
  modify_params: false  # DISABLED: Was breaking streaming
  # Enable success and failure logging but minimize overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks
router_settings:
  allowed_fails: 0
  # Defaults applied to every deployment the router manages
  default_litellm_params:
    drop_params: false  # DISABLED: dropping unsupported params was breaking streaming
general_settings:
  disable_responses_id_security: true
  # Disable spend tracking to reduce database overhead
  disable_spend_logs: true
  # Disable tag tracking
  disable_tag_tracking: true
  # Disable daily spend updates
  disable_daily_spend_logs: true