Initial commit: RunPod multi-modal AI orchestration stack
- Multi-modal AI infrastructure for RunPod RTX 4090
- Automatic model orchestration (text, image, music)
- Text: vLLM + Qwen 2.5 7B Instruct
- Image: Flux.1 Schnell via OpenEDAI
- Music: MusicGen Medium via AudioCraft
- Cost-optimized sequential loading on single GPU
- Template preparation scripts for rapid deployment
- Comprehensive documentation (README, DEPLOYMENT, TEMPLATE)
This commit is contained in:
91
litellm-config.yaml
Normal file
91
litellm-config.yaml
Normal file
@@ -0,0 +1,91 @@
|
||||
model_list:
|
||||
- model_name: claude-sonnet-4
|
||||
litellm_params:
|
||||
model: anthropic/claude-sonnet-4-20250514
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
|
||||
- model_name: claude-sonnet-4.5
|
||||
litellm_params:
|
||||
model: anthropic/claude-sonnet-4-5-20250929
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
|
||||
- model_name: claude-3-5-sonnet
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-5-sonnet-20241022
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
|
||||
- model_name: claude-3-opus
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-opus-20240229
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
|
||||
- model_name: claude-3-haiku
|
||||
litellm_params:
|
||||
model: anthropic/claude-3-haiku-20240307
|
||||
api_key: os.environ/ANTHROPIC_API_KEY
|
||||
|
||||
# ===========================================================================
|
||||
# SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
|
||||
# ===========================================================================
|
||||
# All requests route through orchestrator (port 9000) which manages model loading
|
||||
|
||||
# Text Generation
|
||||
- model_name: qwen-2.5-7b
|
||||
litellm_params:
|
||||
model: openai/qwen-2.5-7b
|
||||
api_base: http://100.100.108.13:9000/v1 # Orchestrator endpoint
|
||||
api_key: dummy
|
||||
rpm: 1000
|
||||
tpm: 100000
|
||||
|
||||
# Image Generation
|
||||
- model_name: flux-schnell
|
||||
litellm_params:
|
||||
model: openai/dall-e-3 # OpenAI-compatible mapping
|
||||
api_base: http://100.100.108.13:9000/v1 # Orchestrator endpoint
|
||||
api_key: dummy
|
||||
rpm: 100
|
||||
max_parallel_requests: 3
|
||||
|
||||
# Music Generation
|
||||
- model_name: musicgen-medium
|
||||
litellm_params:
|
||||
model: openai/musicgen-medium
|
||||
api_base: http://100.100.108.13:9000/v1 # Orchestrator endpoint
|
||||
api_key: dummy
|
||||
rpm: 50
|
||||
max_parallel_requests: 1
|
||||
|
||||
litellm_settings:
|
||||
drop_params: true
|
||||
set_verbose: false # Disable verbose logging for better performance
|
||||
# Enable caching with Redis for better performance
|
||||
cache: true
|
||||
cache_params:
|
||||
type: redis
|
||||
host: redis
|
||||
port: 6379
|
||||
ttl: 3600 # Cache for 1 hour
|
||||
# Force strip specific parameters globally
|
||||
allowed_fails: 0
|
||||
# Modify params before sending to provider
|
||||
modify_params: true
|
||||
# Enable success and failure logging but minimize overhead
|
||||
success_callback: [] # Disable all success callbacks to reduce DB writes
|
||||
failure_callback: [] # Disable all failure callbacks
|
||||
|
||||
router_settings:
|
||||
allowed_fails: 0
|
||||
|
||||
# Drop unsupported parameters
|
||||
default_litellm_params:
|
||||
drop_params: true
|
||||
|
||||
general_settings:
|
||||
disable_responses_id_security: true
|
||||
# Disable spend tracking to reduce database overhead
|
||||
disable_spend_logs: true
|
||||
# Disable tag tracking
|
||||
disable_tag_tracking: true
|
||||
# Disable daily spend updates
|
||||
disable_daily_spend_logs: true
|
||||
Reference in New Issue
Block a user