Initial commit: RunPod multi-modal AI orchestration stack
- Multi-modal AI infrastructure for RunPod RTX 4090
- Automatic model orchestration (text, image, music)
- Text: vLLM + Qwen 2.5 7B Instruct
- Image: Flux.1 Schnell via OpenEDAI
- Music: MusicGen Medium via AudioCraft
- Cost-optimized sequential loading on single GPU
- Template preparation scripts for rapid deployment
- Comprehensive documentation (README, DEPLOYMENT, TEMPLATE)
litellm-config-gpu.yaml (new file, 199 lines)
@@ -0,0 +1,199 @@
# LiteLLM Configuration with GPU Server Integration
# This config includes both Anthropic Claude (API) and self-hosted models (vLLM on GPU server)

model_list:
  # =============================================================================
  # Anthropic Claude Models (API-based, for complex reasoning)
  # =============================================================================

  - model_name: claude-sonnet-4
    litellm_params:
      model: anthropic/claude-sonnet-4-20250514
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-sonnet-4.5
    litellm_params:
      model: anthropic/claude-sonnet-4-5-20250929
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-5-sonnet
    litellm_params:
      model: anthropic/claude-3-5-sonnet-20241022
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-opus
    litellm_params:
      model: anthropic/claude-3-opus-20240229
      api_key: os.environ/ANTHROPIC_API_KEY

  - model_name: claude-3-haiku
    litellm_params:
      model: anthropic/claude-3-haiku-20240307
      api_key: os.environ/ANTHROPIC_API_KEY

  # =============================================================================
  # Self-Hosted Models (vLLM on GPU server via WireGuard VPN)
  # =============================================================================

  # Llama 3.1 8B Instruct - Fast, general-purpose, good for routine tasks
  - model_name: llama-3.1-8b
    litellm_params:
      model: openai/meta-llama/Meta-Llama-3.1-8B-Instruct
      api_base: http://10.8.0.2:8000/v1
      api_key: dummy   # vLLM doesn't require auth
      rpm: 1000        # Rate limit: requests per minute
      tpm: 100000      # Rate limit: tokens per minute
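  # Example client call through the proxy (a minimal sketch: it assumes the
  # LiteLLM proxy is reachable on localhost:4000, its default port, and that the
  # LITELLM_MASTER_KEY value is passed as the API key; the model name comes from
  # this file):
  #
  #   from openai import OpenAI
  #
  #   client = OpenAI(base_url="http://localhost:4000/v1",
  #                   api_key="<LITELLM_MASTER_KEY value>")
  #   resp = client.chat.completions.create(
  #       model="llama-3.1-8b",  # or "gpt-3.5-turbo" via model_name_map below
  #       messages=[{"role": "user", "content": "Summarize: ..."}],
  #   )
  #   print(resp.choices[0].message.content)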

  # Alternative models (uncomment and configure on GPU server as needed)

  # Qwen 2.5 14B Instruct - Excellent multilingual, stronger reasoning
  # - model_name: qwen-2.5-14b
  #   litellm_params:
  #     model: openai/Qwen/Qwen2.5-14B-Instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 800
  #     tpm: 80000

  # Mistral 7B Instruct - Very fast, lightweight
  # - model_name: mistral-7b
  #   litellm_params:
  #     model: openai/mistralai/Mistral-7B-Instruct-v0.3
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1200
  #     tpm: 120000

  # DeepSeek Coder 6.7B - Code generation specialist
  # - model_name: deepseek-coder-6.7b
  #   litellm_params:
  #     model: openai/deepseek-ai/deepseek-coder-6.7b-instruct
  #     api_base: http://10.8.0.2:8000/v1
  #     api_key: dummy
  #     rpm: 1000
  #     tpm: 100000

# =============================================================================
# Router Settings - Intelligent Model Selection
# =============================================================================

# Model aliases for easy switching in Open WebUI
model_name_map:
  # Default model (self-hosted, fast)
  gpt-3.5-turbo: llama-3.1-8b

  # Power users can use Claude for complex tasks
  gpt-4: claude-sonnet-4.5
  gpt-4-turbo: claude-sonnet-4.5
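  # e.g. an Open WebUI request addressed to "gpt-4" is transparently served by
  # claude-sonnet-4.5; clients keep using the familiar OpenAI model names.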

# LiteLLM Settings
litellm_settings:
  drop_params: true
  set_verbose: false   # Disable verbose logging for better performance

  # Enable caching with Redis for better performance
  cache: true
  cache_params:
    type: redis
    host: redis
    port: 6379
    ttl: 3600   # Cache for 1 hour
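    # Intended effect: repeated identical requests arriving within the 1-hour
    # TTL are answered from Redis instead of re-hitting the GPU server or the
    # Anthropic API.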

  # Cool down a deployment after any failure
  allowed_fails: 0

  # Modify params before sending to provider
  modify_params: true

  # Keep logging hooks off to minimize overhead
  success_callback: []   # Disable all success callbacks to reduce DB writes
  failure_callback: []   # Disable all failure callbacks

# Router Settings
router_settings:
  allowed_fails: 0

  # Routing strategy: try self-hosted first, fall back to Claude on failure
  routing_strategy: simple-shuffle

  # Cooldown for failed models
  cooldown_time: 30   # seconds

  # Drop unsupported parameters
  default_litellm_params:
    drop_params: true

# General Settings
general_settings:
  disable_responses_id_security: true

  # Spend tracking (kept enabled to track API vs GPU costs)
  disable_spend_logs: false

  # Disable tag tracking
  disable_tag_tracking: true

  # Daily spend updates (kept enabled for cost analysis)
  disable_daily_spend_logs: false

  # Master key for authentication (set via env var)
  master_key: os.environ/LITELLM_MASTER_KEY

  # Database for logging (optional but recommended for cost tracking)
  database_url: os.environ/DATABASE_URL

  # Enable OpenAPI docs
  docs_url: /docs

# =============================================================================
# Usage Guidelines (for Open WebUI users)
# =============================================================================
#
# Model Selection Guide:
#
# Use llama-3.1-8b for:
#   - General chat and Q&A
#   - Simple code generation
#   - Data extraction
#   - Summarization
#   - Translation
#   - Most routine tasks
#   Cost:  ~$0/month (self-hosted)
#   Speed: ~50-80 tokens/second
#
# Use qwen-2.5-14b for:
#   - Complex reasoning
#   - Multi-step problems
#   - Advanced code generation
#   - Multilingual tasks
#   Cost:  ~$0/month (self-hosted)
#   Speed: ~30-50 tokens/second
#
# Use claude-sonnet-4.5 for:
#   - Very complex reasoning
#   - Long documents (200K context)
#   - Production-critical code
#   - When quality matters most
#   Cost:  ~$3/million input tokens, ~$15/million output tokens
#          (worked example below)
#   Speed: ~30-40 tokens/second
#
# Use claude-3-haiku for:
#   - API fallback (if self-hosted is down)
#   - Very fast responses
#   Cost:  ~$0.25/million input tokens, ~$1.25/million output tokens
#   Speed: ~60-80 tokens/second
#
# =============================================================================
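#
# Worked cost example (a sketch using the rates above): a claude-sonnet-4.5
# call with 2,000 input tokens and 500 output tokens costs roughly
#   2,000 x $3/1M + 500 x $15/1M = $0.006 + $0.0075 = $0.0135
# The same call on llama-3.1-8b incurs no per-token API cost.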

# Health Check Configuration
health_check:
  # Check vLLM health endpoint
  enabled: true
  interval: 30   # seconds
  timeout: 5     # seconds
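  # Manual liveness probe against the vLLM server (an illustrative sketch;
  # vLLM's OpenAI-compatible server exposes a GET /health endpoint):
  #
  #   import requests
  #   assert requests.get("http://10.8.0.2:8000/health", timeout=5).ok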

# Fallback Configuration
# If the GPU server is down, automatically use Claude
fallback:
  - ["llama-3.1-8b", "claude-3-haiku"]
  - ["qwen-2.5-14b", "claude-sonnet-4.5"]