From 7fc945e179cc365f8b405e2be58f39d3748094b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Sun, 23 Nov 2025 16:10:20 +0100
Subject: [PATCH] fix: update LiteLLM config for direct vLLM server access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace orchestrator routing with direct vLLM server connections
- Qwen 2.5 7B on port 8000 (GPU_VLLM_QWEN_URL)
- Llama 3.1 8B on port 8001 (GPU_VLLM_LLAMA_URL)
- Simplify architecture by removing orchestrator proxy layer

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 ai/litellm-config.yaml | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/ai/litellm-config.yaml b/ai/litellm-config.yaml
index 4a61bf0..b43f433 100644
--- a/ai/litellm-config.yaml
+++ b/ai/litellm-config.yaml
@@ -25,15 +25,15 @@ model_list:
       api_key: os.environ/ANTHROPIC_API_KEY
 
   # ===========================================================================
-  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
+  # SELF-HOSTED MODELS - DIRECT vLLM SERVERS (GPU Server via Tailscale VPN)
   # ===========================================================================
-  # All requests route through orchestrator (port 9000) which manages model loading
+  # Direct connections to dedicated vLLM servers (no orchestrator)
 
-  # Text Generation
+  # Text Generation - Qwen 2.5 7B (Port 8000)
   - model_name: qwen-2.5-7b
     litellm_params:
-      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_QWEN_URL  # Direct to vLLM Qwen server
       api_key: dummy
       rpm: 1000
       tpm: 100000
@@ -42,10 +42,11 @@ model_list:
       supports_system_messages: true  # Qwen supports system messages
       stream: true  # Enable streaming by default
 
+  # Text Generation - Llama 3.1 8B (Port 8001)
   - model_name: llama-3.1-8b
     litellm_params:
-      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_LLAMA_URL  # Direct to vLLM Llama server
       api_key: dummy
       rpm: 1000
       tpm: 100000
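
Deployment note: after this change the proxy no longer reads GPU_VLLM_API_URL; it
expects two environment variables, one per dedicated vLLM server. A minimal sketch
of what the proxy environment might look like, assuming the GPU server is reachable
over Tailscale as "gpu-host" (the hostname and the /v1 path suffix are illustrative
placeholders, not taken from this patch; only the variable names and ports come from
the diff and commit message):

    # Illustrative environment for the LiteLLM proxy
    GPU_VLLM_QWEN_URL=http://gpu-host:8000/v1    # Qwen 2.5 7B vLLM server
    GPU_VLLM_LLAMA_URL=http://gpu-host:8001/v1   # Llama 3.1 8B vLLM server

Clients continue to request the unchanged model names ("qwen-2.5-7b", "llama-3.1-8b")
from the LiteLLM proxy; only the upstream routing behind those names changes.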