From 7fc945e179cc365f8b405e2be58f39d3748094b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Sun, 23 Nov 2025 16:10:20 +0100
Subject: [PATCH] fix: update LiteLLM config for direct vLLM server access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace orchestrator routing with direct vLLM server connections
- Qwen 2.5 7B on port 8000 (GPU_VLLM_QWEN_URL)
- Llama 3.1 8B on port 8001 (GPU_VLLM_LLAMA_URL)
- Simplify architecture by removing orchestrator proxy layer

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 ai/litellm-config.yaml | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/ai/litellm-config.yaml b/ai/litellm-config.yaml
index 4a61bf0..b43f433 100644
--- a/ai/litellm-config.yaml
+++ b/ai/litellm-config.yaml
@@ -25,15 +25,15 @@ model_list:
       api_key: os.environ/ANTHROPIC_API_KEY
 
   # ===========================================================================
-  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
+  # SELF-HOSTED MODELS - DIRECT vLLM SERVERS (GPU Server via Tailscale VPN)
   # ===========================================================================
-  # All requests route through orchestrator (port 9000) which manages model loading
+  # Direct connections to dedicated vLLM servers (no orchestrator)
 
-  # Text Generation
+  # Text Generation - Qwen 2.5 7B (Port 8000)
   - model_name: qwen-2.5-7b
     litellm_params:
-      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_QWEN_URL  # Direct to vLLM Qwen server
       api_key: dummy
       rpm: 1000
       tpm: 100000
@@ -42,10 +42,11 @@ model_list:
       supports_system_messages: true  # Qwen supports system messages
       stream: true  # Enable streaming by default
 
+  # Text Generation - Llama 3.1 8B (Port 8001)
   - model_name: llama-3.1-8b
     litellm_params:
-      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_LLAMA_URL  # Direct to vLLM Llama server
       api_key: dummy
       rpm: 1000
       tpm: 100000
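
Deployment note: after this change the proxy no longer reads GPU_VLLM_API_URL; it
expects two environment variables, one per dedicated vLLM server. A minimal sketch
of what the proxy environment might look like, assuming the GPU server is reachable
over Tailscale as "gpu-host" (the hostname and the /v1 path suffix are illustrative
placeholders, not taken from this patch; only the variable names and ports come from
the diff and commit message):

    # Illustrative environment for the LiteLLM proxy
    GPU_VLLM_QWEN_URL=http://gpu-host:8000/v1    # Qwen 2.5 7B vLLM server
    GPU_VLLM_LLAMA_URL=http://gpu-host:8001/v1   # Llama 3.1 8B vLLM server

Clients continue to request the unchanged model names ("qwen-2.5-7b", "llama-3.1-8b")
from the LiteLLM proxy; only the upstream routing behind those names changes.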