From 62fcf832da8e60238b12be114b36f1af754c4398 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Fri, 21 Nov 2025 18:38:31 +0100
Subject: [PATCH] feat: add direct RunPod orchestrator connection to WebUI for
 streaming bypass

- Configure WebUI with both LiteLLM and direct orchestrator API base URLs
- This bypasses LiteLLM's streaming issues for the qwen-2.5-7b model
- WebUI will now show models from both endpoints
- Allows testing if LiteLLM is the bottleneck for streaming

Related to streaming fix in RunPod models/vllm/server.py
---
 ai/compose.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ai/compose.yaml b/ai/compose.yaml
index de2bb0c..00a7afc 100644
--- a/ai/compose.yaml
+++ b/ai/compose.yaml
@@ -34,9 +34,9 @@ services:
       # Database configuration
       DATABASE_URL: postgresql://${AI_DB_USER}:${AI_DB_PASSWORD}@ai_postgres:5432/${AI_DB_NAME}

-      # OpenAI API configuration (pointing to LiteLLM proxy)
-      OPENAI_API_BASE_URLS: http://litellm:4000
-      OPENAI_API_KEYS: ${AI_LITELLM_API_KEY}
+      # OpenAI API configuration (pointing to LiteLLM proxy + direct RunPod orchestrator)
+      OPENAI_API_BASE_URLS: http://litellm:4000;http://100.121.199.88:9000/v1
+      OPENAI_API_KEYS: ${AI_LITELLM_API_KEY};dummy

       # Disable Ollama (we only use LiteLLM)
       ENABLE_OLLAMA_API: false
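
Note: a quick way to test the hypothesis behind this commit (that LiteLLM, not the model server, breaks streaming) is to stream a completion from the orchestrator directly, outside WebUI. The sketch below uses the openai Python client against the direct base URL and dummy key added in this patch; the model id qwen-2.5-7b is taken from the commit message and is an assumption about how the orchestrator names the model.

    # test_direct_stream.py - minimal streaming smoke test against the
    # RunPod orchestrator, bypassing LiteLLM entirely.
    from openai import OpenAI

    # Direct orchestrator endpoint and placeholder key from this patch.
    client = OpenAI(base_url="http://100.121.199.88:9000/v1", api_key="dummy")

    stream = client.chat.completions.create(
        model="qwen-2.5-7b",  # assumed id; check GET /v1/models on the orchestrator
        messages=[{"role": "user", "content": "Count from 1 to 10."}],
        stream=True,
    )

    # If tokens arrive incrementally here but not via LiteLLM,
    # the proxy is the bottleneck.
    for chunk in stream:
        delta = chunk.choices[0].delta.content
        if delta:
            print(delta, end="", flush=True)
    print()

If this prints token by token while the same request routed through http://litellm:4000 arrives as one block, the proxy configuration is the culprit, and the WebUI bypass can stay in place until LiteLLM's streaming handling is fixed.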