fix: update LiteLLM config for direct vLLM server access

- Replace orchestrator routing with direct vLLM server connections
- Qwen 2.5 7B on port 8000 (GPU_VLLM_QWEN_URL)
- Llama 3.1 8B on port 8001 (GPU_VLLM_LLAMA_URL)
- Simplify architecture by removing orchestrator proxy layer

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 7fc945e179
parent 94ab4ae6dd
Date: 2025-11-23 16:10:20 +01:00
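
Before pointing LiteLLM at the new endpoints, it can help to confirm that both vLLM servers answer directly. A minimal sketch in Python, assuming the two environment variables from the diff below are already exported and include the /v1 suffix; the gpu-server hostname fallbacks are placeholders, not part of this commit:

# Connectivity check for the two direct vLLM servers (sketch, not part of the config).
# GPU_VLLM_QWEN_URL / GPU_VLLM_LLAMA_URL come from the diff below; the fallback
# hostnames are assumptions, the ports follow the commit message.
import os
import requests

servers = {
    "GPU_VLLM_QWEN_URL": os.environ.get("GPU_VLLM_QWEN_URL", "http://gpu-server:8000/v1"),    # Qwen 2.5 7B
    "GPU_VLLM_LLAMA_URL": os.environ.get("GPU_VLLM_LLAMA_URL", "http://gpu-server:8001/v1"),  # Llama 3.1 8B
}

for var, base in servers.items():
    # vLLM exposes an OpenAI-compatible model listing at <api_base>/models
    resp = requests.get(f"{base}/models", timeout=5)
    resp.raise_for_status()
    model_ids = [m["id"] for m in resp.json().get("data", [])]
    print(f"{var} -> {base}: serving {model_ids}")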

@@ -25,15 +25,15 @@ model_list:
       api_key: os.environ/ANTHROPIC_API_KEY
   # ===========================================================================
-  # SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
+  # SELF-HOSTED MODELS - DIRECT vLLM SERVERS (GPU Server via Tailscale VPN)
   # ===========================================================================
-  # All requests route through orchestrator (port 9000) which manages model loading
-  # Text Generation
+  # Direct connections to dedicated vLLM servers (no orchestrator)
+  # Text Generation - Qwen 2.5 7B (Port 8000)
   - model_name: qwen-2.5-7b
     litellm_params:
-      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_QWEN_URL  # Direct to vLLM Qwen server
       api_key: dummy
       rpm: 1000
       tpm: 100000
@@ -42,10 +42,11 @@ model_list:
       supports_system_messages: true  # Qwen supports system messages
       stream: true  # Enable streaming by default
 
+  # Text Generation - Llama 3.1 8B (Port 8001)
   - model_name: llama-3.1-8b
     litellm_params:
-      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_LLAMA_URL  # Direct to vLLM Llama server
       api_key: dummy
       rpm: 1000
       tpm: 100000
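
From the client side nothing changes with this commit: requests still go to the LiteLLM proxy under the same model names, and the proxy now forwards them straight to the matching vLLM server. A rough usage sketch with the OpenAI client; the proxy URL and key below are placeholders, not values from this config:

# Exercise both models through the LiteLLM proxy (sketch). The base_url and
# api_key are placeholders; routing to the vLLM servers is handled by the
# config in the diff above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-placeholder")

for model in ("qwen-2.5-7b", "llama-3.1-8b"):
    print(f"--- {model} ---")
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Reply with one short sentence."}],
        stream=True,  # streaming is enabled by default in the config above
    )
    for chunk in stream:
        print(chunk.choices[0].delta.content or "", end="", flush=True)
    print()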