fix: update LiteLLM config for direct vLLM server access
- Replace orchestrator routing with direct vLLM server connections
  - Qwen 2.5 7B on port 8000 (GPU_VLLM_QWEN_URL)
  - Llama 3.1 8B on port 8001 (GPU_VLLM_LLAMA_URL)
- Simplify architecture by removing orchestrator proxy layer

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
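The new config reads two environment variables where the old one used a single GPU_VLLM_API_URL. Below is a minimal pre-flight sketch for the two dedicated servers, assuming both expose vLLM's OpenAI-compatible API under /v1; the Tailscale hostname is hypothetical, and only the ports (8000 for Qwen, 8001 for Llama) come from the commit message.

```python
# Hypothetical smoke test for the two dedicated vLLM servers the config now
# points at. The hostname is an assumption; replace with the real Tailscale
# address used by the LiteLLM proxy's environment.
import json
import os
import urllib.request

os.environ.setdefault("GPU_VLLM_QWEN_URL", "http://gpu-server.tailnet:8000/v1")
os.environ.setdefault("GPU_VLLM_LLAMA_URL", "http://gpu-server.tailnet:8001/v1")

for var in ("GPU_VLLM_QWEN_URL", "GPU_VLLM_LLAMA_URL"):
    base = os.environ[var].rstrip("/")
    # vLLM's OpenAI-compatible server lists its loaded model under /models.
    with urllib.request.urlopen(f"{base}/models", timeout=5) as resp:
        payload = json.load(resp)
    print(var, "->", [m["id"] for m in payload.get("data", [])])
```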
@@ -25,15 +25,15 @@ model_list:
       api_key: os.environ/ANTHROPIC_API_KEY
 
 # ===========================================================================
-# SELF-HOSTED MODELS VIA ORCHESTRATOR (GPU Server via Tailscale VPN)
+# SELF-HOSTED MODELS - DIRECT vLLM SERVERS (GPU Server via Tailscale VPN)
 # ===========================================================================
-# All requests route through orchestrator (port 9000) which manages model loading
+# Direct connections to dedicated vLLM servers (no orchestrator)
 
-  # Text Generation
+  # Text Generation - Qwen 2.5 7B (Port 8000)
   - model_name: qwen-2.5-7b
     litellm_params:
-      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_QWEN_URL  # Direct to vLLM Qwen server
       api_key: dummy
       rpm: 1000
       tpm: 100000
@@ -42,10 +42,11 @@ model_list:
       supports_system_messages: true  # Qwen supports system messages
       stream: true  # Enable streaming by default
 
+  # Text Generation - Llama 3.1 8B (Port 8001)
   - model_name: llama-3.1-8b
     litellm_params:
-      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ for vLLM via orchestrator
-      api_base: os.environ/GPU_VLLM_API_URL  # RunPod GPU via Tailscale
+      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ prefix for proper streaming
+      api_base: os.environ/GPU_VLLM_LLAMA_URL  # Direct to vLLM Llama server
       api_key: dummy
       rpm: 1000
       tpm: 100000
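For reference, a usage sketch against the proxy once it loads this config: it assumes the LiteLLM proxy runs locally on its default port 4000 and that the placeholder key is accepted; adjust both for the real deployment.

```python
# Minimal sketch of calling one of the proxied deployments through LiteLLM's
# OpenAI-compatible endpoint. Base URL and API key are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4000/v1", api_key="sk-placeholder")

# Streaming request against the Qwen deployment (backed by the vLLM server on port 8000).
stream = client.chat.completions.create(
    model="qwen-2.5-7b",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```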