chore: remove qwen
This commit is contained in:
@@ -29,65 +29,52 @@ model_list:
  # ===========================================================================
  # Direct connections to dedicated vLLM servers (no orchestrator)

  # Text Generation - Qwen 2.5 7B (Port 8000)
  - model_name: qwen-2.5-7b
    litellm_params:
      model: hosted_vllm/openai/qwen-2.5-7b  # hosted_vllm/openai/ prefix for proper streaming
      api_base: os.environ/GPU_VLLM_QWEN_URL  # Direct to vLLM Qwen server
      api_key: "EMPTY"  # vLLM doesn't validate API keys
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: true  # Qwen supports system messages
      stream: true  # Enable streaming by default

  # Text Generation - Llama 3.1 8B (Port 8001)
  - model_name: llama-3.1-8b
    litellm_params:
      model: hosted_vllm/openai/llama-3.1-8b  # hosted_vllm/openai/ prefix for proper streaming
      api_base: os.environ/GPU_VLLM_LLAMA_URL  # Direct to vLLM Llama server
      api_key: "EMPTY"  # vLLM doesn't validate API keys
      rpm: 1000
      tpm: 100000
      timeout: 600  # 10 minutes for generation
      stream_timeout: 600
      supports_system_messages: true  # Llama supports system messages
      stream: true  # Enable streaming by default

  # Embeddings - BGE Large (Port 8002)
  - model_name: bge-large-en-v1.5
    litellm_params:
      model: hosted_vllm/openai/bge-large-en-v1.5
      api_base: os.environ/GPU_VLLM_EMBED_URL  # Direct to vLLM embedding server
      api_key: "EMPTY"
      rpm: 1000
      tpm: 500000
litellm_settings:
  drop_params: false  # DISABLED: Was breaking streaming
  set_verbose: true  # Enable verbose logging for debugging streaming issues
  # Enable caching now that streaming is fixed
  cache: true
  cache_params:
    type: redis
    host: core_redis
    port: 6379
    ttl: 3600  # Cache for 1 hour
  # Force strip specific parameters globally
  allowed_fails: 0
  # Modify params before sending to provider
  modify_params: false  # DISABLED: Was breaking streaming
  # Enable success and failure logging but minimize overhead
  success_callback: []  # Disable all success callbacks to reduce DB writes
  failure_callback: []  # Disable all failure callbacks
router_settings:
  allowed_fails: 0

  # Drop unsupported parameters
  # NOTE(review): nesting of default_litellm_params under router_settings is
  # inferred from the diff's ordering — confirm against the deployed config.
  default_litellm_params:
    drop_params: false  # DISABLED: Was breaking streaming

general_settings:
  disable_responses_id_security: true
||||
Reference in New Issue
Block a user