fix: reduce max_model_len to 20000 to fit in 24GB VRAM
@@ -72,8 +72,8 @@ async def startup_event():
     engine_args = AsyncEngineArgs(
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
-        gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=32768,  # Context length (increased for LLMX)
+        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
+        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
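For context, a minimal sketch of the startup hook this hunk touches, assuming a FastAPI app and vLLM's AsyncLLMEngine. The imports, the global `engine`, and where `model_name` comes from are not shown in the diff and are assumptions for illustration only.

import os

from fastapi import FastAPI
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

app = FastAPI()
engine = None
# Hypothetical: the real source of model_name is not visible in this hunk.
model_name = os.environ.get("MODEL_NAME", "")

@app.on_event("startup")
async def startup_event():
    global engine
    engine_args = AsyncEngineArgs(
        model=model_name,
        tensor_parallel_size=1,  # Single GPU
        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
        dtype="auto",  # Auto-detect dtype
        download_dir="/workspace/huggingface_cache",  # Large disk
        trust_remote_code=True,  # Some models require this
    )
    # Build the async engine once at startup; request handlers reuse it.
    engine = AsyncLLMEngine.from_engine_args(engine_args)

The trade-off in this commit: raising gpu_memory_utilization to 0.90 gives the KV cache more headroom, while capping max_model_len at 20000 keeps the cache small enough that the model and cache together fit in 24GB of VRAM.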