From cc0f55df38b24e5bccb7c98903e057b2dc4ef101 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Sun, 23 Nov 2025 15:43:37 +0100
Subject: [PATCH] fix: reduce max_model_len to 20000 to fit in 24GB VRAM

---
 vllm/server.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/server.py b/vllm/server.py
index 5cbd621..e4da19d 100644
--- a/vllm/server.py
+++ b/vllm/server.py
@@ -72,8 +72,8 @@ async def startup_event():
     engine_args = AsyncEngineArgs(
         model=model_name,
         tensor_parallel_size=1, # Single GPU
-        gpu_memory_utilization=0.85, # Use 85% of GPU memory
-        max_model_len=32768, # Context length (increased for LLMX)
+        gpu_memory_utilization=0.90, # Use 90% of GPU memory
+        max_model_len=20000, # Context length (balanced for 24GB VRAM)
         dtype="auto", # Auto-detect dtype
         download_dir="/workspace/huggingface_cache", # Large disk
         trust_remote_code=True, # Some models require this
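
Note on the sizing rationale: the KV cache grows linearly with max_model_len, so trimming the
context is the usual lever when a model no longer fits alongside its weights in 24GB. The sketch
below is a rough, purely illustrative back-of-the-envelope check; the layer count, KV-head count,
head dimension, and fp16 cache dtype are hypothetical assumptions, not values taken from the
model actually served by vllm/server.py.

    # kv_sizing_sketch.py -- rough KV-cache estimate (illustrative assumptions only)

    def kv_cache_bytes_per_token(num_layers: int, num_kv_heads: int,
                                 head_dim: int, bytes_per_elem: int = 2) -> int:
        # One K and one V vector per layer, per KV head, per token.
        return 2 * num_layers * num_kv_heads * head_dim * bytes_per_elem

    if __name__ == "__main__":
        # Hypothetical 7B-class model with full multi-head attention:
        # 32 layers, 32 KV heads, head dim 128, fp16 cache (all assumptions).
        per_token = kv_cache_bytes_per_token(32, 32, 128)
        for ctx in (32768, 20000):
            gib = ctx * per_token / 1024**3
            print(f"max_model_len={ctx}: ~{gib:.1f} GiB of KV cache")
        # Whatever is left of 0.90 * 24 GiB after weights and activations has to
        # cover this, which is why the 32768-token setting can fail to fit.

Under those assumptions the cache costs roughly 16 GiB at 32768 tokens versus about 10 GiB at
20000, which is the kind of headroom difference this patch is trading context length for. Actual
numbers depend on the served model's architecture, quantization, and vLLM's own overheads.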