diff --git a/vllm/server.py b/vllm/server.py index 5cbd621..e4da19d 100644 --- a/vllm/server.py +++ b/vllm/server.py @@ -72,8 +72,8 @@ async def startup_event(): engine_args = AsyncEngineArgs( model=model_name, tensor_parallel_size=1, # Single GPU - gpu_memory_utilization=0.85, # Use 85% of GPU memory - max_model_len=32768, # Context length (increased for LLMX) + gpu_memory_utilization=0.90, # Use 90% of GPU memory + max_model_len=20000, # Context length (balanced for 24GB VRAM) dtype="auto", # Auto-detect dtype download_dir="/workspace/huggingface_cache", # Large disk trust_remote_code=True, # Some models require this