From 7f1890517d3c1d87be7b8d0e73991eb7ba6e1f6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Fri, 21 Nov 2025 18:25:50 +0100
Subject: [PATCH] fix: enable eager execution for proper token streaming in
 vLLM

- Set enforce_eager=True to disable CUDA graphs, which were batching outputs
- Add disable_log_stats=True for better streaming performance
- This ensures AsyncLLMEngine yields tokens incrementally instead of
  returning the complete response
---
 models/vllm/server.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/models/vllm/server.py b/models/vllm/server.py
index d7ae9b5..823b852 100644
--- a/models/vllm/server.py
+++ b/models/vllm/server.py
@@ -77,7 +77,8 @@ async def startup_event():
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
-        enforce_eager=False,  # Use CUDA graphs for better performance
+        enforce_eager=True,  # Enable eager execution for proper streaming
+        disable_log_stats=True,  # Disable log stats for better streaming performance
     )

     # Create async engine
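
For context, below is a minimal sketch of how an AsyncLLMEngine built with the patched arguments is consumed so that tokens arrive incrementally rather than as one final response. The model id, prompt, and standalone wiring are illustrative assumptions, not part of the actual server.py; only enforce_eager and disable_log_stats come from the patch.

    # Hedged sketch: streaming consumption of AsyncLLMEngine with the patched
    # engine arguments. Model id and prompt are placeholders, not from the repo.
    import asyncio

    from vllm import SamplingParams
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine
    from vllm.utils import random_uuid


    async def stream_demo() -> None:
        engine_args = AsyncEngineArgs(
            model="facebook/opt-125m",   # placeholder model for illustration
            enforce_eager=True,          # matches the patch: no CUDA graphs
            disable_log_stats=True,      # matches the patch: skip per-step stats logging
        )
        engine = AsyncLLMEngine.from_engine_args(engine_args)

        sampling_params = SamplingParams(max_tokens=64, temperature=0.8)
        previous_text = ""

        # engine.generate() is an async generator; each RequestOutput carries the
        # cumulative text so far, so the delta is what a streaming client would send.
        async for request_output in engine.generate(
            "Explain token streaming in one sentence.",
            sampling_params,
            request_id=random_uuid(),
        ):
            new_text = request_output.outputs[0].text
            delta = new_text[len(previous_text):]
            previous_text = new_text
            print(delta, end="", flush=True)


    if __name__ == "__main__":
        asyncio.run(stream_demo())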