diff --git a/models/vllm/server.py b/models/vllm/server.py index d7ae9b5..823b852 100644 --- a/models/vllm/server.py +++ b/models/vllm/server.py @@ -77,7 +77,8 @@ async def startup_event(): dtype="auto", # Auto-detect dtype download_dir="/workspace/huggingface_cache", # Large disk trust_remote_code=True, # Some models require this - enforce_eager=False, # Use CUDA graphs for better performance + enforce_eager=True, # Enable eager execution for proper streaming + disable_log_stats=True, # Disable log stats for better streaming performance ) # Create async engine