fix: enable eager execution for proper token streaming in vLLM
- Set enforce_eager=True to disable CUDA graphs, which were batching outputs
- Add disable_log_stats=True for better streaming performance
- This ensures AsyncLLMEngine yields tokens incrementally instead of returning the complete response
@@ -77,7 +77,8 @@ async def startup_event():
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
-        enforce_eager=False,  # Use CUDA graphs for better performance
+        enforce_eager=True,  # Enable eager execution for proper streaming
+        disable_log_stats=True,  # Disable log stats for better streaming performance
     )
 
     # Create async engine
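For context, here is a minimal sketch of the streaming setup this change targets. It is an illustration, not the repository's actual code: the model name, sampling parameters, and the stream_completion helper are assumptions, while the engine arguments mirror the ones in the diff above.

```python
# Hedged sketch: illustrative only. Model name, sampling params, and the
# stream_completion helper are assumptions; engine args mirror the diff.
import uuid

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="meta-llama/Llama-3.1-8B-Instruct",     # hypothetical model choice
    dtype="auto",                                  # Auto-detect dtype
    download_dir="/workspace/huggingface_cache",   # Large disk
    trust_remote_code=True,                        # Some models require this
    enforce_eager=True,      # skip CUDA graph capture so tokens surface as generated
    disable_log_stats=True,  # drop per-step stats logging overhead
)
engine = AsyncLLMEngine.from_engine_args(engine_args)

async def stream_completion(prompt: str):
    """Yield only the newly generated text on each engine step."""
    params = SamplingParams(max_tokens=256, temperature=0.7)
    request_id = str(uuid.uuid4())
    previous = ""
    # engine.generate is an async generator; each item is a cumulative
    # RequestOutput, so the delta since the last yield is the new text.
    async for output in engine.generate(prompt, params, request_id):
        text = output.outputs[0].text
        yield text[len(previous):]
        previous = text
```

With enforce_eager=False, CUDA graph capture can batch work so that the async generator produces few, large updates; running eagerly trades some throughput for outputs that arrive token by token, which is what the streaming endpoint needs.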