From d21caa56bc9de2ba9227e5ddf0b53321b0ac9211 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?=
Date: Fri, 21 Nov 2025 17:23:18 +0100
Subject: [PATCH] fix: implement incremental streaming deltas for vLLM chat completions

- Track previous_text to calculate deltas instead of sending the full accumulated text
- Fixes WebUI streaming issue where responses appeared empty
- Only send new tokens in each SSE chunk delta
- Resolves OpenAI API compatibility for streaming chat completions
---
 models/vllm/server.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/models/vllm/server.py b/models/vllm/server.py
index 0075bd2..d7ae9b5 100644
--- a/models/vllm/server.py
+++ b/models/vllm/server.py
@@ -239,7 +239,11 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.stream:
         # Streaming response
         async def generate_stream():
+            previous_text = ""
             async for output in engine.generate(prompt, sampling_params, request_id):
+                current_text = output.outputs[0].text
+                delta_text = current_text[len(previous_text):]
+                previous_text = current_text
                 chunk = {
                     "id": request_id,
                     "object": "chat.completion.chunk",
@@ -248,7 +252,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {"content": output.outputs[0].text},
+                            "delta": {"content": delta_text} if delta_text else {},
                            "finish_reason": output.outputs[0].finish_reason,
                         }
                     ]
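
Note (not part of the patch): the fix relies on vLLM's engine yielding the full accumulated text on each iteration, so the server must slice off only the newly generated suffix before emitting an SSE chunk. Below is a minimal standalone sketch of that delta logic; the helper name `iter_deltas` and the sample strings are illustrative, not from the repository.

```python
def iter_deltas(cumulative_outputs):
    """Yield only the newly appended text from a stream of cumulative strings."""
    previous_text = ""
    for current_text in cumulative_outputs:
        # Same slicing as in generate_stream(): keep only the new suffix.
        delta_text = current_text[len(previous_text):]
        previous_text = current_text
        if delta_text:
            yield delta_text

# Example: the engine reports ever-growing text; clients should only see new tokens.
chunks = list(iter_deltas(["Hel", "Hello", "Hello, wor", "Hello, world!"]))
assert chunks == ["Hel", "lo", ", wor", "ld!"]
assert "".join(chunks) == "Hello, world!"
```

Concatenating the streamed deltas reproduces the full response, which is what OpenAI-compatible clients (including the WebUI) expect from `chat.completion.chunk` events.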