fix: implement incremental streaming deltas for vLLM chat completions

- Track previous_text to calculate deltas instead of sending full accumulated text
- Fixes WebUI streaming issue where responses appeared empty
- Only send new tokens in each SSE chunk delta
- Restores OpenAI API compatibility for streaming chat completions
2025-11-21 17:23:18 +01:00
parent 57b706abe6
commit d21caa56bc
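
The core of the change is the delta computation. As a minimal standalone sketch (the iter_deltas helper and the snapshot strings below are illustrative, not part of the server code), the idea is to slice off only the text generated since the previous engine output; the last assertion shows what a client that concatenates delta.content would have received from the old cumulative chunks.

def iter_deltas(cumulative_outputs):
    """Yield only the newly generated text for each cumulative snapshot."""
    previous_text = ""
    for current_text in cumulative_outputs:
        delta_text = current_text[len(previous_text):]
        previous_text = current_text
        if delta_text:
            yield delta_text

# Illustrative cumulative snapshots, mimicking an engine that returns the
# full accumulated text on every iteration.
snapshots = ["Hel", "Hello", "Hello, wor", "Hello, world!"]

# New behavior: concatenating the per-chunk deltas reconstructs the text once.
assert "".join(iter_deltas(snapshots)) == "Hello, world!"

# Old behavior: sending the cumulative text as the "delta" repeats earlier tokens.
assert "".join(snapshots) == "HelHelloHello, worHello, world!"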


@@ -239,7 +239,11 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.stream:
         # Streaming response
         async def generate_stream():
+            previous_text = ""
             async for output in engine.generate(prompt, sampling_params, request_id):
+                current_text = output.outputs[0].text
+                delta_text = current_text[len(previous_text):]
+                previous_text = current_text
                 chunk = {
                     "id": request_id,
                     "object": "chat.completion.chunk",
@@ -248,7 +252,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {"content": output.outputs[0].text},
+                            "delta": {"content": delta_text} if delta_text else {},
                             "finish_reason": output.outputs[0].finish_reason,
                         }
                     ]
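
As a rough client-side check (a sketch, not part of the commit: the base URL, API key, and model name are placeholders for a local OpenAI-compatible deployment), a standard openai v1+ client should now reconstruct the full response by concatenating the per-chunk deltas.

from openai import OpenAI

# Placeholder connection details; adjust to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

stream = client.chat.completions.create(
    model="local-model",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)

# With incremental deltas, simple concatenation yields the response text;
# with the old cumulative chunks it would have repeated earlier tokens.
reply = ""
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        reply += delta
print(reply)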