fix: implement incremental streaming deltas for vLLM chat completions

- Track previous_text to calculate deltas instead of sending full accumulated text
- Fixes WebUI streaming issue where responses appeared empty
- Only send new tokens in each SSE chunk delta
- Restores OpenAI API compatibility for streaming chat completions
2025-11-21 17:23:18 +01:00
parent 57b706abe6
commit d21caa56bc
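
The core of the change is the suffix-slicing delta pattern shown in the diff below. As a standalone illustration (the sample strings here are made up and are not real vLLM output; vLLM yields the full accumulated text on each iteration, so the new suffix is sliced off each time):

# Minimal sketch of the delta-tracking pattern used by this fix.
previous_text = ""
for current_text in ["Hel", "Hello", "Hello, wor", "Hello, world"]:
    delta_text = current_text[len(previous_text):]  # only the newly generated suffix
    previous_text = current_text
    print(repr(delta_text))  # 'Hel' -> 'lo' -> ', wor' -> 'ld'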


@@ -239,7 +239,11 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.stream:
         # Streaming response
         async def generate_stream():
+            previous_text = ""
             async for output in engine.generate(prompt, sampling_params, request_id):
+                current_text = output.outputs[0].text
+                delta_text = current_text[len(previous_text):]
+                previous_text = current_text
                 chunk = {
                     "id": request_id,
                     "object": "chat.completion.chunk",
@@ -248,7 +252,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {"content": output.outputs[0].text},
+                            "delta": {"content": delta_text} if delta_text else {},
                            "finish_reason": output.outputs[0].finish_reason,
                         }
                     ]