fix: implement incremental streaming deltas for vLLM chat completions
- Track previous_text to calculate deltas instead of sending the full accumulated text
- Fixes a WebUI streaming issue where responses appeared empty
- Only send the new tokens in each SSE chunk delta
- Restores OpenAI API compatibility for streaming chat completions
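With incremental deltas, an OpenAI-compatible client (such as the WebUI mentioned above) reconstructs the full reply by concatenating the `delta.content` of each chunk. A minimal client-side check, assuming the server is reachable at a local /v1 base URL (the URL, model name, and API key below are placeholders, not part of this commit):

from openai import OpenAI

# Placeholder endpoint/credentials; point these at wherever the vLLM server is exposed.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

stream = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)

full_text = ""
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:  # empty deltas (e.g. the final chunk) carry no content
        full_text += delta
        print(delta, end="", flush=True)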
@@ -239,7 +239,11 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.stream:
         # Streaming response
         async def generate_stream():
+            previous_text = ""
             async for output in engine.generate(prompt, sampling_params, request_id):
+                current_text = output.outputs[0].text
+                delta_text = current_text[len(previous_text):]
+                previous_text = current_text
                 chunk = {
                     "id": request_id,
                     "object": "chat.completion.chunk",
@@ -248,7 +252,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {"content": output.outputs[0].text},
+                            "delta": {"content": delta_text} if delta_text else {},
                             "finish_reason": output.outputs[0].finish_reason,
                         }
                     ]
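The hunks stop before the chunks are actually emitted. For context, a sketch of how the rest of the streaming branch presumably frames each chunk as a server-sent event and returns the response, following the usual FastAPI/OpenAI conventions; the yield and return lines are assumptions, not part of the shown diff, and `engine`, `prompt`, `sampling_params`, and `request_id` are the same objects used above:

import json

from fastapi.responses import StreamingResponse

# Assumed continuation of create_chat_completion's streaming branch.
async def generate_stream():
    previous_text = ""
    async for output in engine.generate(prompt, sampling_params, request_id):
        current_text = output.outputs[0].text
        delta_text = current_text[len(previous_text):]
        previous_text = current_text
        chunk = {
            "id": request_id,
            "object": "chat.completion.chunk",
            "choices": [
                {
                    "index": 0,
                    "delta": {"content": delta_text} if delta_text else {},
                    "finish_reason": output.outputs[0].finish_reason,
                }
            ],
        }
        # Each chunk is framed as a server-sent event: "data: <json>\n\n".
        yield f"data: {json.dumps(chunk)}\n\n"
    # OpenAI-compatible streams terminate with a [DONE] sentinel.
    yield "data: [DONE]\n\n"

# Inside create_chat_completion:
#     return StreamingResponse(generate_stream(), media_type="text/event-stream")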