fix: implement incremental streaming deltas for vLLM chat completions
- Track previous_text to calculate deltas instead of sending the full accumulated text
- Fixes WebUI streaming issue where responses appeared empty
- Only send the newly generated tokens in each SSE chunk delta
- Restores OpenAI API compatibility for streaming chat completions
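For context on why the delta matters: an OpenAI-compatible streaming client rebuilds the message by concatenating delta.content from every chunk, so each chunk must carry only the text generated since the previous chunk. The standalone sketch below (not part of this change; the stream_deltas helper is illustrative only) shows the technique the diff applies inside generate_stream.

# Illustrative sketch, separate from the diff below: why clients need true deltas.
# An OpenAI-compatible client concatenates every chunk's delta.content, so each
# chunk must contain only the newly generated suffix.

def stream_deltas(accumulated_texts):
    """Yield only the new suffix of each progressively longer output."""
    previous_text = ""
    for current_text in accumulated_texts:
        delta_text = current_text[len(previous_text):]
        previous_text = current_text
        if delta_text:
            yield delta_text

# vLLM's engine yields the full accumulated text on every generation step:
steps = ["Hel", "Hello", "Hello, wor", "Hello, world!"]

# The client rebuilds the response by concatenating deltas:
assert "".join(stream_deltas(steps)) == "Hello, world!"

# Sending the accumulated text instead would duplicate earlier tokens:
assert "".join(steps) != "Hello, world!"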
@@ -239,7 +239,11 @@ async def create_chat_completion(request: ChatCompletionRequest):
     if request.stream:
         # Streaming response
         async def generate_stream():
+            previous_text = ""
             async for output in engine.generate(prompt, sampling_params, request_id):
+                current_text = output.outputs[0].text
+                delta_text = current_text[len(previous_text):]
+                previous_text = current_text
                 chunk = {
                     "id": request_id,
                     "object": "chat.completion.chunk",
@@ -248,7 +252,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                     "choices": [
                         {
                             "index": 0,
-                            "delta": {"content": output.outputs[0].text},
+                            "delta": {"content": delta_text} if delta_text else {},
                             "finish_reason": output.outputs[0].finish_reason,
                         }
                     ]
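The SSE framing around these chunks sits outside the hunks above. As a rough, hypothetical sketch of how such chunk dicts are typically wrapped for an OpenAI-compatible stream (the sse_wrap name is illustrative; the actual endpoint code is not shown in this commit):

# Hypothetical sketch of SSE framing around the chunks above; not part of the diff.
import json

async def sse_wrap(chunks):
    """Frame each chunk dict as a server-sent event, then signal completion."""
    async for chunk in chunks:
        yield f"data: {json.dumps(chunk)}\n\n"
    # OpenAI-compatible streams terminate with a literal [DONE] sentinel.
    yield "data: [DONE]\n\n"

# In a FastAPI handler, the stream would typically be returned roughly as:
# return StreamingResponse(sse_wrap(generate_stream()), media_type="text/event-stream")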