fix: improve streaming with proper delta format and increase max_model_len to 32768

2025-11-23 15:38:18 +01:00
parent 3f812704a2
commit 5cfd03f1ef


@@ -73,11 +73,11 @@ async def startup_event():
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
         gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=4096,  # Context length
+        max_model_len=32768,  # Context length (increased for LLMX)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
-        enforce_eager=True,  # Enable eager execution for proper streaming
+        enforce_eager=False,  # Disable eager mode for better streaming
         disable_log_stats=True,  # Disable log stats for better streaming performance
     )
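For reference, here is a minimal sketch of how these arguments typically fit together in a vLLM async server. The names model_name and the startup hook follow the snippet above; the AsyncEngineArgs/AsyncLLMEngine pattern is vLLM's standard async API, but this is an illustration of the configuration, not the repository's exact file:

    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine

    # Sketch: engine startup with the settings from this commit.
    engine_args = AsyncEngineArgs(
        model=model_name,
        tensor_parallel_size=1,
        gpu_memory_utilization=0.85,
        max_model_len=32768,   # larger context window; KV cache must fit in the 85% budget
        dtype="auto",
        download_dir="/workspace/huggingface_cache",
        trust_remote_code=True,
        enforce_eager=False,   # eager off lets vLLM capture CUDA graphs
        disable_log_stats=True,
    )
    engine = AsyncLLMEngine.from_engine_args(engine_args)

Note the trade-off behind enforce_eager=False: CUDA graph capture adds startup time but reduces per-token overhead, which is what matters for streaming throughput. The larger max_model_len also raises KV-cache pressure, so it only works if the model plus cache fit within the 0.85 GPU memory utilization budget.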
@@ -241,10 +241,20 @@ async def create_chat_completion(request: ChatCompletionRequest):
     # Streaming response
     async def generate_stream():
         previous_text = ""
+        first_chunk = True
         async for output in engine.generate(prompt, sampling_params, request_id):
             current_text = output.outputs[0].text
             delta_text = current_text[len(previous_text):]
             previous_text = current_text
+
+            # Build delta object
+            delta = {}
+            if first_chunk:
+                delta["role"] = "assistant"
+                first_chunk = False
+            if delta_text:
+                delta["content"] = delta_text
+
             chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
@@ -253,7 +253,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                 "choices": [
                     {
                         "index": 0,
-                        "delta": {"content": delta_text} if delta_text else {},
+                        "delta": delta,
                         "finish_reason": output.outputs[0].finish_reason,
                     }
                 ]
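Assembled, the streaming path now looks roughly like the sketch below. The "created" and "model" fields fill the lines elided between the two hunks, and the data:/[DONE] SSE framing follows the OpenAI streaming convention, so treat those parts as assumptions rather than the file's exact contents; prompt, sampling_params, request_id, and model_name come from the surrounding handler:

    import json
    import time

    async def generate_stream():
        # First chunk announces the assistant role; later chunks stream only
        # the newly generated text; the SSE stream is closed with [DONE].
        previous_text = ""
        first_chunk = True
        async for output in engine.generate(prompt, sampling_params, request_id):
            current_text = output.outputs[0].text
            delta_text = current_text[len(previous_text):]
            previous_text = current_text

            delta = {}
            if first_chunk:
                delta["role"] = "assistant"
                first_chunk = False
            if delta_text:
                delta["content"] = delta_text

            chunk = {
                "id": request_id,
                "object": "chat.completion.chunk",
                "created": int(time.time()),   # assumed: elided between the hunks
                "model": model_name,           # assumed: elided between the hunks
                "choices": [
                    {
                        "index": 0,
                        "delta": delta,
                        "finish_reason": output.outputs[0].finish_reason,
                    }
                ],
            }
            yield f"data: {json.dumps(chunk)}\n\n"
        yield "data: [DONE]\n\n"

This is what "proper delta format" buys: OpenAI-compatible clients expect the first streamed delta to carry "role": "assistant" before any content, whereas the old code only ever emitted content deltas.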