fix: improve streaming with proper delta format and increase max_model_len to 32768
@@ -73,11 +73,11 @@ async def startup_event():
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
         gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=4096,  # Context length
+        max_model_len=32768,  # Context length (increased for LLMX)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
-        enforce_eager=True,  # Enable eager execution for proper streaming
+        enforce_eager=False,  # Disable eager mode for better streaming
         disable_log_stats=True,  # Disable log stats for better streaming performance
     )
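
The keyword arguments above line up with vLLM's AsyncEngineArgs. A minimal sketch of how the engine is likely constructed at startup (build_engine and its surroundings are assumptions; only the arguments appear in the diff):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

def build_engine(model_name: str) -> AsyncLLMEngine:
    # Hypothetical wrapper: the diff only shows these keyword arguments.
    engine_args = AsyncEngineArgs(
        model=model_name,
        tensor_parallel_size=1,                       # Single GPU
        gpu_memory_utilization=0.85,                  # Use 85% of GPU memory
        max_model_len=32768,                          # New, larger context window
        dtype="auto",
        download_dir="/workspace/huggingface_cache",
        trust_remote_code=True,
        enforce_eager=False,                          # Allow CUDA graph capture
        disable_log_stats=True,
    )
    return AsyncLLMEngine.from_engine_args(engine_args)

Note the trade-off in the two changed settings: enforce_eager=False lets vLLM capture CUDA graphs, which costs extra startup time and GPU memory but lowers per-token latency, and raising max_model_len to 32768 requires enough free GPU memory to hold the KV cache at that length.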
@@ -241,10 +241,20 @@ async def create_chat_completion(request: ChatCompletionRequest):
     # Streaming response
     async def generate_stream():
         previous_text = ""
+        first_chunk = True
         async for output in engine.generate(prompt, sampling_params, request_id):
             current_text = output.outputs[0].text
             delta_text = current_text[len(previous_text):]
             previous_text = current_text
 
+            # Build delta object
+            delta = {}
+            if first_chunk:
+                delta["role"] = "assistant"
+                first_chunk = False
+            if delta_text:
+                delta["content"] = delta_text
+
             chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
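
The diff stops just before the chunk is serialized. In OpenAI-compatible servers each chunk is typically sent as a Server-Sent Event, with the first delta carrying the assistant role and later deltas carrying only new text. A sketch of that framing (the json import, the yielding shown in comments, and the [DONE] sentinel are assumptions based on the OpenAI convention, not shown in this diff):

import json

def format_sse(chunk: dict) -> str:
    # One SSE event per chunk: a "data:" line terminated by a blank line.
    return f"data: {json.dumps(chunk)}\n\n"

# Inside generate_stream(), each chunk would be yielded as format_sse(chunk),
# and OpenAI-compatible servers end the stream with a literal sentinel:
# yield "data: [DONE]\n\n"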
@@ -253,7 +263,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
             "choices": [
                 {
                     "index": 0,
-                    "delta": {"content": delta_text} if delta_text else {},
+                    "delta": delta,
                     "finish_reason": output.outputs[0].finish_reason,
                 }
             ]
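
One way to verify the new delta format end to end, assuming the server exposes an OpenAI-compatible /v1 endpoint; the base_url, api_key, and model name below are placeholders:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
stream = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Say hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:  # The first chunk may carry only the assistant role
        print(delta.content, end="", flush=True)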