fix: improve streaming with proper delta format and increase max_model_len to 32768
@@ -73,11 +73,11 @@ async def startup_event():
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
         gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=4096,  # Context length
+        max_model_len=32768,  # Context length (increased for LLMX)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
-        enforce_eager=True,  # Enable eager execution for proper streaming
+        enforce_eager=False,  # Disable eager mode for better streaming
         disable_log_stats=True,  # Disable log stats for better streaming performance
     )
 
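For context, these keyword arguments correspond to vLLM's AsyncEngineArgs. A minimal sketch of how the engine is typically constructed from them (the surrounding startup_event body is not shown in this diff, so the wiring below is an assumption; only the keyword arguments are taken from the commit):

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# Assumed wiring around the arguments shown in the hunk above.
engine_args = AsyncEngineArgs(
    model=model_name,  # model_name is defined elsewhere in the file
    tensor_parallel_size=1,
    gpu_memory_utilization=0.85,
    max_model_len=32768,   # new value from this commit
    dtype="auto",
    download_dir="/workspace/huggingface_cache",
    trust_remote_code=True,
    enforce_eager=False,   # new value: permits CUDA graph capture
    disable_log_stats=True,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)

Setting enforce_eager=False lets vLLM capture CUDA graphs, trading slower startup for lower per-token latency during streaming; the old enforce_eager=True skipped graph capture entirely.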
@@ -241,10 +241,20 @@ async def create_chat_completion(request: ChatCompletionRequest):
     # Streaming response
     async def generate_stream():
         previous_text = ""
+        first_chunk = True
         async for output in engine.generate(prompt, sampling_params, request_id):
             current_text = output.outputs[0].text
             delta_text = current_text[len(previous_text):]
             previous_text = current_text
+
+            # Build delta object
+            delta = {}
+            if first_chunk:
+                delta["role"] = "assistant"
+                first_chunk = False
+            if delta_text:
+                delta["content"] = delta_text
+
             chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
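The delta computation above relies on vLLM's engine.generate yielding the cumulative generated text on each iteration, so slicing off the previously seen prefix produces the incremental piece. A standalone illustration with made-up outputs standing in for the RequestOutput stream:

previous_text = ""
for current_text in ["He", "Hell", "Hello!"]:
    delta_text = current_text[len(previous_text):]
    previous_text = current_text
    print(repr(delta_text))  # prints 'He', then 'll', then 'o!'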
@@ -253,7 +263,7 @@ async def create_chat_completion(request: ChatCompletionRequest):
                 "choices": [
                     {
                         "index": 0,
-                        "delta": {"content": delta_text} if delta_text else {},
+                        "delta": delta,
                         "finish_reason": output.outputs[0].finish_reason,
                     }
                 ]
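With the new delta object, the first streamed chunk carries {"role": "assistant"} (plus any initial content) and later chunks carry only {"content": ...}, matching the OpenAI chat.completion.chunk convention, so off-the-shelf clients can consume the stream. A sketch of such a client; the base URL, route, and model name are placeholders and an assumption that the server exposes the usual /v1 routes, none of which appear in this diff:

from openai import OpenAI

# Assumes the server exposes the standard /v1 chat completions route.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
stream = client.chat.completions.create(
    model="my-model",  # placeholder
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta.content:
        print(delta.content, end="", flush=True)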