diff --git a/vllm/server.py b/vllm/server.py
index 36dbd03..5cbd621 100644
--- a/vllm/server.py
+++ b/vllm/server.py
@@ -73,11 +73,11 @@ async def startup_event():
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
         gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=4096,  # Context length
+        max_model_len=32768,  # Context length (increased for LLMX)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
-        enforce_eager=True,  # Enable eager execution for proper streaming
+        enforce_eager=False,  # Disable eager mode for better streaming
         disable_log_stats=True,  # Disable log stats for better streaming performance
     )
@@ -241,10 +241,20 @@ async def create_chat_completion(request: ChatCompletionRequest):
     # Streaming response
     async def generate_stream():
         previous_text = ""
+        first_chunk = True
         async for output in engine.generate(prompt, sampling_params, request_id):
             current_text = output.outputs[0].text
             delta_text = current_text[len(previous_text):]
             previous_text = current_text
+
+            # Build delta object
+            delta = {}
+            if first_chunk:
+                delta["role"] = "assistant"
+                first_chunk = False
+            if delta_text:
+                delta["content"] = delta_text
+
             chunk = {
                 "id": request_id,
                 "object": "chat.completion.chunk",
@@ -253,7 +263,7 @@
                 "choices": [
                     {
                         "index": 0,
-                        "delta": {"content": delta_text} if delta_text else {},
+                        "delta": delta,
                         "finish_reason": output.outputs[0].finish_reason,
                     }
                 ]
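
With this change, the first streamed chunk carries `"role": "assistant"` and later chunks carry `"content"` deltas, matching the OpenAI chat-completion streaming shape. Below is a minimal client sketch for consuming such a stream. It assumes the endpoint is mounted at an OpenAI-style `POST /v1/chat/completions`, streams Server-Sent Events (`data: {...}` lines terminated by `data: [DONE]`), and runs on `localhost:8000`; none of those details (nor the model name) appear in the diff, so treat them as placeholders.

```python
# Sketch of a streaming client; endpoint path, port, model name, and the
# "data: ... / data: [DONE]" SSE framing are assumptions, not taken from the diff.
import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",  # hypothetical route
    json={
        "model": "my-model",  # hypothetical model name
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
    },
    stream=True,
)

for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    delta = chunk["choices"][0]["delta"]
    # First chunk announces the role; subsequent chunks append content deltas.
    if "role" in delta:
        print(f"[{delta['role']}] ", end="")
    if "content" in delta:
        print(delta["content"], end="", flush=True)
print()
```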