diff --git a/start-vllm.sh b/start-vllm.sh
new file mode 100755
index 0000000..ee613a9
--- /dev/null
+++ b/start-vllm.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Start vLLM server with OpenAI-compatible API
+# This uses vLLM's built-in server which properly handles streaming
+set -euo pipefail
+
+# exec replaces this wrapper shell with the server process so that signals
+# (e.g. SIGTERM from docker/k8s) reach vLLM directly for graceful shutdown.
+exec vllm serve Qwen/Qwen2.5-7B-Instruct \
+  --host 0.0.0.0 \
+  --port 9000 \
+  --tensor-parallel-size 1 \
+  --gpu-memory-utilization 0.85 \
+  --max-model-len 4096 \
+  --download-dir /workspace/huggingface_cache \
+  --trust-remote-code