diff --git a/start-vllm.sh b/start-vllm.sh
new file mode 100755
index 0000000..ee613a9
--- /dev/null
+++ b/start-vllm.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Start vLLM server with OpenAI-compatible API
+# This uses vLLM's built-in server which properly handles streaming
+set -euo pipefail
+
+# exec replaces this wrapper shell with the server process so that signals
+# (e.g. SIGTERM from docker/k8s) reach vLLM directly for graceful shutdown.
+exec vllm serve Qwen/Qwen2.5-7B-Instruct \
+  --host 0.0.0.0 \
+  --port 9000 \
+  --tensor-parallel-size 1 \
+  --gpu-memory-utilization 0.85 \
+  --max-model-len 4096 \
+  --download-dir /workspace/huggingface_cache \
+  --trust-remote-code