feat: add vllm serve script with proper streaming support
start-vllm.sh (executable file, 12 additions)
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Start vLLM server with OpenAI-compatible API
+# This uses vLLM's built-in server which properly handles streaming
+
+vllm serve Qwen/Qwen2.5-7B-Instruct \
+    --host 0.0.0.0 \
+    --port 9000 \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization 0.85 \
+    --max-model-len 4096 \
+    --download-dir /workspace/huggingface_cache \
+    --trust-remote-code
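
Once the server is up, streaming can be verified against the OpenAI-compatible endpoint. A minimal sketch, assuming the server above is reachable at localhost:9000 and the model name matches what vllm serve loaded:

# Sketch: send a streaming chat request to the OpenAI-compatible API.
# Assumes the server started by start-vllm.sh is listening on port 9000.
curl http://localhost:9000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Qwen/Qwen2.5-7B-Instruct",
          "messages": [{"role": "user", "content": "Say hello"}],
          "stream": true
        }'

With "stream": true the response arrives as Server-Sent Events (data: chunks) rather than a single JSON body, which is the behavior the commit message refers to.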