feat: add vllm serve script with proper streaming support

2025-11-21 18:08:21 +01:00
parent d21caa56bc
commit 6944e4ebd5

start-vllm.sh (new executable file, 12 lines)

@@ -0,0 +1,12 @@
#!/bin/bash
# Start vLLM server with OpenAI-compatible API
# This uses vLLM's built-in server, which handles streaming correctly
vllm serve Qwen/Qwen2.5-7B-Instruct \
--host 0.0.0.0 \
--port 9000 \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.85 \
--max-model-len 4096 \
--download-dir /workspace/huggingface_cache \
--trust-remote-code
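
A quick smoke test (not part of the commit) for the streaming path: once the server is up, vLLM's OpenAI-compatible /v1/chat/completions endpoint can be queried with stream enabled. This assumes the server is reachable on localhost at the port configured above and that curl is available:

# -N disables curl's output buffering so SSE chunks print as they arrive
curl -N http://localhost:9000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen/Qwen2.5-7B-Instruct",
        "messages": [{"role": "user", "content": "Say hello"}],
        "stream": true
      }'

With "stream": true the server responds with server-sent events (lines prefixed "data: "), which is what the commit message's "proper streaming support" refers to; without -N, curl would buffer the output and the stream would appear to arrive all at once.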