#!/bin/bash
# Start vLLM server with OpenAI-compatible API
# This uses vLLM's built-in server which properly handles streaming
vllm serve Qwen/Qwen2.5-7B-Instruct \
    --host 0.0.0.0 \
    --port 9000 \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.85 \
    --max-model-len 4096 \
    --download-dir /workspace/huggingface_cache \
    --trust-remote-code
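
# Smoke test (a minimal sketch, assuming the server is reachable on
# localhost:9000 and the served model name matches the HF path above):
# once the server reports it is ready, query the OpenAI-compatible
# chat completions endpoint with curl; "stream": true exercises the
# streaming path the comment above refers to.
#
#   curl http://localhost:9000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "Qwen/Qwen2.5-7B-Instruct",
#           "messages": [{"role": "user", "content": "Say hello."}],
#           "stream": true
#         }'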