fix: reduce max_model_len to 20000 to fit in 24GB VRAM

2025-11-23 15:43:37 +01:00
parent 5cfd03f1ef
commit cc0f55df38

@@ -72,8 +72,8 @@ async def startup_event():
     engine_args = AsyncEngineArgs(
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
-        gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=32768,  # Context length (increased for LLMX)
+        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
+        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
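
For context, a minimal sketch of the startup hook this hunk patches, assuming the FastAPI app and the model_name variable defined elsewhere in the file (neither appears in the hunk itself):

from fastapi import FastAPI
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

app = FastAPI()
model_name = "..."  # set elsewhere in the file; not shown in the hunk
engine: AsyncLLMEngine | None = None

@app.on_event("startup")
async def startup_event():
    global engine
    engine_args = AsyncEngineArgs(
        model=model_name,
        tensor_parallel_size=1,  # Single GPU
        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
        dtype="auto",  # Auto-detect dtype
        download_dir="/workspace/huggingface_cache",  # Large disk
        trust_remote_code=True,  # Some models require this
    )
    # Building the engine runs vLLM's startup memory profiling; an
    # oversized max_model_len fails at this point on a 24GB card.
    engine = AsyncLLMEngine.from_engine_args(engine_args)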
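
The saving comes mostly from the KV cache, which vLLM sizes from max_model_len during that profiling step: the cache must be able to hold at least one full-length sequence after the weights are loaded. A rough sizing sketch with illustrative dimensions for a 7B-class GQA model (the hunk does not name the actual model, so every constant below is an assumption):

# All dimensions are illustrative assumptions for a 7B-class model with
# grouped-query attention; the hunk does not name the actual model.
NUM_LAYERS = 32     # transformer layers (assumed)
NUM_KV_HEADS = 8    # KV heads under GQA (assumed)
HEAD_DIM = 128      # per-head dimension (assumed)
BYTES_PER_ELEM = 2  # fp16/bf16 cache entries

def kv_cache_gib(num_tokens: int) -> float:
    """KV-cache size in GiB: one K and one V vector per layer per token."""
    return 2 * NUM_LAYERS * NUM_KV_HEADS * HEAD_DIM * BYTES_PER_ELEM * num_tokens / 1024**3

for ctx in (32768, 20000):
    print(f"max_model_len={ctx}: one full sequence needs ~{kv_cache_gib(ctx):.2f} GiB of KV cache")

Under these assumptions, dropping from 32768 to 20000 frees roughly 1.5 GiB of pre-allocated cache, which together with the bump from 0.85 to 0.90 utilization is what lets the engine start within 24GB.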