fix: reduce max_model_len to 20000 to fit in 24GB VRAM
@@ -72,8 +72,8 @@ async def startup_event():
     engine_args = AsyncEngineArgs(
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
-        gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=32768,  # Context length (increased for LLMX)
+        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
+        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
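For context, the trade-off behind the new value can be sanity-checked with a rough KV-cache estimate: vLLM claims gpu_memory_utilization of total VRAM, and whatever is left after the model weights and activations must hold the KV cache for up to max_model_len tokens. The sketch below is illustrative only; the layer count, KV-head count, head dimension, and dtype defaults are placeholder assumptions, not values taken from this repository's model.

# Rough KV-cache sizing sketch (placeholder architecture numbers, not repo values).
def kv_cache_gib(tokens: int,
                 num_layers: int = 32,    # assumed transformer depth
                 num_kv_heads: int = 32,  # assumed no grouped-query attention
                 head_dim: int = 128,
                 dtype_bytes: int = 2) -> float:  # fp16/bf16
    """GiB needed to hold K and V for `tokens` tokens across all layers."""
    per_token = 2 * num_layers * num_kv_heads * head_dim * dtype_bytes
    return tokens * per_token / 2**30

budget_gib = 24 * 0.90  # VRAM vLLM will claim on a 24 GB card at gpu_memory_utilization=0.90
print(f"vLLM memory budget: ~{budget_gib:.1f} GiB (weights + activations + KV cache)")
for ctx in (32768, 20000):
    print(f"max_model_len={ctx}: ~{kv_cache_gib(ctx):.1f} GiB of KV cache")

With placeholder numbers like these, the 32768-token setting can push the KV cache past what remains after the weights, which is the rationale the commit message gives for dropping to 20000; substitute the real model's layer and head counts to get an actual figure.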