fix: reduce max_model_len to 20000 to fit in 24GB VRAM
@@ -72,8 +72,8 @@ async def startup_event():
     engine_args = AsyncEngineArgs(
         model=model_name,
         tensor_parallel_size=1,  # Single GPU
-        gpu_memory_utilization=0.85,  # Use 85% of GPU memory
-        max_model_len=32768,  # Context length (increased for LLMX)
+        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
+        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
         dtype="auto",  # Auto-detect dtype
         download_dir="/workspace/huggingface_cache",  # Large disk
         trust_remote_code=True,  # Some models require this
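For context, a minimal sketch of the startup hook this hunk touches, assuming a FastAPI app and vLLM's AsyncLLMEngine. The imports, the global `engine`, and where `model_name` comes from are not shown in the diff and are assumptions for illustration only.

import os

from fastapi import FastAPI
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

app = FastAPI()
engine = None
# Hypothetical: the real source of model_name is not visible in this hunk.
model_name = os.environ.get("MODEL_NAME", "")

@app.on_event("startup")
async def startup_event():
    global engine
    engine_args = AsyncEngineArgs(
        model=model_name,
        tensor_parallel_size=1,  # Single GPU
        gpu_memory_utilization=0.90,  # Use 90% of GPU memory
        max_model_len=20000,  # Context length (balanced for 24GB VRAM)
        dtype="auto",  # Auto-detect dtype
        download_dir="/workspace/huggingface_cache",  # Large disk
        trust_remote_code=True,  # Some models require this
    )
    # Build the async engine once at startup; request handlers reuse it.
    engine = AsyncLLMEngine.from_engine_args(engine_args)

The trade-off in this commit: raising gpu_memory_utilization to 0.90 gives the KV cache more headroom, while capping max_model_len at 20000 keeps the cache small enough that the model and cache together fit in 24GB of VRAM.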