feat: add vLLM models configuration file
All checks were successful
Build and Push RunPod Docker Image / build-and-push (push) Successful in 35s
Add models_huggingface_vllm.yaml with three vLLM models:
- Qwen/Qwen2.5-7B-Instruct (14GB) - Advanced multilingual reasoning
- meta-llama/Llama-3.1-8B-Instruct (17GB) - Extended 128K context
- BAAI/bge-large-en-v1.5 (1.3GB) - High-quality text embeddings

Total storage: ~32GB

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
models_huggingface_vllm.yaml: 126 additions (new file)
@@ -0,0 +1,126 @@
# ============================================================================
# vLLM Model Configuration
# ============================================================================
#
# This configuration file defines all available vLLM models for download.
# Models are organized by category: text generation and text embeddings.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (text_generation/embedding)
#
# ============================================================================

# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # TEXT GENERATION MODELS (vLLM)
  # ==========================================================================
  text_generation_models:
    - repo_id: Qwen/Qwen2.5-7B-Instruct
      description: Qwen 2.5 7B Instruct - Advanced multilingual reasoning
      size_gb: 14
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 14
      context_length: 32768
      notes: Latest Qwen 2.5 model with enhanced reasoning capabilities
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

    - repo_id: meta-llama/Llama-3.1-8B-Instruct
      description: Llama 3.1 8B Instruct - Meta's latest instruction-tuned model
      size_gb: 17
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 17
      context_length: 131072
      notes: Extended 128K context length, excellent for long-form tasks
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

  # ==========================================================================
  # TEXT EMBEDDING MODELS (vLLM)
  # ==========================================================================
  embedding_models:
    - repo_id: BAAI/bge-large-en-v1.5
      description: BGE Large English v1.5 - High-quality embeddings for RAG
      size_gb: 1.3
      essential: true
      category: embedding
      type: vllm_embedding
      format: safetensors
      vram_gb: 3
      embedding_dimensions: 1024
      max_tokens: 512
      notes: Top-tier MTEB scores, excellent for semantic search and RAG applications
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================

storage_requirements:
  text_generation: 31   # Qwen 2.5 7B + Llama 3.1 8B
  embedding: 1.3        # BGE Large
  total: 32.3           # Total essential storage

vram_requirements:
  # For a 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Qwen 2.5 7B Only
      models: [Qwen 2.5 7B Instruct]
      vram_used: 14
      remaining: 10

    - name: Llama 3.1 8B Only
      models: [Llama 3.1 8B Instruct]
      vram_used: 17
      remaining: 7

    - name: BGE Large Only
      models: [BGE Large]
      vram_used: 3
      remaining: 21

    - name: Qwen + BGE Embedding
      models: [Qwen 2.5 7B, BGE Large]
      vram_used: 17
      remaining: 7

    - name: Llama + BGE Embedding
      models: [Llama 3.1 8B, BGE Large]
      vram_used: 20
      remaining: 4

# ============================================================================
# METADATA
# ============================================================================

metadata:
  version: 1.0.0
  last_updated: 2025-11-25
  compatible_with:
    - vLLM >= 0.6.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  repository: https://github.com/yourusername/runpod
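
A downloader consuming this file might look like the following minimal Python sketch. Only the YAML keys (settings, model_categories, repo_id, essential, retry_attempts, cache_dir) come from the config above; the function name and error handling are illustrative assumptions, not part of the commit.

import yaml
from huggingface_hub import snapshot_download

def download_all(config_path="models_huggingface_vllm.yaml"):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    settings = cfg["settings"]
    cats = cfg["model_categories"]
    entries = cats["text_generation_models"] + cats["embedding_models"]

    for entry in entries:
        if not entry.get("essential", False):
            continue  # skip optional models
        for attempt in range(1, settings["retry_attempts"] + 1):
            try:
                # Fetch the whole repo snapshot into the shared cache.
                # Note: meta-llama repos are gated on HuggingFace, so an
                # authenticated token (e.g. HF_TOKEN) must be set first.
                snapshot_download(repo_id=entry["repo_id"],
                                  cache_dir=settings["cache_dir"])
                break
            except Exception as exc:
                print(f"{entry['repo_id']}: attempt {attempt} failed: {exc}")

if __name__ == "__main__":
    download_all()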
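
Once downloaded, a text-generation entry maps onto vLLM's offline LLM API roughly as follows. The model, max_model_len, and download_dir values mirror repo_id, context_length, and settings.cache_dir from the config; the sampling parameters and prompt are illustrative.

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",              # repo_id
    max_model_len=32768,                           # context_length
    download_dir="/workspace/huggingface_cache",   # settings.cache_dir
)
params = SamplingParams(temperature=0.7, max_tokens=256)
outputs = llm.generate(["Summarize why a paged KV cache helps throughput."], params)
print(outputs[0].outputs[0].text)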
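
The embedding entry is served the same way, though vLLM's pooling API has shifted across releases: older 0.6.x builds use llm.encode() on an embedding model, while more recent versions accept task="embed" and expose llm.embed(). The exact calls below are therefore version-dependent assumptions.

from vllm import LLM

# Assumes a recent vLLM release; on older 0.6.x the equivalent is
# llm.encode() without the task argument.
llm = LLM(model="BAAI/bge-large-en-v1.5", task="embed")
outputs = llm.embed(["a search query", "a passage to index"])
vector = outputs[0].outputs.embedding  # 1024 floats, matching embedding_dimensions
print(len(vector))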
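
The simultaneous_loadable table is maintained by hand; a short sketch like this one can recompute it from the model entries instead, assuming the same 24GB budget named in the config's comment.

import itertools
import yaml

GPU_VRAM_GB = 24  # RTX 4090, per the vram_requirements comment

with open("models_huggingface_vllm.yaml") as f:
    cfg = yaml.safe_load(f)

cats = cfg["model_categories"]
models = cats["text_generation_models"] + cats["embedding_models"]

# Enumerate every non-empty combination and keep the ones that fit.
for r in range(1, len(models) + 1):
    for combo in itertools.combinations(models, r):
        used = sum(m["vram_gb"] for m in combo)
        if used <= GPU_VRAM_GB:
            names = " + ".join(m["repo_id"] for m in combo)
            print(f"{names}: {used} GB used, {GPU_VRAM_GB - used} GB free")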