diff --git a/models_huggingface_vllm.yaml b/models_huggingface_vllm.yaml
new file mode 100644
index 0000000..9dcf510
--- /dev/null
+++ b/models_huggingface_vllm.yaml
@@ -0,0 +1,126 @@
+# ============================================================================
+# vLLM Model Configuration
+# ============================================================================
+#
+# This configuration file defines all available vLLM models for download.
+# Models are organized by category: text generation and text embeddings.
+#
+# Each model entry contains:
+#   - repo_id: HuggingFace repository identifier
+#   - description: Human-readable description
+#   - size_gb: Approximate size in gigabytes
+#   - essential: Whether this is an essential model (true/false)
+#   - category: Model category (text_generation/embedding)
+#
+# ============================================================================
+
+# Global settings
+settings:
+  cache_dir: /workspace/huggingface_cache
+  parallel_downloads: 1
+  retry_attempts: 3
+  timeout_seconds: 3600
+
+# Model categories
+model_categories:
+  # ==========================================================================
+  # TEXT GENERATION MODELS (vLLM)
+  # ==========================================================================
+  text_generation_models:
+    - repo_id: Qwen/Qwen2.5-7B-Instruct
+      description: Qwen 2.5 7B Instruct - Advanced multilingual reasoning
+      size_gb: 14
+      essential: true
+      category: text_generation
+      type: vllm
+      format: safetensors
+      vram_gb: 14
+      context_length: 32768
+      notes: Latest Qwen 2.5 model with enhanced reasoning capabilities
+      files:
+        - source: "model.safetensors"
+          dest: "model.safetensors"
+
+    - repo_id: meta-llama/Llama-3.1-8B-Instruct
+      description: Llama 3.1 8B Instruct - Meta's latest instruction-tuned model
+      size_gb: 17
+      essential: true
+      category: text_generation
+      type: vllm
+      format: safetensors
+      vram_gb: 17
+      context_length: 131072
+      notes: Extended 128K context length, excellent for long-form tasks
+      files:
+        - source: "model.safetensors"
+          dest: "model.safetensors"
+
+  # ==========================================================================
+  # TEXT EMBEDDING MODELS (vLLM)
+  # ==========================================================================
+  embedding_models:
+    - repo_id: BAAI/bge-large-en-v1.5
+      description: BGE Large English v1.5 - High-quality embeddings for RAG
+      size_gb: 1.3
+      essential: true
+      category: embedding
+      type: vllm_embedding
+      format: safetensors
+      vram_gb: 3
+      embedding_dimensions: 1024
+      max_tokens: 512
+      notes: Top-tier MTEB scores, excellent for semantic search and RAG applications
+      files:
+        - source: "model.safetensors"
+          dest: "model.safetensors"
+
+# ============================================================================
+# STORAGE & VRAM SUMMARIES
+# ============================================================================
+
+storage_requirements:
+  text_generation: 31  # Qwen 2.5 7B + Llama 3.1 8B
+  embedding: 1.3  # BGE Large
+  total: 32.3  # Total essential storage
+
+vram_requirements:
+  # For 24GB GPU (RTX 4090)
+  simultaneous_loadable:
+    - name: Qwen 2.5 7B Only
+      models: [Qwen 2.5 7B Instruct]
+      vram_used: 14
+      remaining: 10
+
+    - name: Llama 3.1 8B Only
+      models: [Llama 3.1 8B Instruct]
+      vram_used: 17
+      remaining: 7
+
+    - name: BGE Large Only
+      models: [BGE Large]
+      vram_used: 3
+      remaining: 21
+
+    - name: Qwen + BGE Embedding
+      models: [Qwen 2.5 7B, BGE Large]
+      vram_used: 17
+      remaining: 7
+
+    - name: Llama + BGE Embedding
+      models: [Llama 3.1 8B, BGE Large]
+      vram_used: 20
+      remaining: 4
+
+# ============================================================================
+# METADATA
+# ============================================================================
+
+metadata:
+  version: "1.0.0"  # quoted: keep as a string, not a version-parsed scalar
+  last_updated: "2025-11-25"  # quoted: unquoted ISO dates load as date objects
+  compatible_with:
+    - vLLM >= 0.6.0
+    - Python >= 3.10
+    - HuggingFace Hub >= 0.20.0
+  maintainer: Valknar
+  repository: https://github.com/yourusername/runpod