feat: add vLLM models configuration file
All checks were successful
Build and Push RunPod Docker Image / build-and-push (push) Successful in 35s
Add models_huggingface_vllm.yaml with three vLLM models:
- Qwen/Qwen2.5-7B-Instruct (14GB) - Advanced multilingual reasoning
- meta-llama/Llama-3.1-8B-Instruct (17GB) - Extended 128K context
- BAAI/bge-large-en-v1.5 (1.3GB) - High-quality text embeddings

Total storage: ~32GB

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
models_huggingface_vllm.yaml: 126 additions (new file)
@@ -0,0 +1,126 @@
# ============================================================================
# vLLM Model Configuration
# ============================================================================
#
# This configuration file defines all available vLLM models for download.
# Models are organized by category: text generation and text embeddings.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (text_generation/embedding)
#
# ============================================================================

# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # TEXT GENERATION MODELS (vLLM)
  # ==========================================================================
  text_generation_models:
    - repo_id: Qwen/Qwen2.5-7B-Instruct
      description: Qwen 2.5 7B Instruct - Advanced multilingual reasoning
      size_gb: 14
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 14
      context_length: 32768
      notes: Latest Qwen 2.5 model with enhanced reasoning capabilities
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

    - repo_id: meta-llama/Llama-3.1-8B-Instruct
      description: Llama 3.1 8B Instruct - Meta's latest instruction-tuned model
      size_gb: 17
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 17
      context_length: 131072
      notes: Extended 128K context length, excellent for long-form tasks
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

  # ==========================================================================
  # TEXT EMBEDDING MODELS (vLLM)
  # ==========================================================================
  embedding_models:
    - repo_id: BAAI/bge-large-en-v1.5
      description: BGE Large English v1.5 - High-quality embeddings for RAG
      size_gb: 1.3
      essential: true
      category: embedding
      type: vllm_embedding
      format: safetensors
      vram_gb: 3
      embedding_dimensions: 1024
      max_tokens: 512
      notes: Top-tier MTEB scores, excellent for semantic search and RAG applications
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================

storage_requirements:
  text_generation: 31   # Qwen 2.5 7B + Llama 3.1 8B
  embedding: 1.3        # BGE Large
  total: 32.3           # Total essential storage

vram_requirements:
  # For a 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Qwen 2.5 7B Only
      models: [Qwen 2.5 7B Instruct]
      vram_used: 14
      remaining: 10

    - name: Llama 3.1 8B Only
      models: [Llama 3.1 8B Instruct]
      vram_used: 17
      remaining: 7

    - name: BGE Large Only
      models: [BGE Large]
      vram_used: 3
      remaining: 21

    - name: Qwen + BGE Embedding
      models: [Qwen 2.5 7B, BGE Large]
      vram_used: 17
      remaining: 7

    - name: Llama + BGE Embedding
      models: [Llama 3.1 8B, BGE Large]
      vram_used: 20
      remaining: 4

# ============================================================================
# METADATA
# ============================================================================

metadata:
  version: 1.0.0
  last_updated: 2025-11-25
  compatible_with:
    - vLLM >= 0.6.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  repository: https://github.com/yourusername/runpod
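
A downloader consuming this file might look like the following minimal Python sketch. Only the YAML keys (settings, model_categories, repo_id, essential, retry_attempts, cache_dir) come from the config above; the function name and error handling are illustrative assumptions, not part of the commit.

import yaml
from huggingface_hub import snapshot_download

def download_all(config_path="models_huggingface_vllm.yaml"):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    settings = cfg["settings"]
    cats = cfg["model_categories"]
    entries = cats["text_generation_models"] + cats["embedding_models"]

    for entry in entries:
        if not entry.get("essential", False):
            continue  # skip optional models
        for attempt in range(1, settings["retry_attempts"] + 1):
            try:
                # Fetch the whole repo snapshot into the shared cache.
                # Note: meta-llama repos are gated on HuggingFace, so an
                # authenticated token (e.g. HF_TOKEN) must be set first.
                snapshot_download(repo_id=entry["repo_id"],
                                  cache_dir=settings["cache_dir"])
                break
            except Exception as exc:
                print(f"{entry['repo_id']}: attempt {attempt} failed: {exc}")

if __name__ == "__main__":
    download_all()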
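
Once downloaded, a text-generation entry maps onto vLLM's offline LLM API roughly as follows. The model, max_model_len, and download_dir values mirror repo_id, context_length, and settings.cache_dir from the config; the sampling parameters and prompt are illustrative.

from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",              # repo_id
    max_model_len=32768,                           # context_length
    download_dir="/workspace/huggingface_cache",   # settings.cache_dir
)
params = SamplingParams(temperature=0.7, max_tokens=256)
outputs = llm.generate(["Summarize why a paged KV cache helps throughput."], params)
print(outputs[0].outputs[0].text)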
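
The embedding entry is served the same way, though vLLM's pooling API has shifted across releases: older 0.6.x builds use llm.encode() on an embedding model, while more recent versions accept task="embed" and expose llm.embed(). The exact calls below are therefore version-dependent assumptions.

from vllm import LLM

# Assumes a recent vLLM release; on older 0.6.x the equivalent is
# llm.encode() without the task argument.
llm = LLM(model="BAAI/bge-large-en-v1.5", task="embed")
outputs = llm.embed(["a search query", "a passage to index"])
vector = outputs[0].outputs.embedding  # 1024 floats, matching embedding_dimensions
print(len(vector))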
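
The simultaneous_loadable table is maintained by hand; a short sketch like this one can recompute it from the model entries instead, assuming the same 24GB budget named in the config's comment.

import itertools
import yaml

GPU_VRAM_GB = 24  # RTX 4090, per the vram_requirements comment

with open("models_huggingface_vllm.yaml") as f:
    cfg = yaml.safe_load(f)

cats = cfg["model_categories"]
models = cats["text_generation_models"] + cats["embedding_models"]

# Enumerate every non-empty combination and keep the ones that fit.
for r in range(1, len(models) + 1):
    for combo in itertools.combinations(models, r):
        used = sum(m["vram_gb"] for m in combo)
        if used <= GPU_VRAM_GB:
            names = " + ".join(m["repo_id"] for m in combo)
            print(f"{names}: {used} GB used, {GPU_VRAM_GB - used} GB free")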