runpod/models_huggingface_vllm.yaml
Sebastian Krüger e12a8add61
feat: add vLLM models configuration file
Add models_huggingface_vllm.yaml with three vLLM models:
- Qwen/Qwen2.5-7B-Instruct (14GB) - Advanced multilingual reasoning
- meta-llama/Llama-3.1-8B-Instruct (17GB) - Extended 128K context
- BAAI/bge-large-en-v1.5 (1.3GB) - High-quality text embeddings

Total storage: ~32GB

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 06:12:18 +01:00


# ============================================================================
# vLLM Model Configuration
# ============================================================================
#
# This configuration file defines all available vLLM models for download.
# Models are organized by category: text generation and text embeddings.
#
# Each model entry contains:
# - repo_id: HuggingFace repository identifier
# - description: Human-readable description
# - size_gb: Approximate size in gigabytes
# - essential: Whether this is an essential model (true/false)
# - category: Model category (text_generation/embedding)
#
# ============================================================================
# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # TEXT GENERATION MODELS (vLLM)
  # ==========================================================================
  text_generation_models:
    - repo_id: Qwen/Qwen2.5-7B-Instruct
      description: Qwen 2.5 7B Instruct - Advanced multilingual reasoning
      size_gb: 14
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 14
      context_length: 32768
      notes: Latest Qwen 2.5 model with enhanced reasoning capabilities
      # NOTE: 7B-class checkpoints on the Hub are usually sharded
      # (model-00001-of-0000N.safetensors); adjust this list to match
      # the actual repository contents.
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"
    - repo_id: meta-llama/Llama-3.1-8B-Instruct
      description: Llama 3.1 8B Instruct - Meta's latest instruction-tuned model
      size_gb: 17
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 17
      context_length: 131072
      notes: Extended 128K context length, excellent for long-form tasks
      # NOTE: as above, this repository ships sharded safetensors; verify
      # the file list before relying on it.
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

  # ==========================================================================
  # TEXT EMBEDDING MODELS (vLLM)
  # ==========================================================================
  embedding_models:
    - repo_id: BAAI/bge-large-en-v1.5
      description: BGE Large English v1.5 - High-quality embeddings for RAG
      size_gb: 1.3
      essential: true
      category: embedding
      type: vllm_embedding
      format: safetensors
      vram_gb: 3
      embedding_dimensions: 1024
      max_tokens: 512
      notes: Top-tier MTEB scores, excellent for semantic search and RAG applications
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================
storage_requirements:
  text_generation: 31  # Qwen 2.5 7B + Llama 3.1 8B
  embedding: 1.3       # BGE Large
  total: 32.3          # Total essential storage

vram_requirements:
  # For a 24 GB GPU (e.g. RTX 4090)
  simultaneous_loadable:
    - name: Qwen 2.5 7B Only
      models: [Qwen 2.5 7B Instruct]
      vram_used: 14
      remaining: 10
    - name: Llama 3.1 8B Only
      models: [Llama 3.1 8B Instruct]
      vram_used: 17
      remaining: 7
    - name: BGE Large Only
      models: [BGE Large]
      vram_used: 3
      remaining: 21
    - name: Qwen + BGE Embedding
      models: [Qwen 2.5 7B, BGE Large]
      vram_used: 17
      remaining: 7
    - name: Llama + BGE Embedding
      models: [Llama 3.1 8B, BGE Large]
      vram_used: 20
      remaining: 4

# ============================================================================
# METADATA
# ============================================================================
metadata:
  version: 1.0.0
  last_updated: 2025-11-25
  compatible_with:
    - vLLM >= 0.6.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  repository: https://github.com/yourusername/runpod
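
A downloader script that consumes this file might flatten the categories, keep only `essential` entries, and cross-check the computed size against the declared `storage_requirements.total`. The sketch below is a hypothetical consumer, not part of the repository; in practice you would parse the file with `yaml.safe_load` (PyYAML) and hand each `repo_id` to `huggingface_hub.snapshot_download`, but here the parsed structure is inlined as plain dicts so the example is self-contained:

```python
# Minimal sketch of consuming models_huggingface_vllm.yaml. The `config`
# dict mirrors the YAML above; in a real script you would obtain it via
# yaml.safe_load(open(path)) and then download each repo_id with
# huggingface_hub.snapshot_download(repo_id, cache_dir=...).
config = {
    "settings": {"cache_dir": "/workspace/huggingface_cache"},
    "model_categories": {
        "text_generation_models": [
            {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "size_gb": 14, "essential": True},
            {"repo_id": "meta-llama/Llama-3.1-8B-Instruct", "size_gb": 17, "essential": True},
        ],
        "embedding_models": [
            {"repo_id": "BAAI/bge-large-en-v1.5", "size_gb": 1.3, "essential": True},
        ],
    },
    "storage_requirements": {"total": 32.3},
}

def essential_models(cfg):
    """Flatten all categories and keep only models marked essential."""
    return [
        model
        for models in cfg["model_categories"].values()
        for model in models
        if model.get("essential")
    ]

models = essential_models(config)
total_gb = sum(m["size_gb"] for m in models)

# Cross-check the computed size against the declared summary before
# committing ~32 GB of disk to downloads.
assert abs(total_gb - config["storage_requirements"]["total"]) < 0.1
for m in models:
    print(f"would download {m['repo_id']} ({m['size_gb']} GB)")
```

The same validation pass is a cheap place to catch config drift: if a model is added to a category without updating `storage_requirements`, the assertion fails before any download starts.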