All checks were successful
Build and Push RunPod Docker Image / build-and-push (push) Successful in 35s
Add models_huggingface_vllm.yaml with three vLLM models: - Qwen/Qwen2.5-7B-Instruct (14GB) - Advanced multilingual reasoning - meta-llama/Llama-3.1-8B-Instruct (17GB) - Extended 128K context - BAAI/bge-large-en-v1.5 (1.3GB) - High-quality text embeddings Total storage: ~32GB 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
127 lines
3.9 KiB
YAML
---
# ============================================================================
# vLLM Model Configuration
# ============================================================================
#
# This configuration file defines all available vLLM models for download.
# Models are organized by category: text generation and text embeddings.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (text_generation/embedding)
#
# ============================================================================

# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # TEXT GENERATION MODELS (vLLM)
  # ==========================================================================
  text_generation_models:
    - repo_id: Qwen/Qwen2.5-7B-Instruct
      description: Qwen 2.5 7B Instruct - Advanced multilingual reasoning
      size_gb: 14
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 14
      context_length: 32768
      notes: Latest Qwen 2.5 model with enhanced reasoning capabilities
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

    - repo_id: meta-llama/Llama-3.1-8B-Instruct
      description: Llama 3.1 8B Instruct - Meta's latest instruction-tuned model
      size_gb: 17
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 17
      context_length: 131072
      notes: Extended 128K context length, excellent for long-form tasks
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

  # ==========================================================================
  # TEXT EMBEDDING MODELS (vLLM)
  # ==========================================================================
  embedding_models:
    - repo_id: BAAI/bge-large-en-v1.5
      description: BGE Large English v1.5 - High-quality embeddings for RAG
      size_gb: 1.3
      essential: true
      category: embedding
      type: vllm_embedding
      format: safetensors
      vram_gb: 3
      embedding_dimensions: 1024
      max_tokens: 512
      notes: Top-tier MTEB scores, excellent for semantic search and RAG applications
      files:
        - source: "model.safetensors"
          dest: "model.safetensors"

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================

storage_requirements:
  text_generation: 31  # Qwen 2.5 7B + Llama 3.1 8B
  embedding: 1.3  # BGE Large
  total: 32.3  # Total essential storage

vram_requirements:
  # For 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Qwen 2.5 7B Only
      models:
        - "Qwen 2.5 7B Instruct"
      vram_used: 14
      remaining: 10

    - name: Llama 3.1 8B Only
      models:
        - "Llama 3.1 8B Instruct"
      vram_used: 17
      remaining: 7

    - name: BGE Large Only
      models:
        - "BGE Large"
      vram_used: 3
      remaining: 21

    - name: Qwen + BGE Embedding
      models:
        - "Qwen 2.5 7B"
        - "BGE Large"
      vram_used: 17
      remaining: 7

    - name: Llama + BGE Embedding
      models:
        - "Llama 3.1 8B"
        - "BGE Large"
      vram_used: 20
      remaining: 4

# ============================================================================
# METADATA
# ============================================================================

metadata:
  # Quoted to stay a string — an unquoted 1.0.0 is already a string, but
  # quoting guards against future two-part versions (1.0 would become a float).
  version: "1.0.0"
  # Quoted: unquoted ISO dates are parsed as date objects by YAML 1.1 loaders.
  last_updated: "2025-11-25"
  compatible_with:
    - "vLLM >= 0.6.0"
    - "Python >= 3.10"
    - "HuggingFace Hub >= 0.20.0"
  maintainer: Valknar
  repository: https://github.com/yourusername/runpod