---
# ============================================================================
# ComfyUI Model Configuration
# ============================================================================
#
# This configuration file defines all available ComfyUI models for download.
# Models are organized by category: image, video, audio, and support models.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (image/video/audio/support)
#   - format: Weight precision of the download (fp16/fp32)
#   - vram_gb: Approximate VRAM needed, in gigabytes
#   - notes: Free-form remarks
#
# Category-specific optional fields:
#   - frames, resolution: video models only
#   - duration_seconds: audio models only
#
# The summary sections further down (storage_requirements and
# installation_profiles) are hand-maintained sums of the size_gb values.
# Update them whenever a model entry is added, removed, or resized.
#
# ============================================================================

# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # IMAGE GENERATION MODELS
  # ==========================================================================
  image_models:
    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX.1 Schnell - Fast 4-step inference
      size_gb: 23
      essential: true
      category: image
      format: fp16
      vram_gb: 23
      notes: Industry-leading image generation quality

    - repo_id: black-forest-labs/FLUX.1-dev
      description: FLUX.1 Dev - Balanced quality/speed
      size_gb: 23
      essential: false
      category: image
      format: fp16
      vram_gb: 23
      notes: Development version with enhanced features

    - repo_id: stabilityai/stable-diffusion-xl-base-1.0
      description: SDXL Base 1.0 - Industry standard
      size_gb: 7
      essential: true
      category: image
      format: fp16
      vram_gb: 12
      notes: Most widely used Stable Diffusion model

    - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
      description: SDXL Refiner 1.0 - Enhances base output
      size_gb: 6
      essential: false
      category: image
      format: fp16
      vram_gb: 12
      notes: Use after SDXL base for improved details

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: SD 3.5 Large - Latest Stability AI
      size_gb: 18
      essential: false
      category: image
      format: fp16
      vram_gb: 20
      notes: Newest generation Stable Diffusion

  # ==========================================================================
  # VIDEO GENERATION MODELS
  # ==========================================================================
  video_models:
    - repo_id: THUDM/CogVideoX-5b
      description: CogVideoX-5B - Professional text-to-video
      size_gb: 20
      essential: true
      category: video
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: State-of-the-art text-to-video generation

    - repo_id: stabilityai/stable-video-diffusion-img2vid
      description: SVD - 14 frame image-to-video
      size_gb: 8
      essential: true
      category: video
      format: fp16
      vram_gb: 20
      frames: 14
      resolution: 576x1024
      notes: Convert images to short video clips

    - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
      description: SVD-XT - 25 frame image-to-video
      size_gb: 8
      essential: false
      category: video
      format: fp16
      vram_gb: 20
      frames: 25
      resolution: 576x1024
      notes: Extended frame count version

  # ==========================================================================
  # AUDIO GENERATION MODELS
  # ==========================================================================
  audio_models:
    - repo_id: facebook/musicgen-small
      description: MusicGen Small - Fast generation
      size_gb: 3
      essential: false
      category: audio
      format: fp32
      vram_gb: 4
      duration_seconds: 30
      notes: Fastest music generation, lower quality

    - repo_id: facebook/musicgen-medium
      description: MusicGen Medium - Balanced quality
      size_gb: 11
      essential: true
      category: audio
      format: fp32
      vram_gb: 8
      duration_seconds: 30
      notes: Best balance of speed and quality

    - repo_id: facebook/musicgen-large
      description: MusicGen Large - Highest quality
      size_gb: 22
      essential: false
      category: audio
      format: fp32
      vram_gb: 16
      duration_seconds: 30
      notes: Best quality, slower generation

  # ==========================================================================
  # SUPPORT MODELS (CLIP, IP-Adapter, etc.)
  # ==========================================================================
  support_models:
    # NOTE(review): the description says "CLIP H" but the repo name is
    # clip-vit-large (ViT-L/14). Either the label or the repo_id looks
    # wrong - confirm which encoder the SD 1.5 IP-Adapter workflow expects.
    - repo_id: openai/clip-vit-large-patch14
      description: CLIP H - For SD 1.5 IP-Adapter
      size_gb: 2
      essential: true
      category: support
      format: fp32
      vram_gb: 2
      notes: Text-image understanding model

    - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
      description: CLIP G - For SDXL IP-Adapter
      size_gb: 7
      essential: true
      category: support
      format: fp32
      vram_gb: 4
      notes: Larger CLIP model for SDXL

    - repo_id: google/siglip-so400m-patch14-384
      description: SigLIP - For FLUX models
      size_gb: 2
      essential: true
      category: support
      format: fp32
      vram_gb: 2
      notes: Advanced image-text alignment

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================
# All figures are in gigabytes and are sums of the size_gb values above.
storage_requirements:
  essential_only:
    image: 30  # FLUX Schnell + SDXL Base (23 + 7)
    video: 28  # CogVideoX + SVD (20 + 8)
    audio: 11  # MusicGen Medium
    support: 11  # All 3 CLIP models (2 + 7 + 2)
    total: 80  # Total essential storage
  all_models:
    image: 77  # All image models (23 + 23 + 7 + 6 + 18)
    video: 36  # All video models (20 + 8 + 8)
    audio: 36  # All audio models (3 + 11 + 22)
    support: 11  # All support models
    total: 160  # Total with optional models (77 + 36 + 36 + 11)

vram_requirements:
  # For 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Image Focus - FLUX FP16
      models: [FLUX.1 Schnell]
      vram_used: 23
      remaining: 1

    - name: Image Focus - FLUX FP8 + SDXL
      models: [FLUX.1 Schnell FP8, SDXL Base]
      vram_used: 24
      remaining: 0

    - name: Video Generation
      models: [CogVideoX-5B optimized, SDXL]
      vram_used: 24
      remaining: 0

    - name: Multi-Modal
      models: [SDXL, MusicGen Medium]
      vram_used: 20
      remaining: 4

# ============================================================================
# INSTALLATION PROFILES
# ============================================================================
# storage_gb is the sum of the selected categories' entries in
# storage_requirements (essential_only figures when essential_only is true).
installation_profiles:
  minimal:
    description: Minimal setup for testing
    categories: [support_models]
    storage_gb: 11
    estimated_time: 5-10 minutes

  essential:
    description: Essential models only (~80GB)
    categories: [image_models, video_models, audio_models, support_models]
    essential_only: true
    storage_gb: 80
    estimated_time: 1-2 hours

  image_focused:
    description: All image generation models
    categories: [image_models, support_models]
    storage_gb: 88  # 77 (all image) + 11 (support)
    estimated_time: 45-90 minutes

  # NOTE(review): the description says "All video generation models" but
  # essential_only is true, and storage_gb matches the essential subset
  # (28 video + 30 image + 11 support). Confirm which one is intended.
  video_focused:
    description: All video generation models
    categories: [video_models, image_models, support_models]
    essential_only: true
    storage_gb: 69
    estimated_time: 1-2 hours

  complete:
    description: All models (including optional)
    categories: [image_models, video_models, audio_models, support_models]
    storage_gb: 160  # Matches storage_requirements.all_models.total
    estimated_time: 2-4 hours

# ============================================================================
# METADATA
# ============================================================================
metadata:
  version: '1.0.0'
  # Quoted so loaders keep this a string instead of converting it to a date.
  last_updated: '2025-11-21'
  compatible_with:
    - 'ComfyUI >= 0.1.0'
    - 'Python >= 3.10'
    - 'HuggingFace Hub >= 0.20.0'
  maintainer: Valknar
  # TODO: "yourusername" looks like a placeholder - replace with the real
  # repository owner.
  repository: https://github.com/yourusername/runpod