---
# ============================================================================
# ComfyUI Model Configuration
# ============================================================================
#
# This configuration file defines all available ComfyUI models for download.
# Models are organized by category: image, video, audio, and support models.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (image/video/audio/support)
#
# ============================================================================

# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # IMAGE GENERATION MODELS
  # ==========================================================================
  image_models:
    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX.1 Schnell - Fast 4-step inference
      size_gb: 23
      essential: true
      category: image
      type: unet
      format: fp16
      vram_gb: 23
      notes: Industry-leading image generation quality
      files:
        - source: "flux1-schnell.safetensors"
          dest: "flux1-schnell.safetensors"

    - repo_id: black-forest-labs/FLUX.1-dev
      description: FLUX.1 Dev - Balanced quality/speed
      size_gb: 23
      essential: false
      category: image
      type: unet
      format: fp16
      vram_gb: 23
      notes: Development version with enhanced features
      files:
        - source: "flux1-dev.safetensors"
          dest: "flux1-dev.safetensors"

    - repo_id: runwayml/stable-diffusion-v1-5
      description: SD 1.5 - For AnimateDiff
      size_gb: 4
      essential: true
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 8
      notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
      files:
        - source: "v1-5-pruned-emaonly.safetensors"
          dest: "v1-5-pruned-emaonly.safetensors"

    - repo_id: stabilityai/stable-diffusion-xl-base-1.0
      description: SDXL Base 1.0 - Industry standard
      size_gb: 7
      essential: true
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Most widely used Stable Diffusion model
      files:
        - source: "sd_xl_base_1.0.safetensors"
          dest: "sd_xl_base_1.0.safetensors"

    - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
      description: SDXL Refiner 1.0 - Enhances base output
      size_gb: 6
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Use after SDXL base for improved details
      files:
        - source: "sd_xl_refiner_1.0.safetensors"
          dest: "sd_xl_refiner_1.0.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: SD 3.5 Large - Latest Stability AI
      size_gb: 18
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 20
      notes: Newest generation Stable Diffusion
      files:
        - source: "sd3.5_large.safetensors"
          dest: "sd3.5_large.safetensors"

  # ==========================================================================
  # VIDEO GENERATION MODELS
  # ==========================================================================
  video_models:
    - repo_id: THUDM/CogVideoX-5b
      description: CogVideoX-5B - Professional text-to-video
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
      files:
        - source: "transformer/diffusion_pytorch_model.safetensors"
          dest: "cogvideox-5b-transformer.safetensors"

    - repo_id: THUDM/CogVideoX-5b-I2V
      description: CogVideoX-5B-I2V - Image-to-video generation
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
      files:
        - source: "transformer/diffusion_pytorch_model.safetensors"
          dest: "cogvideox-5b-i2v-transformer.safetensors"

    - repo_id: stabilityai/stable-video-diffusion-img2vid
      description: SVD - 14 frame image-to-video
      size_gb: 8
      essential: true
      category: video
      type: checkpoints
      format: fp16
      vram_gb: 20
      frames: 14
      resolution: 576x1024
      notes: Convert images to short video clips
      files:
        - source: "svd.safetensors"
          dest: "svd.safetensors"

    - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
      description: SVD-XT - 25 frame image-to-video
      size_gb: 8
      essential: false
      category: video
      type: checkpoints
      format: fp16
      vram_gb: 20
      frames: 25
      resolution: 576x1024
      notes: Extended frame count version
      files:
        - source: "svd_xt.safetensors"
          dest: "svd_xt.safetensors"

  # ==========================================================================
  # AUDIO GENERATION MODELS
  # ==========================================================================
  audio_models:
    - repo_id: facebook/musicgen-small
      description: MusicGen Small - Fast generation
      size_gb: 3
      essential: false
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 4
      duration_seconds: 30
      notes: Fastest music generation, lower quality
      files:
        - source: "pytorch_model.bin"
          dest: "musicgen-small-pytorch_model.bin"

    - repo_id: facebook/musicgen-medium
      description: MusicGen Medium - Balanced quality
      size_gb: 11
      essential: true
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 8
      duration_seconds: 30
      notes: Best balance of speed and quality
      files:
        - source: "pytorch_model.bin"
          dest: "musicgen-medium-pytorch_model.bin"

    - repo_id: facebook/musicgen-large
      description: MusicGen Large - Highest quality
      size_gb: 22
      essential: false
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 16
      duration_seconds: 30
      notes: Best quality, slower generation
      files:
        - source: "pytorch_model-00001-of-00002.bin"
          dest: "musicgen-large-pytorch_model-00001-of-00002.bin"
        - source: "pytorch_model-00002-of-00002.bin"
          dest: "musicgen-large-pytorch_model-00002-of-00002.bin"
        - source: "pytorch_model.bin.index.json"
          dest: "musicgen-large-pytorch_model.bin.index.json"

    # ACE Step v1 3.5B - State-of-the-art music generation
    - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
      description: ACE Step v1 3.5B - Fast coherent music generation with 19-language support
      size_gb: 7.7
      essential: true
      category: audio
      type: checkpoints
      format: safetensors
      vram_gb: 16
      duration_seconds: 240
      notes: 15x faster than LLM baselines, superior structural coherence, voice cloning, 19-language lyrics
      files:
        - source: "all_in_one/ace_step_v1_3.5b.safetensors"
          dest: "ace_step_v1_3.5b.safetensors"

    # ACE Step Chinese RAP LoRA (optional)
    - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
      description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop genre
      size_gb: 0.3
      essential: false
      category: audio
      type: loras
      format: safetensors
      notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
      files:
        - source: "pytorch_lora_weights.safetensors"
          dest: "ace-step-chinese-rap-lora.safetensors"

  # ==========================================================================
  # SUPPORT MODELS (CLIP, IP-Adapter, etc.)
  # ==========================================================================
  support_models:
    # NOTE(review): dest uses the laion CLIP-ViT-H name while repo_id is
    # openai/clip-vit-large-patch14 — verify repo/dest pairing is intended.
    - repo_id: openai/clip-vit-large-patch14
      description: CLIP H - For SD 1.5 IP-Adapter
      size_gb: 2
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 2
      notes: Text-image understanding model for IP-Adapter
      files:
        - source: "model.safetensors"
          dest: "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors"

    - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
      description: CLIP G - For SDXL IP-Adapter
      size_gb: 7
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 4
      notes: Larger CLIP model for SDXL IP-Adapter
      files:
        - source: "open_clip_model.safetensors"
          dest: "CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors"

    - repo_id: google/siglip-so400m-patch14-384
      description: SigLIP - For FLUX models
      size_gb: 2
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 2
      notes: Advanced image-text alignment
      files:
        - source: "model.safetensors"
          dest: "siglip-so400m-patch14-384.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L and T5-XXL - For FLUX text encoding
      size_gb: 10
      essential: true
      category: support
      type: clip
      format: fp16
      vram_gb: 4
      notes: CLIP text encoders required for FLUX models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"

    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX VAE - Autoencoder for FLUX models
      size_gb: 0.5
      essential: true
      category: support
      type: vae
      format: safetensors
      vram_gb: 1
      notes: VAE autoencoder required for FLUX image decoding
      files:
        - source: "ae.safetensors"
          dest: "ae.safetensors"

    - repo_id: ai-forever/Real-ESRGAN
      description: RealESRGAN x2 - 2x upscaling model
      size_gb: 0.06
      essential: true
      category: support
      type: upscale_models
      format: pth
      vram_gb: 2
      notes: Fast 2x upscaling model for general purpose enhancement
      files:
        - source: "RealESRGAN_x2.pth"
          dest: "RealESRGAN_x2.pth"

    - repo_id: ai-forever/Real-ESRGAN
      description: RealESRGAN x4 - 4x upscaling model
      size_gb: 0.06
      essential: true
      category: support
      type: upscale_models
      format: pth
      vram_gb: 4
      notes: High-quality 4x upscaling model for detail enhancement
      files:
        - source: "RealESRGAN_x4.pth"
          dest: "RealESRGAN_x4.pth"

    # NOTE(review): the two entries below re-download t5xxl_fp16/clip_l to the
    # same dest names as the FLUX "clip" entry above (different `type` dirs may
    # disambiguate) — confirm the downloader dedupes or targets distinct dirs.
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: T5-XXL FP16 - For CogVideoX text encoding
      size_gb: 9
      essential: true
      category: support
      type: text_encoders
      format: fp16
      vram_gb: 4
      notes: T5 text encoder required for CogVideoX models
      files:
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L - For CogVideoX and SD3
      size_gb: 1
      essential: true
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 1
      notes: CLIP-L text encoder for CogVideoX and SD3 models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-G - For SD3 models
      size_gb: 3
      essential: false
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 2
      notes: CLIP-G text encoder for SD3 models
      files:
        - source: "text_encoders/clip_g.safetensors"
          dest: "clip_g.safetensors"

  # ==========================================================================
  # ANIMATEDIFF MODELS
  # ==========================================================================
  animatediff_models:
    - repo_id: guoyww/animatediff
      description: AnimateDiff Motion Modules
      size_gb: 2
      essential: true
      category: animatediff
      type: animatediff_models
      # NOTE(review): filename says mm_sd_v15 but the file downloaded is
      # mm_sd_v15_v2.ckpt — confirm which one consumers key on.
      filename: mm_sd_v15
      format: safetensors
      vram_gb: 4
      notes: Motion modules for AnimateDiff text-to-video
      files:
        - source: "mm_sd_v15_v2.ckpt"
          dest: "mm_sd_v15_v2.ckpt"

  # ==========================================================================
  # CONTROLNET MODELS
  # ==========================================================================
  controlnet_models:
    - repo_id: lllyasviel/control_v11p_sd15_canny
      description: ControlNet Canny - Edge detection control for SD 1.5
      size_gb: 1.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 2
      notes: Precise edge-based composition control
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "control_v11p_sd15_canny.safetensors"

    # NOTE(review): repo is control_v11f1p_sd15_depth but dest drops the "f1"
    # (control_v11p_sd15_depth.safetensors) — verify this rename is intended.
    - repo_id: lllyasviel/control_v11f1p_sd15_depth
      description: ControlNet Depth - Depth map control for SD 1.5
      size_gb: 1.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 2
      notes: Depth-based spatial control
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "control_v11p_sd15_depth.safetensors"

    - repo_id: diffusers/controlnet-canny-sdxl-1.0
      description: ControlNet Canny SDXL - Edge detection for SDXL
      size_gb: 2.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 3
      notes: Canny edge control for SDXL models
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "controlnet-canny-sdxl-1.0.safetensors"

    - repo_id: diffusers/controlnet-depth-sdxl-1.0
      description: ControlNet Depth SDXL - Depth map for SDXL
      size_gb: 2.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 3
      notes: Depth control for SDXL models
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "controlnet-depth-sdxl-1.0.safetensors"

  # ==========================================================================
  # IP-ADAPTER MODELS
  # ==========================================================================
  ipadapter_models:
    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Base - Style & Composition
      size_gb: 1.3
      essential: true
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Basic IP-Adapter for SDXL
      files:
        - source: "sdxl_models/ip-adapter_sdxl.safetensors"
          dest: "ip-adapter_sdxl.safetensors"

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
      size_gb: 0.9
      essential: true
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: IP-Adapter for SDXL with VIT-H CLIP vision model
      files:
        - source: "sdxl_models/ip-adapter_sdxl_vit-h.safetensors"
          dest: "ip-adapter_sdxl_vit-h.safetensors"

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Plus - High Strength Composition
      size_gb: 0.9
      essential: false
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Enhanced composition control with higher strength
      files:
        - source: "sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors"
          dest: "ip-adapter-plus_sdxl_vit-h.safetensors"

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Plus Face - Face-focused generation
      size_gb: 0.5
      essential: false
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Specialized for face transfer and portrait generation
      files:
        - source: "sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors"
          dest: "ip-adapter-plus-face_sdxl_vit-h.safetensors"

  # ==========================================================================
  # DIFFRHYTHM MODELS (Full-length song generation)
  # ==========================================================================
  diffrhythm_models:
    - repo_id: ASLP-lab/DiffRhythm-1_2
      description: DiffRhythm 1.2 - 95 second generation model
      size_gb: 2
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 12
      duration_seconds: 95
      notes: Latest 95-second generation model
      files:
        - source: "cfm_model.pt"
          dest: "cfm_model_v1_2.pt"

    - repo_id: ASLP-lab/DiffRhythm-full
      description: DiffRhythm Full - 4m45s full-length generation
      size_gb: 2
      essential: false
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 16
      duration_seconds: 285
      notes: Full-length 4 minute 45 second music generation
      files:
        - source: "cfm_model.pt"
          dest: "cfm_full_model.pt"

    - repo_id: ASLP-lab/DiffRhythm-base
      description: DiffRhythm Base - 95 second base model
      size_gb: 2
      essential: false
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 12
      duration_seconds: 95
      notes: Base 95-second model
      files:
        - source: "cfm_model.pt"
          dest: "cfm_model.pt"

    - repo_id: ASLP-lab/DiffRhythm-vae
      description: DiffRhythm VAE - Variational autoencoder
      size_gb: 1
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 2
      notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License)
      files:
        - source: "vae_model.pt"
          dest: "vae_model.pt"

    - repo_id: OpenMuQ/MuQ-MuLan-large
      description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
      size_gb: 3
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/MuQ-MuLan-large
      format: bin
      vram_gb: 4
      notes: Music-text joint embedding for semantic understanding (English/Chinese)
      files:
        - source: "config.json"
          dest: "config.json"
        - source: "pytorch_model.bin"
          dest: "pytorch_model.bin"

    - repo_id: OpenMuQ/MuQ-large-msd-iter
      description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
      size_gb: 1.2
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/MuQ-large-msd-iter
      format: safetensors
      vram_gb: 2
      notes: Music representation model trained on Million Song Dataset
      files:
        - source: "config.json"
          dest: "config.json"
        - source: "model.safetensors"
          dest: "model.safetensors"

    - repo_id: FacebookAI/xlm-roberta-base
      description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params)
      size_gb: 1.1
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/xlm-roberta-base
      format: safetensors
      vram_gb: 1
      notes: Multilingual text encoding for 100 languages
      files:
        - source: "config.json"
          dest: "config.json"
        - source: "model.safetensors"
          dest: "model.safetensors"
        - source: "sentencepiece.bpe.model"
          dest: "sentencepiece.bpe.model"
        - source: "tokenizer.json"
          dest: "tokenizer.json"
        - source: "tokenizer_config.json"
          dest: "tokenizer_config.json"

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================
storage_requirements:
  essential_only:
    image: 30  # FLUX Schnell + SDXL Base
    video: 28  # CogVideoX + SVD
    audio: 11  # MusicGen Medium
    support: 11  # All 3 CLIP models
    diffrhythm: 10  # DiffRhythm essential models
    total: 90  # Total essential storage
  all_models:
    image: 54  # All image models
    video: 36  # All video models
    audio: 36  # All audio models
    support: 11  # All support models
    diffrhythm: 12  # All DiffRhythm models
    total: 149  # Total with optional models

vram_requirements:
  # For 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Image Focus - FLUX FP16
      models: [FLUX.1 Schnell]
      vram_used: 23
      remaining: 1
    - name: Image Focus - FLUX FP8 + SDXL
      models: [FLUX.1 Schnell FP8, SDXL Base]
      vram_used: 24
      remaining: 0
    - name: Video Generation
      models: [CogVideoX-5B optimized, SDXL]
      vram_used: 24
      remaining: 0
    - name: Multi-Modal
      models: [SDXL, MusicGen Medium]
      vram_used: 20
      remaining: 4

# ============================================================================
# INSTALLATION PROFILES
# ============================================================================
installation_profiles:
  minimal:
    description: Minimal setup for testing
    categories: [support_models]
    storage_gb: 11
    estimated_time: 5-10 minutes
  essential:
    description: Essential models only (~80GB)
    categories: [image_models, video_models, audio_models, support_models]
    essential_only: true
    storage_gb: 80
    estimated_time: 1-2 hours
  image_focused:
    description: All image generation models
    categories: [image_models, support_models]
    storage_gb: 65
    estimated_time: 45-90 minutes
  video_focused:
    description: All video generation models
    categories: [video_models, image_models, support_models]
    essential_only: true
    storage_gb: 69
    estimated_time: 1-2 hours
  complete:
    description: All models (including optional)
    categories: [image_models, video_models, audio_models, support_models]
    storage_gb: 137
    estimated_time: 2-4 hours

# ============================================================================
# METADATA
# ============================================================================
metadata:
  # Quoted so version/date stay strings (an unquoted 2025-11-21 parses as a
  # YAML timestamp on many loaders).
  version: "1.0.0"
  last_updated: "2025-11-21"
  compatible_with:
    - "ComfyUI >= 0.1.0"
    - "Python >= 3.10"
    - "HuggingFace Hub >= 0.20.0"
  maintainer: Valknar
  repository: https://github.com/yourusername/runpod