From 2189697734c46fa0a7b5ce7f1e614d7f1d4873ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= Date: Tue, 25 Nov 2025 19:19:42 +0100 Subject: [PATCH] refactor: remove type field from models_huggingface.yaml and include type in dest paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Prepended ComfyUI model type folder (checkpoints/, clip/, vae/, etc.) to all dest paths - Removed separate 'type' field from all model entries - Consolidated SD3.5 duplicate entries (5 → 1) - Simplified model configuration by embedding directory structure directly in destination paths This change eliminates the need to parse the 'type' field separately in artifact_huggingface_download.sh, making the configuration more explicit and easier to understand. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- models_huggingface.yaml | 1881 ++++++++++++++------------------------- 1 file changed, 667 insertions(+), 1214 deletions(-) diff --git a/models_huggingface.yaml b/models_huggingface.yaml index ff9d638..a8011fc 100644 --- a/models_huggingface.yaml +++ b/models_huggingface.yaml @@ -1,1262 +1,715 @@ -# ============================================================================ -# ComfyUI Model Configuration -# ============================================================================ -# -# This configuration file defines all available ComfyUI models for download. -# Models are organized by category: image, video, audio, and support models. -# -# Each model entry contains: -# - repo_id: HuggingFace repository identifier -# - description: Human-readable description -# - size_gb: Approximate size in gigabytes -# - essential: Whether this is an essential model (true/false) -# - category: Model category (image/video/audio/support) -# -# ============================================================================ - -# Global settings settings: cache_dir: /workspace/huggingface_cache parallel_downloads: 1 retry_attempts: 3 timeout_seconds: 3600 - -# Model categories model_categories: - # ========================================================================== - # IMAGE GENERATION MODELS - # ========================================================================== image_models: - - repo_id: black-forest-labs/FLUX.1-schnell - description: FLUX.1 Schnell - Fast 4-step inference - size_gb: 23 - essential: true - category: image - type: unet - format: fp16 - vram_gb: 23 - notes: Industry-leading image generation quality - files: - - source: "flux1-schnell.safetensors" - dest: "flux1-schnell.safetensors" - - - repo_id: black-forest-labs/FLUX.1-dev - description: FLUX.1 Dev - Balanced quality/speed - size_gb: 23 - essential: false - category: image - type: unet - format: fp16 - vram_gb: 23 - notes: Development version with enhanced features - files: - - source: "flux1-dev.safetensors" - dest: "flux1-dev.safetensors" - - - repo_id: runwayml/stable-diffusion-v1-5 - description: SD 1.5 - For AnimateDiff - size_gb: 4 - essential: true - category: image - type: checkpoints - format: fp16 - vram_gb: 8 - notes: Stable Diffusion 1.5 required for AnimateDiff motion modules - files: - - source: "v1-5-pruned-emaonly.safetensors" - dest: "v1-5-pruned-emaonly.safetensors" - - - repo_id: stabilityai/stable-diffusion-xl-base-1.0 - description: SDXL Base 1.0 - Industry standard - size_gb: 7 - essential: true - category: image - type: checkpoints - format: fp16 - vram_gb: 12 - notes: Most widely used Stable Diffusion model - files: - - source: "sd_xl_base_1.0.safetensors" - dest: "sd_xl_base_1.0.safetensors" - - - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0 - description: SDXL Refiner 1.0 - Enhances base output - size_gb: 6 - essential: false - category: image - type: checkpoints - format: fp16 - vram_gb: 12 - notes: Use after SDXL base for improved details - files: - - source: "sd_xl_refiner_1.0.safetensors" - dest: "sd_xl_refiner_1.0.safetensors" - - - repo_id: stabilityai/stable-diffusion-3.5-large - description: SD 3.5 Large - Latest Stability AI - size_gb: 18 - essential: false - category: image - type: checkpoints - format: fp16 - vram_gb: 20 - notes: Newest generation Stable Diffusion - files: - - source: "sd3.5_large.safetensors" - dest: "sd3.5_large.safetensors" - - - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl - description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects - size_gb: 7 - essential: false - category: image - type: checkpoints - format: fp16 - vram_gb: 12 - notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious quality - files: - - source: "*.safetensors" - dest: "diving-illustrious-real-asian-v50-sdxl.safetensors" - - - repo_id: playgroundai/playground-v2.5-1024px-aesthetic - description: Playground v2.5 - 1024px aesthetic images - size_gb: 7 - essential: false - category: image - type: checkpoints - format: fp16 - vram_gb: 12 - notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user studies - files: - - source: "*.safetensors" - dest: "playground-v2.5-1024px-aesthetic.safetensors" - - - repo_id: Lykon/dreamshaper-8 - description: DreamShaper 8 - Multi-style versatile model - size_gb: 4 - essential: false - category: image - type: checkpoints - format: fp16 - vram_gb: 8 - notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with strong LoRA support - files: - - source: "*.safetensors" - dest: "dreamshaper-8.safetensors" - - # ========================================================================== - # VIDEO GENERATION MODELS - # ========================================================================== + - repo_id: black-forest-labs/FLUX.1-schnell + description: FLUX.1 Schnell - Fast 4-step inference + size_gb: 23 + essential: true + category: image + format: fp16 + vram_gb: 23 + notes: Industry-leading image generation quality + files: + - source: flux1-schnell.safetensors + dest: unet/flux1-schnell.safetensors + - repo_id: black-forest-labs/FLUX.1-dev + description: FLUX.1 Dev - Balanced quality/speed + size_gb: 23 + essential: false + category: image + format: fp16 + vram_gb: 23 + notes: Development version with enhanced features + files: + - source: flux1-dev.safetensors + dest: unet/flux1-dev.safetensors + - repo_id: runwayml/stable-diffusion-v1-5 + description: SD 1.5 - For AnimateDiff + size_gb: 4 + essential: true + category: image + format: fp16 + vram_gb: 8 + notes: Stable Diffusion 1.5 required for AnimateDiff motion modules + files: + - source: v1-5-pruned-emaonly.safetensors + dest: checkpoints/v1-5-pruned-emaonly.safetensors + - repo_id: stabilityai/stable-diffusion-xl-base-1.0 + description: SDXL Base 1.0 - Industry standard + size_gb: 7 + essential: true + category: image + format: fp16 + vram_gb: 12 + notes: Most widely used Stable Diffusion model + files: + - source: sd_xl_base_1.0.safetensors + dest: checkpoints/sd_xl_base_1.0.safetensors + - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0 + description: SDXL Refiner 1.0 - Enhances base output + size_gb: 6 + essential: false + category: image + format: fp16 + vram_gb: 12 + notes: Use after SDXL base for improved details + files: + - source: sd_xl_refiner_1.0.safetensors + dest: checkpoints/sd_xl_refiner_1.0.safetensors + - repo_id: stabilityai/stable-diffusion-3.5-large + description: SD 3.5 Large Complete - Checkpoint and text encoders + size_gb: 31 + essential: false + category: image + format: mixed + vram_gb: 20 + notes: Complete SD3.5 Large model with checkpoint and all text encoders (CLIP-L, + CLIP-G, T5-XXL) + files: + - source: sd3.5_large.safetensors + dest: checkpoints/sd3.5_large.safetensors + - source: text_encoders/clip_l.safetensors + dest: checkpoints/clip_l.safetensors + - source: text_encoders/clip_g.safetensors + dest: checkpoints/clip_g.safetensors + - source: text_encoders/t5xxl_fp16.safetensors + dest: checkpoints/t5xxl_fp16.safetensors + - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl + description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects + size_gb: 7 + essential: false + category: image + format: fp16 + vram_gb: 12 + notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious + quality + files: + - source: unet/diffusion_pytorch_model.safetensors + dest: checkpoints/diving-illustrious-real-asian-v50-sdxl.safetensors + - repo_id: playgroundai/playground-v2.5-1024px-aesthetic + description: Playground v2.5 - 1024px aesthetic images + size_gb: 7 + essential: false + category: image + format: fp16 + vram_gb: 12 + notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user + studies + files: + - source: playground-v2.5-1024px-aesthetic.fp16.safetensors + dest: checkpoints/playground-v2.5-1024px-aesthetic.safetensors + - repo_id: Lykon/dreamshaper-8 + description: DreamShaper 8 - Multi-style versatile model + size_gb: 4 + essential: false + category: image + format: fp16 + vram_gb: 8 + notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with + strong LoRA support + files: + - source: unet/diffusion_pytorch_model.fp16.safetensors + dest: checkpoints/dreamshaper-8.safetensors video_models: - - repo_id: THUDM/CogVideoX-5b - description: CogVideoX-5B - Professional text-to-video - size_gb: 20 - essential: true - category: video - type: diffusion_models - format: fp16 - vram_gb: 20 - frames: 49 - resolution: 720p - notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node - files: - - source: "transformer/diffusion_pytorch_model-00001-of-00002.safetensors" - dest: "cogvideox-5b-transformer-00001-of-00002.safetensors" - - source: "transformer/diffusion_pytorch_model-00002-of-00002.safetensors" - dest: "cogvideox-5b-transformer-00002-of-00002.safetensors" - - source: "transformer/diffusion_pytorch_model.safetensors.index.json" - dest: "cogvideox-5b-transformer.safetensors.index.json" - - - repo_id: THUDM/CogVideoX-5b-I2V - description: CogVideoX-5B-I2V - Image-to-video generation - size_gb: 20 - essential: true - category: video - type: diffusion_models - format: fp16 - vram_gb: 20 - frames: 49 - resolution: 720p - notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node - files: - - source: "transformer/diffusion_pytorch_model-00001-of-00003.safetensors" - dest: "cogvideox-5b-i2v-transformer-00001-of-00003.safetensors" - - source: "transformer/diffusion_pytorch_model-00002-of-00003.safetensors" - dest: "cogvideox-5b-i2v-transformer-00002-of-00003.safetensors" - - source: "transformer/diffusion_pytorch_model-00003-of-00003.safetensors" - dest: "cogvideox-5b-i2v-transformer-00003-of-00003.safetensors" - - source: "transformer/diffusion_pytorch_model.safetensors.index.json" - dest: "cogvideox-5b-i2v-transformer.safetensors.index.json" - - - repo_id: stabilityai/stable-video-diffusion-img2vid - description: SVD - 14 frame image-to-video - size_gb: 8 - essential: true - category: video - type: checkpoints - format: fp16 - vram_gb: 20 - frames: 14 - resolution: 576x1024 - notes: Convert images to short video clips - files: - - source: "svd.safetensors" - dest: "svd.safetensors" - - - repo_id: stabilityai/stable-video-diffusion-img2vid-xt - description: SVD-XT - 25 frame image-to-video - size_gb: 8 - essential: false - category: video - type: checkpoints - format: fp16 - vram_gb: 20 - frames: 25 - resolution: 576x1024 - notes: Extended frame count version - files: - - source: "svd_xt.safetensors" - dest: "svd_xt.safetensors" - - # HunyuanVideo - Original (720p, T2V/I2V) - - repo_id: Comfy-Org/HunyuanVideo_repackaged - description: HunyuanVideo T2V - 720p text-to-video with MLLM encoders - size_gb: 20 - essential: true - category: video - type: diffusion_models - format: bf16 - vram_gb: 24 - frames: 129 - resolution: 720p - notes: 5-second T2V generation with Chinese/English support, DiT architecture with 3D VAE - files: - - source: "split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors" - dest: "hunyuan_video_t2v_720p_bf16.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_repackaged - description: HunyuanVideo I2V v1 - 720p image-to-video (concat method) - size_gb: 20 - essential: true - category: video - type: diffusion_models - format: bf16 - vram_gb: 24 - frames: 129 - resolution: 720p - notes: Static image to video with concat conditioning, better motion fluidity - files: - - source: "split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors" - dest: "hunyuan_video_image_to_video_720p_bf16.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_repackaged - description: HunyuanVideo I2V v2 - 720p image-to-video (replace method) - size_gb: 20 - essential: true - category: video - type: diffusion_models - format: bf16 - vram_gb: 24 - frames: 129 - resolution: 720p - notes: Updated I2V with replace conditioning, better image guidance adherence - files: - - source: "split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors" - dest: "hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors" - - # HunyuanVideo 1.5 - Latest generation (720p/1080p, T2V/I2V) - - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged - description: HunyuanVideo 1.5 T2V - 720p text-to-video (8.3B parameters) - size_gb: 18 - essential: true - category: video - type: diffusion_models - format: fp16 - vram_gb: 24 - frames: 129-257 - resolution: 720p - notes: 5-10 second T2V with Qwen 2.5 VL encoder, requires 24GB VRAM - files: - - source: "hunyuanvideo1.5_720p_t2v_fp16.safetensors" - dest: "hunyuanvideo1.5_720p_t2v_fp16.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged - description: HunyuanVideo 1.5 SR - 1080p super-resolution (distilled) - size_gb: 18 - essential: false - category: video - type: diffusion_models - format: fp16 - vram_gb: 24 - frames: 129-257 - resolution: 1080p - notes: Upscales 720p to 1080p with distilled model for faster generation - files: - - source: "hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors" - dest: "hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors" - - # Wan2.2 5B - Hybrid text+image to video (low VRAM) - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 TI2V 5B - Hybrid text+image to video (8GB VRAM) - size_gb: 10 - essential: true - category: video - type: diffusion_models - format: fp16 - vram_gb: 8 - frames: 81 - resolution: 640x640 - notes: Efficient 5B model with native offloading, dual-expert architecture - files: - - source: "wan2.2_ti2v_5B_fp16.safetensors" - dest: "wan2.2_ti2v_5B_fp16.safetensors" - - # Wan2.2 14B T2V - Dual-expert text-to-video - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 T2V High Noise 14B - Text-to-video high noise expert (FP8) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Dual-expert T2V high noise denoising, FP8 quantized for 24GB GPU - files: - - source: "wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 T2V Low Noise 14B - Text-to-video low noise expert (FP8) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Dual-expert T2V low noise refinement, FP8 quantized for 24GB GPU - files: - - source: "wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors" - - # Wan2.2 14B I2V - Image-to-video with content consistency - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 I2V High Noise 14B - Image-to-video high noise expert (FP16) - size_gb: 28 - essential: true - category: video - type: diffusion_models - format: fp16 - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Dual-expert I2V high noise denoising with content consistency - files: - - source: "wan2.2_i2v_high_noise_14B_fp16.safetensors" - dest: "wan2.2_i2v_high_noise_14B_fp16.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 I2V Low Noise 14B - Image-to-video low noise expert (FP16) - size_gb: 28 - essential: true - category: video - type: diffusion_models - format: fp16 - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Dual-expert I2V low noise refinement with content consistency - files: - - source: "wan2.2_i2v_low_noise_14B_fp16.safetensors" - dest: "wan2.2_i2v_low_noise_14B_fp16.safetensors" - - # Wan2.2 14B Animate - Video-to-video character animation - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Animate 14B - Video-to-video character animation (BF16) - size_gb: 28 - essential: true - category: video - type: diffusion_models - format: bf16 - vram_gb: 24 - frames: 81 - resolution: multiples of 16 - notes: V2V animation with Mix/Move modes, requires CLIP Vision H for reference image - files: - - source: "wan2.2_animate_14B_bf16.safetensors" - dest: "wan2.2_animate_14B_bf16.safetensors" - - # Wan2.2 14B S2V - Sound-to-video synchronization - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 S2V 14B - Sound-to-video with audio sync (FP8) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Transforms static images + audio into synchronized videos, uses Wav2Vec2 audio encoder - files: - - source: "wan2.2_s2v_14B_fp8_scaled.safetensors" - dest: "wan2.2_s2v_14B_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 S2V 14B - Sound-to-video with audio sync (BF16 quality) - size_gb: 28 - essential: false - category: video - type: diffusion_models - format: bf16 - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Higher quality BF16 version of S2V for better output quality - files: - - source: "wan2.2_s2v_14B_bf16.safetensors" - dest: "wan2.2_s2v_14B_bf16.safetensors" - - # Wan2.2 14B Fun Inpaint - Start-end frame controlled generation - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Fun Inpaint High Noise 14B - Start-end frame transition (FP8) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Generates transition between start and end frames with high noise denoising - files: - - source: "wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Fun Inpaint Low Noise 14B - Start-end frame transition (FP8) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: Generates transition between start and end frames with low noise refinement - files: - - source: "wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors" - - # Wan2.2 14B Fun Control - ControlNet-style conditioning - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Fun Control High Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: I2V with control conditions (Canny, Depth, OpenPose, MLSD, trajectory), requires controlnet_aux - files: - - source: "wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Fun Control Low Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: I2V with control conditions low noise refinement - files: - - source: "wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors" - - # Wan2.2 14B Fun Camera - Camera motion control - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Fun Camera High Noise 14B - Camera motion control (pan/zoom/static) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: I2V with camera motion control (pan, zoom, static), 108s with LoRA / 536s without - files: - - source: "wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 Fun Camera Low Noise 14B - Camera motion control (pan/zoom/static) - size_gb: 14 - essential: true - category: video - type: diffusion_models - format: fp8_scaled - vram_gb: 24 - frames: 81 - resolution: 640x640 - notes: I2V with camera motion control low noise refinement - files: - - source: "wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors" - dest: "wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors" - - # ========================================================================== - # AUDIO GENERATION MODELS - # ========================================================================== + - repo_id: THUDM/CogVideoX-5b + description: CogVideoX-5B - Professional text-to-video + size_gb: 20 + essential: true + category: video + format: fp16 + vram_gb: 20 + frames: 49 + resolution: 720p + notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel + node + files: + - source: transformer/diffusion_pytorch_model-00001-of-00002.safetensors + dest: diffusion_models/cogvideox-5b-transformer-00001-of-00002.safetensors + - source: transformer/diffusion_pytorch_model-00002-of-00002.safetensors + dest: diffusion_models/cogvideox-5b-transformer-00002-of-00002.safetensors + - source: transformer/diffusion_pytorch_model.safetensors.index.json + dest: diffusion_models/cogvideox-5b-transformer.safetensors.index.json + - repo_id: THUDM/CogVideoX-5b-I2V + description: CogVideoX-5B-I2V - Image-to-video generation + size_gb: 20 + essential: true + category: video + format: fp16 + vram_gb: 20 + frames: 49 + resolution: 720p + notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node + files: + - source: transformer/diffusion_pytorch_model-00001-of-00003.safetensors + dest: diffusion_models/cogvideox-5b-i2v-transformer-00001-of-00003.safetensors + - source: transformer/diffusion_pytorch_model-00002-of-00003.safetensors + dest: diffusion_models/cogvideox-5b-i2v-transformer-00002-of-00003.safetensors + - source: transformer/diffusion_pytorch_model-00003-of-00003.safetensors + dest: diffusion_models/cogvideox-5b-i2v-transformer-00003-of-00003.safetensors + - source: transformer/diffusion_pytorch_model.safetensors.index.json + dest: diffusion_models/cogvideox-5b-i2v-transformer.safetensors.index.json + - repo_id: stabilityai/stable-video-diffusion-img2vid + description: SVD - 14 frame image-to-video + size_gb: 8 + essential: true + category: video + format: fp16 + vram_gb: 20 + frames: 14 + resolution: 576x1024 + notes: Convert images to short video clips + files: + - source: svd.safetensors + dest: checkpoints/svd.safetensors + - repo_id: stabilityai/stable-video-diffusion-img2vid-xt + description: SVD-XT - 25 frame image-to-video + size_gb: 8 + essential: false + category: video + format: fp16 + vram_gb: 20 + frames: 25 + resolution: 576x1024 + notes: Extended frame count version + files: + - source: svd_xt.safetensors + dest: checkpoints/svd_xt.safetensors + - repo_id: Comfy-Org/HunyuanVideo_repackaged + description: HunyuanVideo Complete - 720p T2V/I2V models with VAE and encoders + size_gb: 51 + essential: true + category: video + format: bf16 + vram_gb: 24 + frames: 129 + resolution: 720p + notes: Complete HunyuanVideo family - T2V, I2V v1/v2, 3D VAE, LLaVA LLaMA3 text/vision + encoders + files: + - source: split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors + dest: diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors + - source: split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors + dest: diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors + - source: split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors + dest: diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors + - source: split_files/vae/hunyuan_video_vae_bf16.safetensors + dest: diffusion_models/hunyuan_video_vae_bf16.safetensors + - source: split_files/text_encoders/llava_llama3_fp8_scaled.safetensors + dest: diffusion_models/llava_llama3_fp8_scaled.safetensors + - source: split_files/clip_vision/llava_llama3_vision.safetensors + dest: diffusion_models/llava_llama3_vision.safetensors + - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged + description: HunyuanVideo 1.5 Complete - 720p/1080p T2V/SR with encoders + size_gb: 51.5 + essential: true + category: video + format: fp16 + vram_gb: 24 + frames: 129-257 + resolution: 720p-1080p + notes: Complete HunyuanVideo 1.5 - T2V 720p, SR 1080p, VAE, Qwen 2.5 VL, ByT5 + GlyphXL encoders + files: + - source: hunyuanvideo1.5_720p_t2v_fp16.safetensors + dest: diffusion_models/hunyuanvideo1.5_720p_t2v_fp16.safetensors + - source: hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors + dest: diffusion_models/hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors + - source: hunyuanvideo15_vae_fp16.safetensors + dest: diffusion_models/hunyuanvideo15_vae_fp16.safetensors + - source: qwen_2.5_vl_7b_fp8_scaled.safetensors + dest: diffusion_models/qwen_2.5_vl_7b_fp8_scaled.safetensors + - source: byt5_small_glyphxl_fp16.safetensors + dest: diffusion_models/byt5_small_glyphxl_fp16.safetensors + - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged + description: Wan2.2 Complete - All video models, VAEs, and LoRAs + size_gb: 220 + essential: true + category: video + format: mixed + vram_gb: 24 + frames: 81 + resolution: 640x640 + notes: Complete Wan2.2 model family - TI2V 5B, T2V 14B, I2V 14B, Animate, S2V, + Fun Inpaint/Control/Camera, VAEs, CLIP Vision H, Wav2Vec2, and LoRA accelerators + files: + - source: wan2.2_ti2v_5B_fp16.safetensors + dest: diffusion_models/wan2.2_ti2v_5B_fp16.safetensors + - source: wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors + - source: wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors + - source: wan2.2_i2v_high_noise_14B_fp16.safetensors + dest: diffusion_models/wan2.2_i2v_high_noise_14B_fp16.safetensors + - source: wan2.2_i2v_low_noise_14B_fp16.safetensors + dest: diffusion_models/wan2.2_i2v_low_noise_14B_fp16.safetensors + - source: wan2.2_animate_14B_bf16.safetensors + dest: diffusion_models/wan2.2_animate_14B_bf16.safetensors + - source: wan2.2_s2v_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_s2v_14B_fp8_scaled.safetensors + - source: wan2.2_s2v_14B_bf16.safetensors + dest: diffusion_models/wan2.2_s2v_14B_bf16.safetensors + - source: wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors + - source: wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors + - source: wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors + - source: wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors + - source: wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors + - source: wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors + dest: diffusion_models/wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors + - source: wan2.2_vae.safetensors + dest: diffusion_models/wan2.2_vae.safetensors + - source: wan_2.1_vae.safetensors + dest: diffusion_models/wan_2.1_vae.safetensors + - source: clip_vision_h.safetensors + dest: diffusion_models/clip_vision_h.safetensors + - source: wav2vec2_large_english_fp16.safetensors + dest: diffusion_models/wav2vec2_large_english_fp16.safetensors + - source: lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors + dest: diffusion_models/lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors + - source: wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors + dest: diffusion_models/wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors + - source: wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors + dest: diffusion_models/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors + - source: wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors + dest: diffusion_models/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors audio_models: - - repo_id: facebook/musicgen-small - description: MusicGen Small - Fast generation - size_gb: 3 - essential: false - category: audio - type: musicgen - format: fp32 - vram_gb: 4 - duration_seconds: 30 - notes: Fastest music generation, lower quality - files: - - source: "pytorch_model.bin" - dest: "musicgen-small-pytorch_model.bin" - - - repo_id: facebook/musicgen-medium - description: MusicGen Medium - Balanced quality - size_gb: 11 - essential: true - category: audio - type: musicgen - format: fp32 - vram_gb: 8 - duration_seconds: 30 - notes: Best balance of speed and quality - files: - - source: "pytorch_model.bin" - dest: "musicgen-medium-pytorch_model.bin" - - - repo_id: facebook/musicgen-large - description: MusicGen Large - Highest quality - size_gb: 22 - essential: false - category: audio - type: musicgen - format: fp32 - vram_gb: 16 - duration_seconds: 30 - notes: Best quality, slower generation - files: - - source: "pytorch_model-00001-of-00002.bin" - dest: "musicgen-large-pytorch_model-00001-of-00002.bin" - - source: "pytorch_model-00002-of-00002.bin" - dest: "musicgen-large-pytorch_model-00002-of-00002.bin" - - source: "pytorch_model.bin.index.json" - dest: "musicgen-large-pytorch_model.bin.index.json" - - # ACE Step v1 3.5B - State-of-the-art music generation - - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged - description: ACE Step v1 3.5B - Fast coherent music generation with 19-language support - size_gb: 7.7 - essential: true - category: audio - type: checkpoints - format: safetensors - vram_gb: 16 - duration_seconds: 240 - notes: 15x faster than LLM baselines, superior structural coherence, voice cloning, 19-language lyrics - files: - - source: "all_in_one/ace_step_v1_3.5b.safetensors" - dest: "ace_step_v1_3.5b.safetensors" - - # ACE Step Chinese RAP LoRA (optional) - - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA - description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop genre - size_gb: 0.3 - essential: false - category: audio - type: loras - format: safetensors - notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence - files: - - source: "pytorch_lora_weights.safetensors" - dest: "ace-step-chinese-rap-lora.safetensors" - - # ========================================================================== - # SUPPORT MODELS (CLIP, IP-Adapter, etc.) - # ========================================================================== + - repo_id: facebook/musicgen-small + description: MusicGen Small - Fast generation + size_gb: 3 + essential: false + category: audio + format: fp32 + vram_gb: 4 + duration_seconds: 30 + notes: Fastest music generation, lower quality + files: + - source: pytorch_model.bin + dest: musicgen/musicgen-small-pytorch_model.bin + - repo_id: facebook/musicgen-medium + description: MusicGen Medium - Balanced quality + size_gb: 11 + essential: true + category: audio + format: fp32 + vram_gb: 8 + duration_seconds: 30 + notes: Best balance of speed and quality + files: + - source: pytorch_model.bin + dest: musicgen/musicgen-medium-pytorch_model.bin + - repo_id: facebook/musicgen-large + description: MusicGen Large - Highest quality + size_gb: 22 + essential: false + category: audio + format: fp32 + vram_gb: 16 + duration_seconds: 30 + notes: Best quality, slower generation + files: + - source: pytorch_model-00001-of-00002.bin + dest: musicgen/musicgen-large-pytorch_model-00001-of-00002.bin + - source: pytorch_model-00002-of-00002.bin + dest: musicgen/musicgen-large-pytorch_model-00002-of-00002.bin + - source: pytorch_model.bin.index.json + dest: musicgen/musicgen-large-pytorch_model.bin.index.json + - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged + description: ACE Step v1 3.5B - Fast coherent music generation with 19-language + support + size_gb: 7.7 + essential: true + category: audio + format: safetensors + vram_gb: 16 + duration_seconds: 240 + notes: 15x faster than LLM baselines, superior structural coherence, voice cloning, + 19-language lyrics + files: + - source: all_in_one/ace_step_v1_3.5b.safetensors + dest: checkpoints/ace_step_v1_3.5b.safetensors + - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA + description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop + genre + size_gb: 0.3 + essential: false + category: audio + format: safetensors + notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence + files: + - source: pytorch_lora_weights.safetensors + dest: loras/ace-step-chinese-rap-lora.safetensors support_models: - - repo_id: openai/clip-vit-large-patch14 - description: CLIP H - For SD 1.5 IP-Adapter - size_gb: 2 - essential: true - category: support - type: clip_vision - format: fp32 - vram_gb: 2 - notes: Text-image understanding model for IP-Adapter - files: - - source: "model.safetensors" - dest: "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors" - - - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k - description: CLIP G - For SDXL IP-Adapter - size_gb: 7 - essential: true - category: support - type: clip_vision - format: fp32 - vram_gb: 4 - notes: Larger CLIP model for SDXL IP-Adapter - files: - - source: "open_clip_model.safetensors" - dest: "CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors" - - - repo_id: google/siglip-so400m-patch14-384 - description: SigLIP - For FLUX models - size_gb: 2 - essential: true - category: support - type: clip_vision - format: fp32 - vram_gb: 2 - notes: Advanced image-text alignment - files: - - source: "model.safetensors" - dest: "siglip-so400m-patch14-384.safetensors" - - - repo_id: stabilityai/stable-diffusion-3.5-large - description: CLIP-L and T5-XXL - For FLUX text encoding - size_gb: 10 - essential: true - category: support - type: clip - format: fp16 - vram_gb: 4 - notes: CLIP text encoders required for FLUX models - files: - - source: "text_encoders/clip_l.safetensors" - dest: "clip_l.safetensors" - - source: "text_encoders/t5xxl_fp16.safetensors" - dest: "t5xxl_fp16.safetensors" - - - repo_id: black-forest-labs/FLUX.1-schnell - description: FLUX VAE - Autoencoder for FLUX models - size_gb: 0.5 - essential: true - category: support - type: vae - format: safetensors - vram_gb: 1 - notes: VAE autoencoder required for FLUX image decoding - files: - - source: "ae.safetensors" - dest: "ae.safetensors" - - - repo_id: ai-forever/Real-ESRGAN - description: RealESRGAN x2 - 2x upscaling model - size_gb: 0.06 - essential: true - category: support - type: upscale_models - format: pth - vram_gb: 2 - notes: Fast 2x upscaling model for general purpose enhancement - files: - - source: "RealESRGAN_x2.pth" - dest: "RealESRGAN_x2.pth" - - - repo_id: ai-forever/Real-ESRGAN - description: RealESRGAN x4 - 4x upscaling model - size_gb: 0.06 - essential: true - category: support - type: upscale_models - format: pth - vram_gb: 4 - notes: High-quality 4x upscaling model for detail enhancement - files: - - source: "RealESRGAN_x4.pth" - dest: "RealESRGAN_x4.pth" - - - repo_id: stabilityai/stable-diffusion-3.5-large - description: T5-XXL FP16 - For CogVideoX text encoding - size_gb: 9 - essential: true - category: support - type: text_encoders - format: fp16 - vram_gb: 4 - notes: T5 text encoder required for CogVideoX models - files: - - source: "text_encoders/t5xxl_fp16.safetensors" - dest: "t5xxl_fp16.safetensors" - - - repo_id: stabilityai/stable-diffusion-3.5-large - description: CLIP-L - For CogVideoX and SD3 - size_gb: 1 - essential: true - category: support - type: text_encoders - format: fp32 - vram_gb: 1 - notes: CLIP-L text encoder for CogVideoX and SD3 models - files: - - source: "text_encoders/clip_l.safetensors" - dest: "clip_l.safetensors" - - - repo_id: stabilityai/stable-diffusion-3.5-large - description: CLIP-G - For SD3 models - size_gb: 3 - essential: false - category: support - type: text_encoders - format: fp32 - vram_gb: 2 - notes: CLIP-G text encoder for SD3 models - files: - - source: "text_encoders/clip_g.safetensors" - dest: "clip_g.safetensors" - - # HunyuanVideo Support Models - - repo_id: Comfy-Org/HunyuanVideo_repackaged - description: HunyuanVideo VAE - 3D VAE for video encoding/decoding (BF16) - size_gb: 1 - essential: true - category: support - type: vae - format: bf16 - vram_gb: 2 - notes: 3D VAE autoencoder for HunyuanVideo models - files: - - source: "split_files/vae/hunyuan_video_vae_bf16.safetensors" - dest: "hunyuan_video_vae_bf16.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_repackaged - description: LLaVA LLaMA3 FP8 - Multimodal text encoder for HunyuanVideo - size_gb: 8 - essential: true - category: support - type: text_encoders - format: fp8_scaled - vram_gb: 4 - notes: LLaVA LLaMA3-based text encoder with FP8 quantization - files: - - source: "split_files/text_encoders/llava_llama3_fp8_scaled.safetensors" - dest: "llava_llama3_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_repackaged - description: LLaVA LLaMA3 Vision - Vision encoder for HunyuanVideo I2V - size_gb: 2 - essential: true - category: support - type: clip_vision - format: safetensors - vram_gb: 2 - notes: Vision encoder for image-to-video conditioning - files: - - source: "split_files/clip_vision/llava_llama3_vision.safetensors" - dest: "llava_llama3_vision.safetensors" - - # HunyuanVideo 1.5 Support Models - - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged - description: HunyuanVideo 1.5 VAE - VAE for v1.5 models (FP16) - size_gb: 1 - essential: true - category: support - type: vae - format: fp16 - vram_gb: 2 - notes: VAE autoencoder for HunyuanVideo 1.5 - files: - - source: "hunyuanvideo15_vae_fp16.safetensors" - dest: "hunyuanvideo15_vae_fp16.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged - description: Qwen 2.5 VL 7B FP8 - Vision-language encoder for HunyuanVideo 1.5 - size_gb: 14 - essential: true - category: support - type: text_encoders - format: fp8_scaled - vram_gb: 8 - notes: Qwen 2.5 VL 7B text encoder with FP8 quantization - files: - - source: "qwen_2.5_vl_7b_fp8_scaled.safetensors" - dest: "qwen_2.5_vl_7b_fp8_scaled.safetensors" - - - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged - description: ByT5 Small GlyphXL FP16 - Glyph-aware text encoder for HunyuanVideo 1.5 - size_gb: 0.5 - essential: true - category: support - type: text_encoders - format: fp16 - vram_gb: 1 - notes: ByT5 small text encoder with glyph awareness - files: - - source: "byt5_small_glyphxl_fp16.safetensors" - dest: "byt5_small_glyphxl_fp16.safetensors" - - # Wan2.2 Support Models - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan2.2 VAE - VAE for Wan2.2 5B models - size_gb: 0.5 - essential: true - category: support - type: vae - format: safetensors - vram_gb: 1 - notes: VAE autoencoder for Wan2.2 5B TI2V model - files: - - source: "wan2.2_vae.safetensors" - dest: "wan2.2_vae.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wan 2.1 VAE - VAE for Wan2.2 14B models - size_gb: 0.5 - essential: true - category: support - type: vae - format: safetensors - vram_gb: 1 - notes: VAE autoencoder for all Wan2.2 14B models (T2V, I2V, S2V, Animate, etc.) - files: - - source: "wan_2.1_vae.safetensors" - dest: "wan_2.1_vae.safetensors" - - - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged - description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models - size_gb: 10 - essential: true - category: support - type: text_encoders - format: fp8_scaled - vram_gb: 5 - notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized - files: - - source: "umt5_xxl_fp8_e4m3fn_scaled.safetensors" - dest: "umt5_xxl_fp8_e4m3fn_scaled.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: CLIP Vision H - Vision encoder for Wan2.2 Animate mode - size_gb: 4 - essential: true - category: support - type: clip_vision - format: safetensors - vram_gb: 2 - notes: CLIP Vision H for reference image in Wan2.2 Animate video-to-video - files: - - source: "clip_vision_h.safetensors" - dest: "clip_vision_h.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Wav2Vec2 Large English FP16 - Audio encoder for Wan2.2 S2V - size_gb: 1 - essential: true - category: support - type: audio_models - format: fp16 - vram_gb: 2 - notes: Audio encoder for sound-to-video synchronization - files: - - source: "wav2vec2_large_english_fp16.safetensors" - dest: "wav2vec2_large_english_fp16.safetensors" - - # Wan2.2 LoRA Accelerators (4-step distillation) - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Lightx2v I2V Animate LoRA - 4-step acceleration for Wan2.2 Animate - size_gb: 0.5 - essential: true - category: support - type: loras - format: bf16 - vram_gb: 1 - notes: 4-step LoRA for Wan2.2 Animate (480p, cfg distilled), 5x speedup - files: - - source: "lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors" - dest: "lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Lightx2v T2V High Noise LoRA - 4-step acceleration for Wan2.2 T2V high noise - size_gb: 0.5 - essential: true - category: support - type: loras - format: safetensors - vram_gb: 1 - notes: 4-step LoRA for T2V high noise expert, v1.1 - files: - - source: "wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors" - dest: "wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Lightx2v I2V High Noise LoRA - 4-step acceleration for Wan2.2 I2V high noise - size_gb: 0.5 - essential: true - category: support - type: loras - format: safetensors - vram_gb: 1 - notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera high noise expert - files: - - source: "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors" - dest: "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors" - - - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged - description: Lightx2v I2V Low Noise LoRA - 4-step acceleration for Wan2.2 I2V low noise - size_gb: 0.5 - essential: true - category: support - type: loras - format: safetensors - vram_gb: 1 - notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera low noise expert - files: - - source: "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors" - dest: "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors" - - # ========================================================================== - # ANIMATEDIFF MODELS - # ========================================================================== + - repo_id: openai/clip-vit-large-patch14 + description: CLIP H - For SD 1.5 IP-Adapter + size_gb: 2 + essential: true + category: support + format: fp32 + vram_gb: 2 + notes: Text-image understanding model for IP-Adapter + files: + - source: model.safetensors + dest: clip_vision/CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors + - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k + description: CLIP G - For SDXL IP-Adapter + size_gb: 7 + essential: true + category: support + format: fp32 + vram_gb: 4 + notes: Larger CLIP model for SDXL IP-Adapter + files: + - source: open_clip_model.safetensors + dest: clip_vision/CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors + - repo_id: google/siglip-so400m-patch14-384 + description: SigLIP - For FLUX models + size_gb: 2 + essential: true + category: support + format: fp32 + vram_gb: 2 + notes: Advanced image-text alignment + files: + - source: model.safetensors + dest: clip_vision/siglip-so400m-patch14-384.safetensors + - repo_id: black-forest-labs/FLUX.1-schnell + description: FLUX VAE - Autoencoder for FLUX models + size_gb: 0.5 + essential: true + category: support + format: safetensors + vram_gb: 1 + notes: VAE autoencoder required for FLUX image decoding + files: + - source: ae.safetensors + dest: vae/ae.safetensors + - repo_id: ai-forever/Real-ESRGAN + description: RealESRGAN x2 - 2x upscaling model + size_gb: 0.06 + essential: true + category: support + format: pth + vram_gb: 2 + notes: Fast 2x upscaling model for general purpose enhancement + files: + - source: RealESRGAN_x2.pth + dest: upscale_models/RealESRGAN_x2.pth + - repo_id: ai-forever/Real-ESRGAN + description: RealESRGAN x4 - 4x upscaling model + size_gb: 0.06 + essential: true + category: support + format: pth + vram_gb: 4 + notes: High-quality 4x upscaling model for detail enhancement + files: + - source: RealESRGAN_x4.pth + dest: upscale_models/RealESRGAN_x4.pth + - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged + description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models + size_gb: 10 + essential: true + category: support + format: fp8_scaled + vram_gb: 5 + notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized + files: + - source: umt5_xxl_fp8_e4m3fn_scaled.safetensors + dest: text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors animatediff_models: - - repo_id: guoyww/animatediff - description: AnimateDiff Motion Modules - size_gb: 2 - essential: true - category: animatediff - type: animatediff_models - filename: mm_sd_v15 - format: safetensors - vram_gb: 4 - notes: Motion modules for AnimateDiff text-to-video - files: - - source: "mm_sd_v15_v2.ckpt" - dest: "mm_sd_v15_v2.ckpt" - - # ========================================================================== - # CONTROLNET MODELS - # ========================================================================== + - repo_id: guoyww/animatediff + description: AnimateDiff Motion Modules + size_gb: 2 + essential: true + category: animatediff + filename: mm_sd_v15 + format: safetensors + vram_gb: 4 + notes: Motion modules for AnimateDiff text-to-video + files: + - source: mm_sd_v15_v2.ckpt + dest: animatediff_models/mm_sd_v15_v2.ckpt controlnet_models: - - repo_id: lllyasviel/control_v11p_sd15_canny - description: ControlNet Canny - Edge detection control for SD 1.5 - size_gb: 1.5 - essential: false - category: controlnet - type: controlnet - format: safetensors - vram_gb: 2 - notes: Precise edge-based composition control - files: - - source: "diffusion_pytorch_model.safetensors" - dest: "control_v11p_sd15_canny.safetensors" - - - repo_id: lllyasviel/control_v11f1p_sd15_depth - description: ControlNet Depth - Depth map control for SD 1.5 - size_gb: 1.5 - essential: false - category: controlnet - type: controlnet - format: safetensors - vram_gb: 2 - notes: Depth-based spatial control - files: - - source: "diffusion_pytorch_model.safetensors" - dest: "control_v11p_sd15_depth.safetensors" - - - repo_id: diffusers/controlnet-canny-sdxl-1.0 - description: ControlNet Canny SDXL - Edge detection for SDXL - size_gb: 2.5 - essential: false - category: controlnet - type: controlnet - format: safetensors - vram_gb: 3 - notes: Canny edge control for SDXL models - files: - - source: "diffusion_pytorch_model.safetensors" - dest: "controlnet-canny-sdxl-1.0.safetensors" - - - repo_id: diffusers/controlnet-depth-sdxl-1.0 - description: ControlNet Depth SDXL - Depth map for SDXL - size_gb: 2.5 - essential: false - category: controlnet - type: controlnet - format: safetensors - vram_gb: 3 - notes: Depth control for SDXL models - files: - - source: "diffusion_pytorch_model.safetensors" - dest: "controlnet-depth-sdxl-1.0.safetensors" - - # ========================================================================== - # IP-ADAPTER MODELS - # ========================================================================== + - repo_id: lllyasviel/control_v11p_sd15_canny + description: ControlNet Canny - Edge detection control for SD 1.5 + size_gb: 1.5 + essential: false + category: controlnet + format: safetensors + vram_gb: 2 + notes: Precise edge-based composition control + files: + - source: diffusion_pytorch_model.safetensors + dest: controlnet/control_v11p_sd15_canny.safetensors + - repo_id: lllyasviel/control_v11f1p_sd15_depth + description: ControlNet Depth - Depth map control for SD 1.5 + size_gb: 1.5 + essential: false + category: controlnet + format: safetensors + vram_gb: 2 + notes: Depth-based spatial control + files: + - source: diffusion_pytorch_model.safetensors + dest: controlnet/control_v11p_sd15_depth.safetensors + - repo_id: diffusers/controlnet-canny-sdxl-1.0 + description: ControlNet Canny SDXL - Edge detection for SDXL + size_gb: 2.5 + essential: false + category: controlnet + format: safetensors + vram_gb: 3 + notes: Canny edge control for SDXL models + files: + - source: diffusion_pytorch_model.safetensors + dest: controlnet/controlnet-canny-sdxl-1.0.safetensors + - repo_id: diffusers/controlnet-depth-sdxl-1.0 + description: ControlNet Depth SDXL - Depth map for SDXL + size_gb: 2.5 + essential: false + category: controlnet + format: safetensors + vram_gb: 3 + notes: Depth control for SDXL models + files: + - source: diffusion_pytorch_model.safetensors + dest: controlnet/controlnet-depth-sdxl-1.0.safetensors ipadapter_models: - - repo_id: h94/IP-Adapter - description: IP-Adapter SDXL Base - Style & Composition - size_gb: 1.3 - essential: true - category: ipadapter - type: ipadapter - format: safetensors - vram_gb: 4 - notes: Basic IP-Adapter for SDXL - files: - - source: "sdxl_models/ip-adapter_sdxl.safetensors" - dest: "ip-adapter_sdxl.safetensors" - - - repo_id: h94/IP-Adapter - description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H - size_gb: 0.9 - essential: true - category: ipadapter - type: ipadapter - format: safetensors - vram_gb: 4 - notes: IP-Adapter for SDXL with VIT-H CLIP vision model - files: - - source: "sdxl_models/ip-adapter_sdxl_vit-h.safetensors" - dest: "ip-adapter_sdxl_vit-h.safetensors" - - - repo_id: h94/IP-Adapter - description: IP-Adapter SDXL Plus - High Strength Composition - size_gb: 0.9 - essential: false - category: ipadapter - type: ipadapter - format: safetensors - vram_gb: 4 - notes: Enhanced composition control with higher strength - files: - - source: "sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors" - dest: "ip-adapter-plus_sdxl_vit-h.safetensors" - - - repo_id: h94/IP-Adapter - description: IP-Adapter SDXL Plus Face - Face-focused generation - size_gb: 0.5 - essential: false - category: ipadapter - type: ipadapter - format: safetensors - vram_gb: 4 - notes: Specialized for face transfer and portrait generation - files: - - source: "sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors" - dest: "ip-adapter-plus-face_sdxl_vit-h.safetensors" - - # ========================================================================== - # DIFFRHYTHM MODELS (Full-length song generation) - # ========================================================================== + - repo_id: h94/IP-Adapter + description: IP-Adapter SDXL Base - Style & Composition + size_gb: 1.3 + essential: true + category: ipadapter + format: safetensors + vram_gb: 4 + notes: Basic IP-Adapter for SDXL + files: + - source: sdxl_models/ip-adapter_sdxl.safetensors + dest: ipadapter/ip-adapter_sdxl.safetensors + - repo_id: h94/IP-Adapter + description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H + size_gb: 0.9 + essential: true + category: ipadapter + format: safetensors + vram_gb: 4 + notes: IP-Adapter for SDXL with VIT-H CLIP vision model + files: + - source: sdxl_models/ip-adapter_sdxl_vit-h.safetensors + dest: ipadapter/ip-adapter_sdxl_vit-h.safetensors + - repo_id: h94/IP-Adapter + description: IP-Adapter SDXL Plus - High Strength Composition + size_gb: 0.9 + essential: false + category: ipadapter + format: safetensors + vram_gb: 4 + notes: Enhanced composition control with higher strength + files: + - source: sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors + dest: ipadapter/ip-adapter-plus_sdxl_vit-h.safetensors + - repo_id: h94/IP-Adapter + description: IP-Adapter SDXL Plus Face - Face-focused generation + size_gb: 0.5 + essential: false + category: ipadapter + format: safetensors + vram_gb: 4 + notes: Specialized for face transfer and portrait generation + files: + - source: sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors + dest: ipadapter/ip-adapter-plus-face_sdxl_vit-h.safetensors diffrhythm_models: - - repo_id: ASLP-lab/DiffRhythm-1_2 - description: DiffRhythm 1.2 - 95 second generation model - size_gb: 2 - essential: true - category: diffrhythm - type: TTS/DiffRhythm - format: pt - vram_gb: 12 - duration_seconds: 95 - notes: Latest 95-second generation model - files: - - source: "cfm_model.pt" - dest: "cfm_model_v1_2.pt" - - - repo_id: ASLP-lab/DiffRhythm-full - description: DiffRhythm Full - 4m45s full-length generation - size_gb: 2 - essential: false - category: diffrhythm - type: TTS/DiffRhythm - format: pt - vram_gb: 16 - duration_seconds: 285 - notes: Full-length 4 minute 45 second music generation - files: - - source: "cfm_model.pt" - dest: "cfm_full_model.pt" - - - repo_id: ASLP-lab/DiffRhythm-base - description: DiffRhythm Base - 95 second base model - size_gb: 2 - essential: false - category: diffrhythm - type: TTS/DiffRhythm - format: pt - vram_gb: 12 - duration_seconds: 95 - notes: Base 95-second model - files: - - source: "cfm_model.pt" - dest: "cfm_model.pt" - - - repo_id: ASLP-lab/DiffRhythm-vae - description: DiffRhythm VAE - Variational autoencoder - size_gb: 1 - essential: true - category: diffrhythm - type: TTS/DiffRhythm - format: pt - vram_gb: 2 - notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License) - files: - - source: "vae_model.pt" - dest: "vae_model.pt" - - - repo_id: OpenMuQ/MuQ-MuLan-large - description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters) - size_gb: 3 - essential: true - category: diffrhythm - type: TTS/DiffRhythm/MuQ-MuLan-large - format: bin - vram_gb: 4 - notes: Music-text joint embedding for semantic understanding (English/Chinese) - files: - - source: "config.json" - dest: "config.json" - - source: "pytorch_model.bin" - dest: "pytorch_model.bin" - - - repo_id: OpenMuQ/MuQ-large-msd-iter - description: MuQ-large-msd-iter - Music representation learning (~300M parameters) - size_gb: 1.2 - essential: true - category: diffrhythm - type: TTS/DiffRhythm/MuQ-large-msd-iter - format: safetensors - vram_gb: 2 - notes: Music representation model trained on Million Song Dataset - files: - - source: "config.json" - dest: "config.json" - - source: "model.safetensors" - dest: "model.safetensors" - - - repo_id: FacebookAI/xlm-roberta-base - description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params) - size_gb: 1.1 - essential: true - category: diffrhythm - type: TTS/DiffRhythm/xlm-roberta-base - format: safetensors - vram_gb: 1 - notes: Multilingual text encoding for 100 languages - files: - - source: "config.json" - dest: "config.json" - - source: "model.safetensors" - dest: "model.safetensors" - - source: "sentencepiece.bpe.model" - dest: "sentencepiece.bpe.model" - - source: "tokenizer.json" - dest: "tokenizer.json" - - source: "tokenizer_config.json" - dest: "tokenizer_config.json" - -# ============================================================================ -# STORAGE & VRAM SUMMARIES -# ============================================================================ - + - repo_id: ASLP-lab/DiffRhythm-1_2 + description: DiffRhythm 1.2 - 95 second generation model + size_gb: 2 + essential: true + category: diffrhythm + format: pt + vram_gb: 12 + duration_seconds: 95 + notes: Latest 95-second generation model + files: + - source: cfm_model.pt + dest: TTS/DiffRhythm/cfm_model_v1_2.pt + - repo_id: ASLP-lab/DiffRhythm-full + description: DiffRhythm Full - 4m45s full-length generation + size_gb: 2 + essential: false + category: diffrhythm + format: pt + vram_gb: 16 + duration_seconds: 285 + notes: Full-length 4 minute 45 second music generation + files: + - source: cfm_model.pt + dest: TTS/DiffRhythm/cfm_full_model.pt + - repo_id: ASLP-lab/DiffRhythm-base + description: DiffRhythm Base - 95 second base model + size_gb: 2 + essential: false + category: diffrhythm + format: pt + vram_gb: 12 + duration_seconds: 95 + notes: Base 95-second model + files: + - source: cfm_model.pt + dest: TTS/DiffRhythm/cfm_model.pt + - repo_id: ASLP-lab/DiffRhythm-vae + description: DiffRhythm VAE - Variational autoencoder + size_gb: 1 + essential: true + category: diffrhythm + format: pt + vram_gb: 2 + notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community + License) + files: + - source: vae_model.pt + dest: TTS/DiffRhythm/vae_model.pt + - repo_id: OpenMuQ/MuQ-MuLan-large + description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters) + size_gb: 3 + essential: true + category: diffrhythm + format: bin + vram_gb: 4 + notes: Music-text joint embedding for semantic understanding (English/Chinese) + files: + - source: config.json + dest: TTS/DiffRhythm/MuQ-MuLan-large/config.json + - source: pytorch_model.bin + dest: TTS/DiffRhythm/MuQ-MuLan-large/pytorch_model.bin + - repo_id: OpenMuQ/MuQ-large-msd-iter + description: MuQ-large-msd-iter - Music representation learning (~300M parameters) + size_gb: 1.2 + essential: true + category: diffrhythm + format: safetensors + vram_gb: 2 + notes: Music representation model trained on Million Song Dataset + files: + - source: config.json + dest: TTS/DiffRhythm/MuQ-large-msd-iter/config.json + - source: model.safetensors + dest: TTS/DiffRhythm/MuQ-large-msd-iter/model.safetensors + - repo_id: FacebookAI/xlm-roberta-base + description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B + params) + size_gb: 1.1 + essential: true + category: diffrhythm + format: safetensors + vram_gb: 1 + notes: Multilingual text encoding for 100 languages + files: + - source: config.json + dest: TTS/DiffRhythm/xlm-roberta-base/config.json + - source: model.safetensors + dest: TTS/DiffRhythm/xlm-roberta-base/model.safetensors + - source: sentencepiece.bpe.model + dest: TTS/DiffRhythm/xlm-roberta-base/sentencepiece.bpe.model + - source: tokenizer.json + dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer.json + - source: tokenizer_config.json + dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer_config.json storage_requirements: essential_only: - image: 30 # FLUX Schnell + SDXL Base - video: 28 # CogVideoX + SVD - audio: 11 # MusicGen Medium - support: 11 # All 3 CLIP models - diffrhythm: 10 # DiffRhythm essential models - total: 90 # Total essential storage - + image: 30 + video: 28 + audio: 11 + support: 11 + diffrhythm: 10 + total: 90 all_models: - image: 54 # All image models - video: 36 # All video models - audio: 36 # All audio models - support: 11 # All support models - diffrhythm: 12 # All DiffRhythm models - total: 149 # Total with optional models - + image: 54 + video: 36 + audio: 36 + support: 11 + diffrhythm: 12 + total: 149 vram_requirements: - # For 24GB GPU (RTX 4090) simultaneous_loadable: - - name: Image Focus - FLUX FP16 - models: [FLUX.1 Schnell] - vram_used: 23 - remaining: 1 - - - name: Image Focus - FLUX FP8 + SDXL - models: [FLUX.1 Schnell FP8, SDXL Base] - vram_used: 24 - remaining: 0 - - - name: Video Generation - models: [CogVideoX-5B optimized, SDXL] - vram_used: 24 - remaining: 0 - - - name: Multi-Modal - models: [SDXL, MusicGen Medium] - vram_used: 20 - remaining: 4 - -# ============================================================================ -# INSTALLATION PROFILES -# ============================================================================ - + - name: Image Focus - FLUX FP16 + models: + - FLUX.1 Schnell + vram_used: 23 + remaining: 1 + - name: Image Focus - FLUX FP8 + SDXL + models: + - FLUX.1 Schnell FP8 + - SDXL Base + vram_used: 24 + remaining: 0 + - name: Video Generation + models: + - CogVideoX-5B optimized + - SDXL + vram_used: 24 + remaining: 0 + - name: Multi-Modal + models: + - SDXL + - MusicGen Medium + vram_used: 20 + remaining: 4 installation_profiles: minimal: description: Minimal setup for testing - categories: [support_models] + categories: + - support_models storage_gb: 11 estimated_time: 5-10 minutes - essential: description: Essential models only (~80GB) - categories: [image_models, video_models, audio_models, support_models] + categories: + - image_models + - video_models + - audio_models + - support_models essential_only: true storage_gb: 80 estimated_time: 1-2 hours - image_focused: description: All image generation models - categories: [image_models, support_models] + categories: + - image_models + - support_models storage_gb: 65 estimated_time: 45-90 minutes - video_focused: description: All video generation models - categories: [video_models, image_models, support_models] + categories: + - video_models + - image_models + - support_models essential_only: true storage_gb: 69 estimated_time: 1-2 hours - complete: description: All models (including optional) - categories: [image_models, video_models, audio_models, support_models] + categories: + - image_models + - video_models + - audio_models + - support_models storage_gb: 137 estimated_time: 2-4 hours - -# ============================================================================ -# METADATA -# ============================================================================ - metadata: version: 1.0.0 last_updated: 2025-11-21 compatible_with: - - ComfyUI >= 0.1.0 - - Python >= 3.10 - - HuggingFace Hub >= 0.20.0 + - ComfyUI >= 0.1.0 + - Python >= 3.10 + - HuggingFace Hub >= 0.20.0 maintainer: Valknar repository: https://github.com/yourusername/runpod