From 2189697734c46fa0a7b5ce7f1e614d7f1d4873ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= <valknar@pivoine.art>
Date: Tue, 25 Nov 2025 19:19:42 +0100
Subject: [PATCH] refactor: remove type field from models_huggingface.yaml and
 include type in dest paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Prepended ComfyUI model type folder (checkpoints/, clip/, vae/, etc.) to all dest paths
- Removed separate 'type' field from all model entries
- Consolidated duplicate per-repo entries into one entry per repository (SD3.5: 5 → 1; likewise HunyuanVideo, HunyuanVideo 1.5 and Wan2.2)
- Simplified model configuration by embedding directory structure directly in destination paths

This change eliminates the need to parse the 'type' field separately in artifact_huggingface_download.sh,
making the configuration more explicit and easier to understand.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 models_huggingface.yaml | 1881 ++++++++++++++-------------------------
 1 file changed, 667 insertions(+), 1214 deletions(-)

diff --git a/models_huggingface.yaml b/models_huggingface.yaml
index ff9d638..a8011fc 100644
--- a/models_huggingface.yaml
+++ b/models_huggingface.yaml
@@ -1,1262 +1,715 @@
-# ============================================================================
-# ComfyUI Model Configuration
-# ============================================================================
-#
-# This configuration file defines all available ComfyUI models for download.
-# Models are organized by category: image, video, audio, and support models.
-#
-# Each model entry contains:
-#   - repo_id: HuggingFace repository identifier
-#   - description: Human-readable description
-#   - size_gb: Approximate size in gigabytes
-#   - essential: Whether this is an essential model (true/false)
-#   - category: Model category (image/video/audio/support)
-#
-# ============================================================================
-
-# Global settings
 settings:
   cache_dir: /workspace/huggingface_cache
   parallel_downloads: 1
   retry_attempts: 3
   timeout_seconds: 3600
-
-# Model categories
 model_categories:
-  # ==========================================================================
-  # IMAGE GENERATION MODELS
-  # ==========================================================================
   image_models:
-    - repo_id: black-forest-labs/FLUX.1-schnell
-      description: FLUX.1 Schnell - Fast 4-step inference
-      size_gb: 23
-      essential: true
-      category: image
-      type: unet
-      format: fp16
-      vram_gb: 23
-      notes: Industry-leading image generation quality
-      files:
-        - source: "flux1-schnell.safetensors"
-          dest: "flux1-schnell.safetensors"
-
-    - repo_id: black-forest-labs/FLUX.1-dev
-      description: FLUX.1 Dev - Balanced quality/speed
-      size_gb: 23
-      essential: false
-      category: image
-      type: unet
-      format: fp16
-      vram_gb: 23
-      notes: Development version with enhanced features
-      files:
-        - source: "flux1-dev.safetensors"
-          dest: "flux1-dev.safetensors"
-
-    - repo_id: runwayml/stable-diffusion-v1-5
-      description: SD 1.5 - For AnimateDiff
-      size_gb: 4
-      essential: true
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 8
-      notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
-      files:
-        - source: "v1-5-pruned-emaonly.safetensors"
-          dest: "v1-5-pruned-emaonly.safetensors"
-
-    - repo_id: stabilityai/stable-diffusion-xl-base-1.0
-      description: SDXL Base 1.0 - Industry standard
-      size_gb: 7
-      essential: true
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 12
-      notes: Most widely used Stable Diffusion model
-      files:
-        - source: "sd_xl_base_1.0.safetensors"
-          dest: "sd_xl_base_1.0.safetensors"
-
-    - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
-      description: SDXL Refiner 1.0 - Enhances base output
-      size_gb: 6
-      essential: false
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 12
-      notes: Use after SDXL base for improved details
-      files:
-        - source: "sd_xl_refiner_1.0.safetensors"
-          dest: "sd_xl_refiner_1.0.safetensors"
-
-    - repo_id: stabilityai/stable-diffusion-3.5-large
-      description: SD 3.5 Large - Latest Stability AI
-      size_gb: 18
-      essential: false
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 20
-      notes: Newest generation Stable Diffusion
-      files:
-        - source: "sd3.5_large.safetensors"
-          dest: "sd3.5_large.safetensors"
-
-    - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl
-      description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects
-      size_gb: 7
-      essential: false
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 12
-      notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious quality
-      files:
-        - source: "*.safetensors"
-          dest: "diving-illustrious-real-asian-v50-sdxl.safetensors"
-
-    - repo_id: playgroundai/playground-v2.5-1024px-aesthetic
-      description: Playground v2.5 - 1024px aesthetic images
-      size_gb: 7
-      essential: false
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 12
-      notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user studies
-      files:
-        - source: "*.safetensors"
-          dest: "playground-v2.5-1024px-aesthetic.safetensors"
-
-    - repo_id: Lykon/dreamshaper-8
-      description: DreamShaper 8 - Multi-style versatile model
-      size_gb: 4
-      essential: false
-      category: image
-      type: checkpoints
-      format: fp16
-      vram_gb: 8
-      notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with strong LoRA support
-      files:
-        - source: "*.safetensors"
-          dest: "dreamshaper-8.safetensors"
-
-  # ==========================================================================
-  # VIDEO GENERATION MODELS
-  # ==========================================================================
+  - repo_id: black-forest-labs/FLUX.1-schnell
+    description: FLUX.1 Schnell - Fast 4-step inference
+    size_gb: 23
+    essential: true
+    category: image
+    format: fp16
+    vram_gb: 23
+    notes: Industry-leading image generation quality
+    files:
+    - source: flux1-schnell.safetensors
+      dest: unet/flux1-schnell.safetensors
+  - repo_id: black-forest-labs/FLUX.1-dev
+    description: FLUX.1 Dev - Balanced quality/speed
+    size_gb: 23
+    essential: false
+    category: image
+    format: fp16
+    vram_gb: 23
+    notes: Development version with enhanced features
+    files:
+    - source: flux1-dev.safetensors
+      dest: unet/flux1-dev.safetensors
+  - repo_id: runwayml/stable-diffusion-v1-5
+    description: SD 1.5 - For AnimateDiff
+    size_gb: 4
+    essential: true
+    category: image
+    format: fp16
+    vram_gb: 8
+    notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
+    files:
+    - source: v1-5-pruned-emaonly.safetensors
+      dest: checkpoints/v1-5-pruned-emaonly.safetensors
+  - repo_id: stabilityai/stable-diffusion-xl-base-1.0
+    description: SDXL Base 1.0 - Industry standard
+    size_gb: 7
+    essential: true
+    category: image
+    format: fp16
+    vram_gb: 12
+    notes: Most widely used Stable Diffusion model
+    files:
+    - source: sd_xl_base_1.0.safetensors
+      dest: checkpoints/sd_xl_base_1.0.safetensors
+  - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
+    description: SDXL Refiner 1.0 - Enhances base output
+    size_gb: 6
+    essential: false
+    category: image
+    format: fp16
+    vram_gb: 12
+    notes: Use after SDXL base for improved details
+    files:
+    - source: sd_xl_refiner_1.0.safetensors
+      dest: checkpoints/sd_xl_refiner_1.0.safetensors
+  - repo_id: stabilityai/stable-diffusion-3.5-large
+    description: SD 3.5 Large Complete - Checkpoint and text encoders
+    size_gb: 31
+    essential: false
+    category: image
+    format: mixed
+    vram_gb: 20
+    notes: Complete SD3.5 Large model with checkpoint and all text encoders (CLIP-L,
+      CLIP-G, T5-XXL)
+    files:  # dest starts with the ComfyUI model-type folder; encoders do not belong in checkpoints/
+    - source: sd3.5_large.safetensors
+      dest: checkpoints/sd3.5_large.safetensors
+    - source: text_encoders/clip_l.safetensors
+      dest: text_encoders/clip_l.safetensors
+    - source: text_encoders/clip_g.safetensors
+      dest: text_encoders/clip_g.safetensors
+    - source: text_encoders/t5xxl_fp16.safetensors
+      dest: text_encoders/t5xxl_fp16.safetensors
+  - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl
+    description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects
+    size_gb: 7
+    essential: false
+    category: image
+    format: fp16
+    vram_gb: 12
+    notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious
+      quality
+    files:
+    - source: unet/diffusion_pytorch_model.safetensors
+      dest: checkpoints/diving-illustrious-real-asian-v50-sdxl.safetensors
+  - repo_id: playgroundai/playground-v2.5-1024px-aesthetic
+    description: Playground v2.5 - 1024px aesthetic images
+    size_gb: 7
+    essential: false
+    category: image
+    format: fp16
+    vram_gb: 12
+    notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user
+      studies
+    files:
+    - source: playground-v2.5-1024px-aesthetic.fp16.safetensors
+      dest: checkpoints/playground-v2.5-1024px-aesthetic.safetensors
+  - repo_id: Lykon/dreamshaper-8
+    description: DreamShaper 8 - Multi-style versatile model
+    size_gb: 4
+    essential: false
+    category: image
+    format: fp16
+    vram_gb: 8
+    notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with
+      strong LoRA support
+    files:
+    - source: unet/diffusion_pytorch_model.fp16.safetensors
+      dest: checkpoints/dreamshaper-8.safetensors
   video_models:
-    - repo_id: THUDM/CogVideoX-5b
-      description: CogVideoX-5B - Professional text-to-video
-      size_gb: 20
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 20
-      frames: 49
-      resolution: 720p
-      notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
-      files:
-        - source: "transformer/diffusion_pytorch_model-00001-of-00002.safetensors"
-          dest: "cogvideox-5b-transformer-00001-of-00002.safetensors"
-        - source: "transformer/diffusion_pytorch_model-00002-of-00002.safetensors"
-          dest: "cogvideox-5b-transformer-00002-of-00002.safetensors"
-        - source: "transformer/diffusion_pytorch_model.safetensors.index.json"
-          dest: "cogvideox-5b-transformer.safetensors.index.json"
-
-    - repo_id: THUDM/CogVideoX-5b-I2V
-      description: CogVideoX-5B-I2V - Image-to-video generation
-      size_gb: 20
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 20
-      frames: 49
-      resolution: 720p
-      notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
-      files:
-        - source: "transformer/diffusion_pytorch_model-00001-of-00003.safetensors"
-          dest: "cogvideox-5b-i2v-transformer-00001-of-00003.safetensors"
-        - source: "transformer/diffusion_pytorch_model-00002-of-00003.safetensors"
-          dest: "cogvideox-5b-i2v-transformer-00002-of-00003.safetensors"
-        - source: "transformer/diffusion_pytorch_model-00003-of-00003.safetensors"
-          dest: "cogvideox-5b-i2v-transformer-00003-of-00003.safetensors"
-        - source: "transformer/diffusion_pytorch_model.safetensors.index.json"
-          dest: "cogvideox-5b-i2v-transformer.safetensors.index.json"
-
-    - repo_id: stabilityai/stable-video-diffusion-img2vid
-      description: SVD - 14 frame image-to-video
-      size_gb: 8
-      essential: true
-      category: video
-      type: checkpoints
-      format: fp16
-      vram_gb: 20
-      frames: 14
-      resolution: 576x1024
-      notes: Convert images to short video clips
-      files:
-        - source: "svd.safetensors"
-          dest: "svd.safetensors"
-
-    - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
-      description: SVD-XT - 25 frame image-to-video
-      size_gb: 8
-      essential: false
-      category: video
-      type: checkpoints
-      format: fp16
-      vram_gb: 20
-      frames: 25
-      resolution: 576x1024
-      notes: Extended frame count version
-      files:
-        - source: "svd_xt.safetensors"
-          dest: "svd_xt.safetensors"
-
-    # HunyuanVideo - Original (720p, T2V/I2V)
-    - repo_id: Comfy-Org/HunyuanVideo_repackaged
-      description: HunyuanVideo T2V - 720p text-to-video with MLLM encoders
-      size_gb: 20
-      essential: true
-      category: video
-      type: diffusion_models
-      format: bf16
-      vram_gb: 24
-      frames: 129
-      resolution: 720p
-      notes: 5-second T2V generation with Chinese/English support, DiT architecture with 3D VAE
-      files:
-        - source: "split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors"
-          dest: "hunyuan_video_t2v_720p_bf16.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_repackaged
-      description: HunyuanVideo I2V v1 - 720p image-to-video (concat method)
-      size_gb: 20
-      essential: true
-      category: video
-      type: diffusion_models
-      format: bf16
-      vram_gb: 24
-      frames: 129
-      resolution: 720p
-      notes: Static image to video with concat conditioning, better motion fluidity
-      files:
-        - source: "split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors"
-          dest: "hunyuan_video_image_to_video_720p_bf16.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_repackaged
-      description: HunyuanVideo I2V v2 - 720p image-to-video (replace method)
-      size_gb: 20
-      essential: true
-      category: video
-      type: diffusion_models
-      format: bf16
-      vram_gb: 24
-      frames: 129
-      resolution: 720p
-      notes: Updated I2V with replace conditioning, better image guidance adherence
-      files:
-        - source: "split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors"
-          dest: "hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors"
-
-    # HunyuanVideo 1.5 - Latest generation (720p/1080p, T2V/I2V)
-    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
-      description: HunyuanVideo 1.5 T2V - 720p text-to-video (8.3B parameters)
-      size_gb: 18
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 24
-      frames: 129-257
-      resolution: 720p
-      notes: 5-10 second T2V with Qwen 2.5 VL encoder, requires 24GB VRAM
-      files:
-        - source: "hunyuanvideo1.5_720p_t2v_fp16.safetensors"
-          dest: "hunyuanvideo1.5_720p_t2v_fp16.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
-      description: HunyuanVideo 1.5 SR - 1080p super-resolution (distilled)
-      size_gb: 18
-      essential: false
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 24
-      frames: 129-257
-      resolution: 1080p
-      notes: Upscales 720p to 1080p with distilled model for faster generation
-      files:
-        - source: "hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors"
-          dest: "hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors"
-
-    # Wan2.2 5B - Hybrid text+image to video (low VRAM)
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 TI2V 5B - Hybrid text+image to video (8GB VRAM)
-      size_gb: 10
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 8
-      frames: 81
-      resolution: 640x640
-      notes: Efficient 5B model with native offloading, dual-expert architecture
-      files:
-        - source: "wan2.2_ti2v_5B_fp16.safetensors"
-          dest: "wan2.2_ti2v_5B_fp16.safetensors"
-
-    # Wan2.2 14B T2V - Dual-expert text-to-video
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 T2V High Noise 14B - Text-to-video high noise expert (FP8)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Dual-expert T2V high noise denoising, FP8 quantized for 24GB GPU
-      files:
-        - source: "wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 T2V Low Noise 14B - Text-to-video low noise expert (FP8)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Dual-expert T2V low noise refinement, FP8 quantized for 24GB GPU
-      files:
-        - source: "wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors"
-
-    # Wan2.2 14B I2V - Image-to-video with content consistency
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 I2V High Noise 14B - Image-to-video high noise expert (FP16)
-      size_gb: 28
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Dual-expert I2V high noise denoising with content consistency
-      files:
-        - source: "wan2.2_i2v_high_noise_14B_fp16.safetensors"
-          dest: "wan2.2_i2v_high_noise_14B_fp16.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 I2V Low Noise 14B - Image-to-video low noise expert (FP16)
-      size_gb: 28
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp16
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Dual-expert I2V low noise refinement with content consistency
-      files:
-        - source: "wan2.2_i2v_low_noise_14B_fp16.safetensors"
-          dest: "wan2.2_i2v_low_noise_14B_fp16.safetensors"
-
-    # Wan2.2 14B Animate - Video-to-video character animation
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Animate 14B - Video-to-video character animation (BF16)
-      size_gb: 28
-      essential: true
-      category: video
-      type: diffusion_models
-      format: bf16
-      vram_gb: 24
-      frames: 81
-      resolution: multiples of 16
-      notes: V2V animation with Mix/Move modes, requires CLIP Vision H for reference image
-      files:
-        - source: "wan2.2_animate_14B_bf16.safetensors"
-          dest: "wan2.2_animate_14B_bf16.safetensors"
-
-    # Wan2.2 14B S2V - Sound-to-video synchronization
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 S2V 14B - Sound-to-video with audio sync (FP8)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Transforms static images + audio into synchronized videos, uses Wav2Vec2 audio encoder
-      files:
-        - source: "wan2.2_s2v_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_s2v_14B_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 S2V 14B - Sound-to-video with audio sync (BF16 quality)
-      size_gb: 28
-      essential: false
-      category: video
-      type: diffusion_models
-      format: bf16
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Higher quality BF16 version of S2V for better output quality
-      files:
-        - source: "wan2.2_s2v_14B_bf16.safetensors"
-          dest: "wan2.2_s2v_14B_bf16.safetensors"
-
-    # Wan2.2 14B Fun Inpaint - Start-end frame controlled generation
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Fun Inpaint High Noise 14B - Start-end frame transition (FP8)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Generates transition between start and end frames with high noise denoising
-      files:
-        - source: "wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Fun Inpaint Low Noise 14B - Start-end frame transition (FP8)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: Generates transition between start and end frames with low noise refinement
-      files:
-        - source: "wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors"
-
-    # Wan2.2 14B Fun Control - ControlNet-style conditioning
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Fun Control High Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: I2V with control conditions (Canny, Depth, OpenPose, MLSD, trajectory), requires controlnet_aux
-      files:
-        - source: "wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Fun Control Low Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: I2V with control conditions low noise refinement
-      files:
-        - source: "wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors"
-
-    # Wan2.2 14B Fun Camera - Camera motion control
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Fun Camera High Noise 14B - Camera motion control (pan/zoom/static)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: I2V with camera motion control (pan, zoom, static), 108s with LoRA / 536s without
-      files:
-        - source: "wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 Fun Camera Low Noise 14B - Camera motion control (pan/zoom/static)
-      size_gb: 14
-      essential: true
-      category: video
-      type: diffusion_models
-      format: fp8_scaled
-      vram_gb: 24
-      frames: 81
-      resolution: 640x640
-      notes: I2V with camera motion control low noise refinement
-      files:
-        - source: "wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors"
-          dest: "wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors"
-
-  # ==========================================================================
-  # AUDIO GENERATION MODELS
-  # ==========================================================================
+  - repo_id: THUDM/CogVideoX-5b
+    description: CogVideoX-5B - Professional text-to-video
+    size_gb: 20
+    essential: true
+    category: video
+    format: fp16
+    vram_gb: 20
+    frames: 49
+    resolution: 720p
+    notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel
+      node
+    files:
+    - source: transformer/diffusion_pytorch_model-00001-of-00002.safetensors
+      dest: diffusion_models/cogvideox-5b-transformer-00001-of-00002.safetensors
+    - source: transformer/diffusion_pytorch_model-00002-of-00002.safetensors
+      dest: diffusion_models/cogvideox-5b-transformer-00002-of-00002.safetensors
+    - source: transformer/diffusion_pytorch_model.safetensors.index.json
+      dest: diffusion_models/cogvideox-5b-transformer.safetensors.index.json
+  - repo_id: THUDM/CogVideoX-5b-I2V
+    description: CogVideoX-5B-I2V - Image-to-video generation
+    size_gb: 20
+    essential: true
+    category: video
+    format: fp16
+    vram_gb: 20
+    frames: 49
+    resolution: 720p
+    notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
+    files:
+    - source: transformer/diffusion_pytorch_model-00001-of-00003.safetensors
+      dest: diffusion_models/cogvideox-5b-i2v-transformer-00001-of-00003.safetensors
+    - source: transformer/diffusion_pytorch_model-00002-of-00003.safetensors
+      dest: diffusion_models/cogvideox-5b-i2v-transformer-00002-of-00003.safetensors
+    - source: transformer/diffusion_pytorch_model-00003-of-00003.safetensors
+      dest: diffusion_models/cogvideox-5b-i2v-transformer-00003-of-00003.safetensors
+    - source: transformer/diffusion_pytorch_model.safetensors.index.json
+      dest: diffusion_models/cogvideox-5b-i2v-transformer.safetensors.index.json
+  - repo_id: stabilityai/stable-video-diffusion-img2vid
+    description: SVD - 14 frame image-to-video
+    size_gb: 8
+    essential: true
+    category: video
+    format: fp16
+    vram_gb: 20
+    frames: 14
+    resolution: 576x1024
+    notes: Convert images to short video clips
+    files:
+    - source: svd.safetensors
+      dest: checkpoints/svd.safetensors
+  - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
+    description: SVD-XT - 25 frame image-to-video
+    size_gb: 8
+    essential: false
+    category: video
+    format: fp16
+    vram_gb: 20
+    frames: 25
+    resolution: 576x1024
+    notes: Extended frame count version
+    files:
+    - source: svd_xt.safetensors
+      dest: checkpoints/svd_xt.safetensors
+  - repo_id: Comfy-Org/HunyuanVideo_repackaged
+    description: HunyuanVideo Complete - 720p T2V/I2V models with VAE and encoders
+    size_gb: 51
+    essential: true
+    category: video
+    format: bf16
+    vram_gb: 24
+    frames: 129
+    resolution: 720p
+    notes: Complete HunyuanVideo family - T2V, I2V v1/v2, 3D VAE, LLaVA LLaMA3 text/vision
+      encoders
+    files:  # dest folder mirrors the split_files/<type>/ folder of each source
+    - source: split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors
+      dest: diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors
+    - source: split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors
+      dest: diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors
+    - source: split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors
+      dest: diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors
+    - source: split_files/vae/hunyuan_video_vae_bf16.safetensors
+      dest: vae/hunyuan_video_vae_bf16.safetensors
+    - source: split_files/text_encoders/llava_llama3_fp8_scaled.safetensors
+      dest: text_encoders/llava_llama3_fp8_scaled.safetensors
+    - source: split_files/clip_vision/llava_llama3_vision.safetensors
+      dest: clip_vision/llava_llama3_vision.safetensors
+  - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
+    description: HunyuanVideo 1.5 Complete - 720p/1080p T2V/SR with encoders
+    size_gb: 51.5
+    essential: true
+    category: video
+    format: fp16
+    vram_gb: 24
+    frames: 129-257
+    resolution: 720p-1080p
+    notes: Complete HunyuanVideo 1.5 - T2V 720p, SR 1080p, VAE, Qwen 2.5 VL, ByT5
+      GlyphXL encoders
+    files:  # only the T2V/SR weights are diffusion models; VAE and encoders get their own folders
+    - source: hunyuanvideo1.5_720p_t2v_fp16.safetensors
+      dest: diffusion_models/hunyuanvideo1.5_720p_t2v_fp16.safetensors
+    - source: hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors
+      dest: diffusion_models/hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors
+    - source: hunyuanvideo15_vae_fp16.safetensors
+      dest: vae/hunyuanvideo15_vae_fp16.safetensors
+    - source: qwen_2.5_vl_7b_fp8_scaled.safetensors
+      dest: text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors
+    - source: byt5_small_glyphxl_fp16.safetensors
+      dest: text_encoders/byt5_small_glyphxl_fp16.safetensors
+  - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
+    description: Wan2.2 Complete - All video models, VAEs, and LoRAs
+    size_gb: 220
+    essential: true
+    category: video
+    format: mixed
+    vram_gb: 24
+    frames: 81
+    resolution: 640x640
+    notes: Complete Wan2.2 model family - TI2V 5B, T2V 14B, I2V 14B, Animate, S2V,
+      Fun Inpaint/Control/Camera, VAEs, CLIP Vision H, Wav2Vec2, and LoRA accelerators
+    files:
+    - source: wan2.2_ti2v_5B_fp16.safetensors
+      dest: diffusion_models/wan2.2_ti2v_5B_fp16.safetensors
+    - source: wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_i2v_high_noise_14B_fp16.safetensors
+      dest: diffusion_models/wan2.2_i2v_high_noise_14B_fp16.safetensors
+    - source: wan2.2_i2v_low_noise_14B_fp16.safetensors
+      dest: diffusion_models/wan2.2_i2v_low_noise_14B_fp16.safetensors
+    - source: wan2.2_animate_14B_bf16.safetensors
+      dest: diffusion_models/wan2.2_animate_14B_bf16.safetensors
+    - source: wan2.2_s2v_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_s2v_14B_fp8_scaled.safetensors
+    - source: wan2.2_s2v_14B_bf16.safetensors
+      dest: diffusion_models/wan2.2_s2v_14B_bf16.safetensors
+    - source: wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors
+      dest: diffusion_models/wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors
+    - source: wan2.2_vae.safetensors
+      dest: diffusion_models/wan2.2_vae.safetensors
+    - source: wan_2.1_vae.safetensors
+      dest: diffusion_models/wan_2.1_vae.safetensors
+    - source: clip_vision_h.safetensors
+      dest: diffusion_models/clip_vision_h.safetensors
+    - source: wav2vec2_large_english_fp16.safetensors
+      dest: audio_models/wav2vec2_large_english_fp16.safetensors
+    - source: lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors
+      dest: loras/lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors
+    - source: wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors
+      dest: loras/wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors
+    - source: wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors
+      dest: loras/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors
+    - source: wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors
+      dest: loras/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors
   audio_models:
-    - repo_id: facebook/musicgen-small
-      description: MusicGen Small - Fast generation
-      size_gb: 3
-      essential: false
-      category: audio
-      type: musicgen
-      format: fp32
-      vram_gb: 4
-      duration_seconds: 30
-      notes: Fastest music generation, lower quality
-      files:
-        - source: "pytorch_model.bin"
-          dest: "musicgen-small-pytorch_model.bin"
-
-    - repo_id: facebook/musicgen-medium
-      description: MusicGen Medium - Balanced quality
-      size_gb: 11
-      essential: true
-      category: audio
-      type: musicgen
-      format: fp32
-      vram_gb: 8
-      duration_seconds: 30
-      notes: Best balance of speed and quality
-      files:
-        - source: "pytorch_model.bin"
-          dest: "musicgen-medium-pytorch_model.bin"
-
-    - repo_id: facebook/musicgen-large
-      description: MusicGen Large - Highest quality
-      size_gb: 22
-      essential: false
-      category: audio
-      type: musicgen
-      format: fp32
-      vram_gb: 16
-      duration_seconds: 30
-      notes: Best quality, slower generation
-      files:
-        - source: "pytorch_model-00001-of-00002.bin"
-          dest: "musicgen-large-pytorch_model-00001-of-00002.bin"
-        - source: "pytorch_model-00002-of-00002.bin"
-          dest: "musicgen-large-pytorch_model-00002-of-00002.bin"
-        - source: "pytorch_model.bin.index.json"
-          dest: "musicgen-large-pytorch_model.bin.index.json"
-
-    # ACE Step v1 3.5B - State-of-the-art music generation
-    - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
-      description: ACE Step v1 3.5B - Fast coherent music generation with 19-language support
-      size_gb: 7.7
-      essential: true
-      category: audio
-      type: checkpoints
-      format: safetensors
-      vram_gb: 16
-      duration_seconds: 240
-      notes: 15x faster than LLM baselines, superior structural coherence, voice cloning, 19-language lyrics
-      files:
-        - source: "all_in_one/ace_step_v1_3.5b.safetensors"
-          dest: "ace_step_v1_3.5b.safetensors"
-
-    # ACE Step Chinese RAP LoRA (optional)
-    - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
-      description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop genre
-      size_gb: 0.3
-      essential: false
-      category: audio
-      type: loras
-      format: safetensors
-      notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
-      files:
-        - source: "pytorch_lora_weights.safetensors"
-          dest: "ace-step-chinese-rap-lora.safetensors"
-
-  # ==========================================================================
-  # SUPPORT MODELS (CLIP, IP-Adapter, etc.)
-  # ==========================================================================
+  - repo_id: facebook/musicgen-small
+    description: MusicGen Small - Fast generation
+    size_gb: 3
+    essential: false
+    category: audio
+    format: fp32
+    vram_gb: 4
+    duration_seconds: 30
+    notes: Fastest music generation, lower quality
+    files:
+    - source: pytorch_model.bin
+      dest: musicgen/musicgen-small-pytorch_model.bin
+  - repo_id: facebook/musicgen-medium
+    description: MusicGen Medium - Balanced quality
+    size_gb: 11
+    essential: true
+    category: audio
+    format: fp32
+    vram_gb: 8
+    duration_seconds: 30
+    notes: Best balance of speed and quality
+    files:
+    - source: pytorch_model.bin
+      dest: musicgen/musicgen-medium-pytorch_model.bin
+  - repo_id: facebook/musicgen-large
+    description: MusicGen Large - Highest quality
+    size_gb: 22
+    essential: false
+    category: audio
+    format: fp32
+    vram_gb: 16
+    duration_seconds: 30
+    notes: Best quality, slower generation
+    files:
+    - source: pytorch_model-00001-of-00002.bin
+      dest: musicgen/musicgen-large-pytorch_model-00001-of-00002.bin
+    - source: pytorch_model-00002-of-00002.bin
+      dest: musicgen/musicgen-large-pytorch_model-00002-of-00002.bin
+    - source: pytorch_model.bin.index.json
+      dest: musicgen/musicgen-large-pytorch_model.bin.index.json
+  - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
+    description: ACE Step v1 3.5B - Fast coherent music generation with 19-language
+      support
+    size_gb: 7.7
+    essential: true
+    category: audio
+    format: safetensors
+    vram_gb: 16
+    duration_seconds: 240
+    notes: 15x faster than LLM baselines, superior structural coherence, voice cloning,
+      19-language lyrics
+    files:
+    - source: all_in_one/ace_step_v1_3.5b.safetensors
+      dest: checkpoints/ace_step_v1_3.5b.safetensors
+  - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
+    description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop
+      genre
+    size_gb: 0.3
+    essential: false
+    category: audio
+    format: safetensors
+    notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
+    files:
+    - source: pytorch_lora_weights.safetensors
+      dest: loras/ace-step-chinese-rap-lora.safetensors
   support_models:
-    - repo_id: openai/clip-vit-large-patch14
-      description: CLIP H - For SD 1.5 IP-Adapter
-      size_gb: 2
-      essential: true
-      category: support
-      type: clip_vision
-      format: fp32
-      vram_gb: 2
-      notes: Text-image understanding model for IP-Adapter
-      files:
-        - source: "model.safetensors"
-          dest: "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors"
-
-    - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
-      description: CLIP G - For SDXL IP-Adapter
-      size_gb: 7
-      essential: true
-      category: support
-      type: clip_vision
-      format: fp32
-      vram_gb: 4
-      notes: Larger CLIP model for SDXL IP-Adapter
-      files:
-        - source: "open_clip_model.safetensors"
-          dest: "CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors"
-
-    - repo_id: google/siglip-so400m-patch14-384
-      description: SigLIP - For FLUX models
-      size_gb: 2
-      essential: true
-      category: support
-      type: clip_vision
-      format: fp32
-      vram_gb: 2
-      notes: Advanced image-text alignment
-      files:
-        - source: "model.safetensors"
-          dest: "siglip-so400m-patch14-384.safetensors"
-
-    - repo_id: stabilityai/stable-diffusion-3.5-large
-      description: CLIP-L and T5-XXL - For FLUX text encoding
-      size_gb: 10
-      essential: true
-      category: support
-      type: clip
-      format: fp16
-      vram_gb: 4
-      notes: CLIP text encoders required for FLUX models
-      files:
-        - source: "text_encoders/clip_l.safetensors"
-          dest: "clip_l.safetensors"
-        - source: "text_encoders/t5xxl_fp16.safetensors"
-          dest: "t5xxl_fp16.safetensors"
-
-    - repo_id: black-forest-labs/FLUX.1-schnell
-      description: FLUX VAE - Autoencoder for FLUX models
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: vae
-      format: safetensors
-      vram_gb: 1
-      notes: VAE autoencoder required for FLUX image decoding
-      files:
-        - source: "ae.safetensors"
-          dest: "ae.safetensors"
-
-    - repo_id: ai-forever/Real-ESRGAN
-      description: RealESRGAN x2 - 2x upscaling model
-      size_gb: 0.06
-      essential: true
-      category: support
-      type: upscale_models
-      format: pth
-      vram_gb: 2
-      notes: Fast 2x upscaling model for general purpose enhancement
-      files:
-        - source: "RealESRGAN_x2.pth"
-          dest: "RealESRGAN_x2.pth"
-
-    - repo_id: ai-forever/Real-ESRGAN
-      description: RealESRGAN x4 - 4x upscaling model
-      size_gb: 0.06
-      essential: true
-      category: support
-      type: upscale_models
-      format: pth
-      vram_gb: 4
-      notes: High-quality 4x upscaling model for detail enhancement
-      files:
-        - source: "RealESRGAN_x4.pth"
-          dest: "RealESRGAN_x4.pth"
-
-    - repo_id: stabilityai/stable-diffusion-3.5-large
-      description: T5-XXL FP16 - For CogVideoX text encoding
-      size_gb: 9
-      essential: true
-      category: support
-      type: text_encoders
-      format: fp16
-      vram_gb: 4
-      notes: T5 text encoder required for CogVideoX models
-      files:
-        - source: "text_encoders/t5xxl_fp16.safetensors"
-          dest: "t5xxl_fp16.safetensors"
-
-    - repo_id: stabilityai/stable-diffusion-3.5-large
-      description: CLIP-L - For CogVideoX and SD3
-      size_gb: 1
-      essential: true
-      category: support
-      type: text_encoders
-      format: fp32
-      vram_gb: 1
-      notes: CLIP-L text encoder for CogVideoX and SD3 models
-      files:
-        - source: "text_encoders/clip_l.safetensors"
-          dest: "clip_l.safetensors"
-
-    - repo_id: stabilityai/stable-diffusion-3.5-large
-      description: CLIP-G - For SD3 models
-      size_gb: 3
-      essential: false
-      category: support
-      type: text_encoders
-      format: fp32
-      vram_gb: 2
-      notes: CLIP-G text encoder for SD3 models
-      files:
-        - source: "text_encoders/clip_g.safetensors"
-          dest: "clip_g.safetensors"
-
-    # HunyuanVideo Support Models
-    - repo_id: Comfy-Org/HunyuanVideo_repackaged
-      description: HunyuanVideo VAE - 3D VAE for video encoding/decoding (BF16)
-      size_gb: 1
-      essential: true
-      category: support
-      type: vae
-      format: bf16
-      vram_gb: 2
-      notes: 3D VAE autoencoder for HunyuanVideo models
-      files:
-        - source: "split_files/vae/hunyuan_video_vae_bf16.safetensors"
-          dest: "hunyuan_video_vae_bf16.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_repackaged
-      description: LLaVA LLaMA3 FP8 - Multimodal text encoder for HunyuanVideo
-      size_gb: 8
-      essential: true
-      category: support
-      type: text_encoders
-      format: fp8_scaled
-      vram_gb: 4
-      notes: LLaVA LLaMA3-based text encoder with FP8 quantization
-      files:
-        - source: "split_files/text_encoders/llava_llama3_fp8_scaled.safetensors"
-          dest: "llava_llama3_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_repackaged
-      description: LLaVA LLaMA3 Vision - Vision encoder for HunyuanVideo I2V
-      size_gb: 2
-      essential: true
-      category: support
-      type: clip_vision
-      format: safetensors
-      vram_gb: 2
-      notes: Vision encoder for image-to-video conditioning
-      files:
-        - source: "split_files/clip_vision/llava_llama3_vision.safetensors"
-          dest: "llava_llama3_vision.safetensors"
-
-    # HunyuanVideo 1.5 Support Models
-    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
-      description: HunyuanVideo 1.5 VAE - VAE for v1.5 models (FP16)
-      size_gb: 1
-      essential: true
-      category: support
-      type: vae
-      format: fp16
-      vram_gb: 2
-      notes: VAE autoencoder for HunyuanVideo 1.5
-      files:
-        - source: "hunyuanvideo15_vae_fp16.safetensors"
-          dest: "hunyuanvideo15_vae_fp16.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
-      description: Qwen 2.5 VL 7B FP8 - Vision-language encoder for HunyuanVideo 1.5
-      size_gb: 14
-      essential: true
-      category: support
-      type: text_encoders
-      format: fp8_scaled
-      vram_gb: 8
-      notes: Qwen 2.5 VL 7B text encoder with FP8 quantization
-      files:
-        - source: "qwen_2.5_vl_7b_fp8_scaled.safetensors"
-          dest: "qwen_2.5_vl_7b_fp8_scaled.safetensors"
-
-    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
-      description: ByT5 Small GlyphXL FP16 - Glyph-aware text encoder for HunyuanVideo 1.5
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: text_encoders
-      format: fp16
-      vram_gb: 1
-      notes: ByT5 small text encoder with glyph awareness
-      files:
-        - source: "byt5_small_glyphxl_fp16.safetensors"
-          dest: "byt5_small_glyphxl_fp16.safetensors"
-
-    # Wan2.2 Support Models
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan2.2 VAE - VAE for Wan2.2 5B models
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: vae
-      format: safetensors
-      vram_gb: 1
-      notes: VAE autoencoder for Wan2.2 5B TI2V model
-      files:
-        - source: "wan2.2_vae.safetensors"
-          dest: "wan2.2_vae.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wan 2.1 VAE - VAE for Wan2.2 14B models
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: vae
-      format: safetensors
-      vram_gb: 1
-      notes: VAE autoencoder for all Wan2.2 14B models (T2V, I2V, S2V, Animate, etc.)
-      files:
-        - source: "wan_2.1_vae.safetensors"
-          dest: "wan_2.1_vae.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged
-      description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models
-      size_gb: 10
-      essential: true
-      category: support
-      type: text_encoders
-      format: fp8_scaled
-      vram_gb: 5
-      notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized
-      files:
-        - source: "umt5_xxl_fp8_e4m3fn_scaled.safetensors"
-          dest: "umt5_xxl_fp8_e4m3fn_scaled.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: CLIP Vision H - Vision encoder for Wan2.2 Animate mode
-      size_gb: 4
-      essential: true
-      category: support
-      type: clip_vision
-      format: safetensors
-      vram_gb: 2
-      notes: CLIP Vision H for reference image in Wan2.2 Animate video-to-video
-      files:
-        - source: "clip_vision_h.safetensors"
-          dest: "clip_vision_h.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Wav2Vec2 Large English FP16 - Audio encoder for Wan2.2 S2V
-      size_gb: 1
-      essential: true
-      category: support
-      type: audio_models
-      format: fp16
-      vram_gb: 2
-      notes: Audio encoder for sound-to-video synchronization
-      files:
-        - source: "wav2vec2_large_english_fp16.safetensors"
-          dest: "wav2vec2_large_english_fp16.safetensors"
-
-    # Wan2.2 LoRA Accelerators (4-step distillation)
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Lightx2v I2V Animate LoRA - 4-step acceleration for Wan2.2 Animate
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: loras
-      format: bf16
-      vram_gb: 1
-      notes: 4-step LoRA for Wan2.2 Animate (480p, cfg distilled), 5x speedup
-      files:
-        - source: "lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors"
-          dest: "lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Lightx2v T2V High Noise LoRA - 4-step acceleration for Wan2.2 T2V high noise
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: loras
-      format: safetensors
-      vram_gb: 1
-      notes: 4-step LoRA for T2V high noise expert, v1.1
-      files:
-        - source: "wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors"
-          dest: "wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Lightx2v I2V High Noise LoRA - 4-step acceleration for Wan2.2 I2V high noise
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: loras
-      format: safetensors
-      vram_gb: 1
-      notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera high noise expert
-      files:
-        - source: "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors"
-          dest: "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors"
-
-    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
-      description: Lightx2v I2V Low Noise LoRA - 4-step acceleration for Wan2.2 I2V low noise
-      size_gb: 0.5
-      essential: true
-      category: support
-      type: loras
-      format: safetensors
-      vram_gb: 1
-      notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera low noise expert
-      files:
-        - source: "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors"
-          dest: "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors"
-
-  # ==========================================================================
-  # ANIMATEDIFF MODELS
-  # ==========================================================================
+  - repo_id: h94/IP-Adapter
+    description: CLIP H - For SD 1.5 IP-Adapter
+    size_gb: 2
+    essential: true
+    category: support
+    format: fp32
+    vram_gb: 2
+    notes: Text-image understanding model for IP-Adapter
+    files:
+    - source: models/image_encoder/model.safetensors
+      dest: clip_vision/CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors
+  - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
+    description: CLIP G - For SDXL IP-Adapter
+    size_gb: 7
+    essential: true
+    category: support
+    format: fp32
+    vram_gb: 4
+    notes: Larger CLIP model for SDXL IP-Adapter
+    files:
+    - source: open_clip_model.safetensors
+      dest: clip_vision/CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors
+  - repo_id: google/siglip-so400m-patch14-384
+    description: SigLIP - For FLUX models
+    size_gb: 2
+    essential: true
+    category: support
+    format: fp32
+    vram_gb: 2
+    notes: Advanced image-text alignment
+    files:
+    - source: model.safetensors
+      dest: clip_vision/siglip-so400m-patch14-384.safetensors
+  - repo_id: black-forest-labs/FLUX.1-schnell
+    description: FLUX VAE - Autoencoder for FLUX models
+    size_gb: 0.5
+    essential: true
+    category: support
+    format: safetensors
+    vram_gb: 1
+    notes: VAE autoencoder required for FLUX image decoding
+    files:
+    - source: ae.safetensors
+      dest: vae/ae.safetensors
+  - repo_id: ai-forever/Real-ESRGAN
+    description: RealESRGAN x2 - 2x upscaling model
+    size_gb: 0.06
+    essential: true
+    category: support
+    format: pth
+    vram_gb: 2
+    notes: Fast 2x upscaling model for general purpose enhancement
+    files:
+    - source: RealESRGAN_x2.pth
+      dest: upscale_models/RealESRGAN_x2.pth
+  - repo_id: ai-forever/Real-ESRGAN
+    description: RealESRGAN x4 - 4x upscaling model
+    size_gb: 0.06
+    essential: true
+    category: support
+    format: pth
+    vram_gb: 4
+    notes: High-quality 4x upscaling model for detail enhancement
+    files:
+    - source: RealESRGAN_x4.pth
+      dest: upscale_models/RealESRGAN_x4.pth
+  - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged
+    description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models
+    size_gb: 10
+    essential: true
+    category: support
+    format: fp8_scaled
+    vram_gb: 5
+    notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized
+    files:
+    - source: umt5_xxl_fp8_e4m3fn_scaled.safetensors
+      dest: text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors
   animatediff_models:
-    - repo_id: guoyww/animatediff
-      description: AnimateDiff Motion Modules
-      size_gb: 2
-      essential: true
-      category: animatediff
-      type: animatediff_models
-      filename: mm_sd_v15
-      format: safetensors
-      vram_gb: 4
-      notes: Motion modules for AnimateDiff text-to-video
-      files:
-        - source: "mm_sd_v15_v2.ckpt"
-          dest: "mm_sd_v15_v2.ckpt"
-
-  # ==========================================================================
-  # CONTROLNET MODELS
-  # ==========================================================================
+  - repo_id: guoyww/animatediff
+    description: AnimateDiff Motion Modules
+    size_gb: 2
+    essential: true
+    category: animatediff
+    filename: mm_sd_v15
+    format: safetensors
+    vram_gb: 4
+    notes: Motion modules for AnimateDiff text-to-video
+    files:
+    - source: mm_sd_v15_v2.ckpt
+      dest: animatediff_models/mm_sd_v15_v2.ckpt
   controlnet_models:
-    - repo_id: lllyasviel/control_v11p_sd15_canny
-      description: ControlNet Canny - Edge detection control for SD 1.5
-      size_gb: 1.5
-      essential: false
-      category: controlnet
-      type: controlnet
-      format: safetensors
-      vram_gb: 2
-      notes: Precise edge-based composition control
-      files:
-        - source: "diffusion_pytorch_model.safetensors"
-          dest: "control_v11p_sd15_canny.safetensors"
-
-    - repo_id: lllyasviel/control_v11f1p_sd15_depth
-      description: ControlNet Depth - Depth map control for SD 1.5
-      size_gb: 1.5
-      essential: false
-      category: controlnet
-      type: controlnet
-      format: safetensors
-      vram_gb: 2
-      notes: Depth-based spatial control
-      files:
-        - source: "diffusion_pytorch_model.safetensors"
-          dest: "control_v11p_sd15_depth.safetensors"
-
-    - repo_id: diffusers/controlnet-canny-sdxl-1.0
-      description: ControlNet Canny SDXL - Edge detection for SDXL
-      size_gb: 2.5
-      essential: false
-      category: controlnet
-      type: controlnet
-      format: safetensors
-      vram_gb: 3
-      notes: Canny edge control for SDXL models
-      files:
-        - source: "diffusion_pytorch_model.safetensors"
-          dest: "controlnet-canny-sdxl-1.0.safetensors"
-
-    - repo_id: diffusers/controlnet-depth-sdxl-1.0
-      description: ControlNet Depth SDXL - Depth map for SDXL
-      size_gb: 2.5
-      essential: false
-      category: controlnet
-      type: controlnet
-      format: safetensors
-      vram_gb: 3
-      notes: Depth control for SDXL models
-      files:
-        - source: "diffusion_pytorch_model.safetensors"
-          dest: "controlnet-depth-sdxl-1.0.safetensors"
-
-  # ==========================================================================
-  # IP-ADAPTER MODELS
-  # ==========================================================================
+  - repo_id: lllyasviel/control_v11p_sd15_canny
+    description: ControlNet Canny - Edge detection control for SD 1.5
+    size_gb: 1.5
+    essential: false
+    category: controlnet
+    format: safetensors
+    vram_gb: 2
+    notes: Precise edge-based composition control
+    files:
+    - source: diffusion_pytorch_model.safetensors
+      dest: controlnet/control_v11p_sd15_canny.safetensors
+  - repo_id: lllyasviel/control_v11f1p_sd15_depth
+    description: ControlNet Depth - Depth map control for SD 1.5
+    size_gb: 1.5
+    essential: false
+    category: controlnet
+    format: safetensors
+    vram_gb: 2
+    notes: Depth-based spatial control
+    files:
+    - source: diffusion_pytorch_model.safetensors
+      dest: controlnet/control_v11p_sd15_depth.safetensors
+  - repo_id: diffusers/controlnet-canny-sdxl-1.0
+    description: ControlNet Canny SDXL - Edge detection for SDXL
+    size_gb: 2.5
+    essential: false
+    category: controlnet
+    format: safetensors
+    vram_gb: 3
+    notes: Canny edge control for SDXL models
+    files:
+    - source: diffusion_pytorch_model.safetensors
+      dest: controlnet/controlnet-canny-sdxl-1.0.safetensors
+  - repo_id: diffusers/controlnet-depth-sdxl-1.0
+    description: ControlNet Depth SDXL - Depth map for SDXL
+    size_gb: 2.5
+    essential: false
+    category: controlnet
+    format: safetensors
+    vram_gb: 3
+    notes: Depth control for SDXL models
+    files:
+    - source: diffusion_pytorch_model.safetensors
+      dest: controlnet/controlnet-depth-sdxl-1.0.safetensors
   ipadapter_models:
-    - repo_id: h94/IP-Adapter
-      description: IP-Adapter SDXL Base - Style & Composition
-      size_gb: 1.3
-      essential: true
-      category: ipadapter
-      type: ipadapter
-      format: safetensors
-      vram_gb: 4
-      notes: Basic IP-Adapter for SDXL
-      files:
-        - source: "sdxl_models/ip-adapter_sdxl.safetensors"
-          dest: "ip-adapter_sdxl.safetensors"
-
-    - repo_id: h94/IP-Adapter
-      description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
-      size_gb: 0.9
-      essential: true
-      category: ipadapter
-      type: ipadapter
-      format: safetensors
-      vram_gb: 4
-      notes: IP-Adapter for SDXL with VIT-H CLIP vision model
-      files:
-        - source: "sdxl_models/ip-adapter_sdxl_vit-h.safetensors"
-          dest: "ip-adapter_sdxl_vit-h.safetensors"
-
-    - repo_id: h94/IP-Adapter
-      description: IP-Adapter SDXL Plus - High Strength Composition
-      size_gb: 0.9
-      essential: false
-      category: ipadapter
-      type: ipadapter
-      format: safetensors
-      vram_gb: 4
-      notes: Enhanced composition control with higher strength
-      files:
-        - source: "sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors"
-          dest: "ip-adapter-plus_sdxl_vit-h.safetensors"
-
-    - repo_id: h94/IP-Adapter
-      description: IP-Adapter SDXL Plus Face - Face-focused generation
-      size_gb: 0.5
-      essential: false
-      category: ipadapter
-      type: ipadapter
-      format: safetensors
-      vram_gb: 4
-      notes: Specialized for face transfer and portrait generation
-      files:
-        - source: "sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors"
-          dest: "ip-adapter-plus-face_sdxl_vit-h.safetensors"
-
-  # ==========================================================================
-  # DIFFRHYTHM MODELS (Full-length song generation)
-  # ==========================================================================
+  - repo_id: h94/IP-Adapter
+    description: IP-Adapter SDXL Base - Style & Composition
+    size_gb: 1.3
+    essential: true
+    category: ipadapter
+    format: safetensors
+    vram_gb: 4
+    notes: Basic IP-Adapter for SDXL
+    files:
+    - source: sdxl_models/ip-adapter_sdxl.safetensors
+      dest: ipadapter/ip-adapter_sdxl.safetensors
+  - repo_id: h94/IP-Adapter
+    description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
+    size_gb: 0.9
+    essential: true
+    category: ipadapter
+    format: safetensors
+    vram_gb: 4
+    notes: IP-Adapter for SDXL with VIT-H CLIP vision model
+    files:
+    - source: sdxl_models/ip-adapter_sdxl_vit-h.safetensors
+      dest: ipadapter/ip-adapter_sdxl_vit-h.safetensors
+  - repo_id: h94/IP-Adapter
+    description: IP-Adapter SDXL Plus - High Strength Composition
+    size_gb: 0.9
+    essential: false
+    category: ipadapter
+    format: safetensors
+    vram_gb: 4
+    notes: Enhanced composition control with higher strength
+    files:
+    - source: sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors
+      dest: ipadapter/ip-adapter-plus_sdxl_vit-h.safetensors
+  - repo_id: h94/IP-Adapter
+    description: IP-Adapter SDXL Plus Face - Face-focused generation
+    size_gb: 0.5
+    essential: false
+    category: ipadapter
+    format: safetensors
+    vram_gb: 4
+    notes: Specialized for face transfer and portrait generation
+    files:
+    - source: sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors
+      dest: ipadapter/ip-adapter-plus-face_sdxl_vit-h.safetensors
   diffrhythm_models:
-    - repo_id: ASLP-lab/DiffRhythm-1_2
-      description: DiffRhythm 1.2 - 95 second generation model
-      size_gb: 2
-      essential: true
-      category: diffrhythm
-      type: TTS/DiffRhythm
-      format: pt
-      vram_gb: 12
-      duration_seconds: 95
-      notes: Latest 95-second generation model
-      files:
-        - source: "cfm_model.pt"
-          dest: "cfm_model_v1_2.pt"
-
-    - repo_id: ASLP-lab/DiffRhythm-full
-      description: DiffRhythm Full - 4m45s full-length generation
-      size_gb: 2
-      essential: false
-      category: diffrhythm
-      type: TTS/DiffRhythm
-      format: pt
-      vram_gb: 16
-      duration_seconds: 285
-      notes: Full-length 4 minute 45 second music generation
-      files:
-        - source: "cfm_model.pt"
-          dest: "cfm_full_model.pt"
-
-    - repo_id: ASLP-lab/DiffRhythm-base
-      description: DiffRhythm Base - 95 second base model
-      size_gb: 2
-      essential: false
-      category: diffrhythm
-      type: TTS/DiffRhythm
-      format: pt
-      vram_gb: 12
-      duration_seconds: 95
-      notes: Base 95-second model
-      files:
-        - source: "cfm_model.pt"
-          dest: "cfm_model.pt"
-
-    - repo_id: ASLP-lab/DiffRhythm-vae
-      description: DiffRhythm VAE - Variational autoencoder
-      size_gb: 1
-      essential: true
-      category: diffrhythm
-      type: TTS/DiffRhythm
-      format: pt
-      vram_gb: 2
-      notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License)
-      files:
-        - source: "vae_model.pt"
-          dest: "vae_model.pt"
-
-    - repo_id: OpenMuQ/MuQ-MuLan-large
-      description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
-      size_gb: 3
-      essential: true
-      category: diffrhythm
-      type: TTS/DiffRhythm/MuQ-MuLan-large
-      format: bin
-      vram_gb: 4
-      notes: Music-text joint embedding for semantic understanding (English/Chinese)
-      files:
-        - source: "config.json"
-          dest: "config.json"
-        - source: "pytorch_model.bin"
-          dest: "pytorch_model.bin"
-
-    - repo_id: OpenMuQ/MuQ-large-msd-iter
-      description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
-      size_gb: 1.2
-      essential: true
-      category: diffrhythm
-      type: TTS/DiffRhythm/MuQ-large-msd-iter
-      format: safetensors
-      vram_gb: 2
-      notes: Music representation model trained on Million Song Dataset
-      files:
-        - source: "config.json"
-          dest: "config.json"
-        - source: "model.safetensors"
-          dest: "model.safetensors"
-
-    - repo_id: FacebookAI/xlm-roberta-base
-      description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params)
-      size_gb: 1.1
-      essential: true
-      category: diffrhythm
-      type: TTS/DiffRhythm/xlm-roberta-base
-      format: safetensors
-      vram_gb: 1
-      notes: Multilingual text encoding for 100 languages
-      files:
-        - source: "config.json"
-          dest: "config.json"
-        - source: "model.safetensors"
-          dest: "model.safetensors"
-        - source: "sentencepiece.bpe.model"
-          dest: "sentencepiece.bpe.model"
-        - source: "tokenizer.json"
-          dest: "tokenizer.json"
-        - source: "tokenizer_config.json"
-          dest: "tokenizer_config.json"
-
-# ============================================================================
-# STORAGE & VRAM SUMMARIES
-# ============================================================================
-
+  - repo_id: ASLP-lab/DiffRhythm-1_2
+    description: DiffRhythm 1.2 - 95 second generation model
+    size_gb: 2
+    essential: true
+    category: diffrhythm
+    format: pt
+    vram_gb: 12
+    duration_seconds: 95
+    notes: Latest 95-second generation model
+    files:
+    - source: cfm_model.pt
+      dest: TTS/DiffRhythm/cfm_model_v1_2.pt
+  - repo_id: ASLP-lab/DiffRhythm-full
+    description: DiffRhythm Full - 4m45s full-length generation
+    size_gb: 2
+    essential: false
+    category: diffrhythm
+    format: pt
+    vram_gb: 16
+    duration_seconds: 285
+    notes: Full-length 4 minute 45 second music generation
+    files:
+    - source: cfm_model.pt
+      dest: TTS/DiffRhythm/cfm_full_model.pt
+  - repo_id: ASLP-lab/DiffRhythm-base
+    description: DiffRhythm Base - 95 second base model
+    size_gb: 2
+    essential: false
+    category: diffrhythm
+    format: pt
+    vram_gb: 12
+    duration_seconds: 95
+    notes: Base 95-second model
+    files:
+    - source: cfm_model.pt
+      dest: TTS/DiffRhythm/cfm_model.pt
+  - repo_id: ASLP-lab/DiffRhythm-vae
+    description: DiffRhythm VAE - Variational autoencoder
+    size_gb: 1
+    essential: true
+    category: diffrhythm
+    format: pt
+    vram_gb: 2
+    notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community
+      License)
+    files:
+    - source: vae_model.pt
+      dest: TTS/DiffRhythm/vae_model.pt
+  - repo_id: OpenMuQ/MuQ-MuLan-large
+    description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
+    size_gb: 3
+    essential: true
+    category: diffrhythm
+    format: bin
+    vram_gb: 4
+    notes: Music-text joint embedding for semantic understanding (English/Chinese)
+    files:
+    - source: config.json
+      dest: TTS/DiffRhythm/MuQ-MuLan-large/config.json
+    - source: pytorch_model.bin
+      dest: TTS/DiffRhythm/MuQ-MuLan-large/pytorch_model.bin
+  - repo_id: OpenMuQ/MuQ-large-msd-iter
+    description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
+    size_gb: 1.2
+    essential: true
+    category: diffrhythm
+    format: safetensors
+    vram_gb: 2
+    notes: Music representation model trained on Million Song Dataset
+    files:
+    - source: config.json
+      dest: TTS/DiffRhythm/MuQ-large-msd-iter/config.json
+    - source: model.safetensors
+      dest: TTS/DiffRhythm/MuQ-large-msd-iter/model.safetensors
+  - repo_id: FacebookAI/xlm-roberta-base
+    description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B
+      params)
+    size_gb: 1.1
+    essential: true
+    category: diffrhythm
+    format: safetensors
+    vram_gb: 1
+    notes: Multilingual text encoding for 100 languages
+    files:
+    - source: config.json
+      dest: TTS/DiffRhythm/xlm-roberta-base/config.json
+    - source: model.safetensors
+      dest: TTS/DiffRhythm/xlm-roberta-base/model.safetensors
+    - source: sentencepiece.bpe.model
+      dest: TTS/DiffRhythm/xlm-roberta-base/sentencepiece.bpe.model
+    - source: tokenizer.json
+      dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer.json
+    - source: tokenizer_config.json
+      dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer_config.json
 storage_requirements:
   essential_only:
-    image: 30      # FLUX Schnell + SDXL Base
-    video: 28      # CogVideoX + SVD
-    audio: 11      # MusicGen Medium
-    support: 11    # All 3 CLIP models
-    diffrhythm: 10 # DiffRhythm essential models
-    total: 90      # Total essential storage
-
+    image: 30
+    video: 28
+    audio: 11
+    support: 11
+    diffrhythm: 10
+    total: 90
   all_models:
-    image: 54      # All image models
-    video: 36      # All video models
-    audio: 36      # All audio models
-    support: 11    # All support models
-    diffrhythm: 12 # All DiffRhythm models
-    total: 149     # Total with optional models
-
+    image: 54
+    video: 36
+    audio: 36
+    support: 11
+    diffrhythm: 12
+    total: 149
 vram_requirements:
-  # For 24GB GPU (RTX 4090)
   simultaneous_loadable:
-    - name: Image Focus - FLUX FP16
-      models: [FLUX.1 Schnell]
-      vram_used: 23
-      remaining: 1
-
-    - name: Image Focus - FLUX FP8 + SDXL
-      models: [FLUX.1 Schnell FP8, SDXL Base]
-      vram_used: 24
-      remaining: 0
-
-    - name: Video Generation
-      models: [CogVideoX-5B optimized, SDXL]
-      vram_used: 24
-      remaining: 0
-
-    - name: Multi-Modal
-      models: [SDXL, MusicGen Medium]
-      vram_used: 20
-      remaining: 4
-
-# ============================================================================
-# INSTALLATION PROFILES
-# ============================================================================
-
+  - name: Image Focus - FLUX FP16
+    models:
+    - FLUX.1 Schnell
+    vram_used: 23
+    remaining: 1
+  - name: Image Focus - FLUX FP8 + SDXL
+    models:
+    - FLUX.1 Schnell FP8
+    - SDXL Base
+    vram_used: 24
+    remaining: 0
+  - name: Video Generation
+    models:
+    - CogVideoX-5B optimized
+    - SDXL
+    vram_used: 24
+    remaining: 0
+  - name: Multi-Modal
+    models:
+    - SDXL
+    - MusicGen Medium
+    vram_used: 20
+    remaining: 4
 installation_profiles:
   minimal:
     description: Minimal setup for testing
-    categories: [support_models]
+    categories:
+    - support_models
     storage_gb: 11
     estimated_time: 5-10 minutes
-
   essential:
     description: Essential models only (~80GB)
-    categories: [image_models, video_models, audio_models, support_models]
+    categories:
+    - image_models
+    - video_models
+    - audio_models
+    - support_models
     essential_only: true
     storage_gb: 80
     estimated_time: 1-2 hours
-
   image_focused:
     description: All image generation models
-    categories: [image_models, support_models]
+    categories:
+    - image_models
+    - support_models
     storage_gb: 65
     estimated_time: 45-90 minutes
-
   video_focused:
     description: All video generation models
-    categories: [video_models, image_models, support_models]
+    categories:
+    - video_models
+    - image_models
+    - support_models
     essential_only: true
     storage_gb: 69
     estimated_time: 1-2 hours
-
   complete:
     description: All models (including optional)
-    categories: [image_models, video_models, audio_models, support_models]
+    categories:
+    - image_models
+    - video_models
+    - audio_models
+    - support_models
     storage_gb: 137
     estimated_time: 2-4 hours
-
-# ============================================================================
-# METADATA
-# ============================================================================
-
 metadata:
   version: 1.0.0
   last_updated: 2025-11-21
   compatible_with:
-    - ComfyUI >= 0.1.0
-    - Python >= 3.10
-    - HuggingFace Hub >= 0.20.0
+  - ComfyUI >= 0.1.0
+  - Python >= 3.10
+  - HuggingFace Hub >= 0.20.0
   maintainer: Valknar
   repository: https://github.com/yourusername/runpod