All checks were successful
Build and Push RunPod Docker Image / build-and-push (push) Successful in 14s
- CogVideoX-5b: Link 2 shards + index.json instead of single non-existent file - CogVideoX-5b-I2V: Link 3 shards + index.json instead of single non-existent file - Fixes link command failures for these video generation models - Shards are now properly symlinked to ComfyUI diffusion_models directory 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
1224 lines
41 KiB
YAML
1224 lines
41 KiB
YAML
# ============================================================================
|
|
# ComfyUI Model Configuration
|
|
# ============================================================================
|
|
#
|
|
# This configuration file defines all available ComfyUI models for download.
|
|
# Models are organized by category: image, video, audio, and support models.
|
|
#
|
|
# Each model entry contains:
|
|
# - repo_id: HuggingFace repository identifier
|
|
# - description: Human-readable description
|
|
# - size_gb: Approximate size in gigabytes
|
|
# - essential: Whether this is an essential model (true/false)
|
|
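# - category: Model category (image/video/audio/support/animatediff/controlnet/ipadapter)
# - type: Model kind, e.g. checkpoints, diffusion_models, vae, loras
#   (presumably the target models subdirectory - confirm against the linker)
# - format: Precision or container format (fp16, bf16, fp8_scaled, fp32, safetensors, pth)
# - vram_gb: Presumably the approximate VRAM needed, in gigabytes
# - notes: Free-form usage notes
# - files: List of source (path inside the repo) / dest (local filename) pairs
# - frames, resolution, duration_seconds: Optional output characteristics (video/audio)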
|
|
#
|
|
# ============================================================================
|
|
|
|
# Global settings
|
|
# Downloader-wide options shared by every model entry below.
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600  # 3600 s = one hour
|
|
|
|
# Model categories
|
|
model_categories:
|
|
# ==========================================================================
|
|
# IMAGE GENERATION MODELS
|
|
# ==========================================================================
|
|
image_models:
|
|
# FLUX.1 Schnell; dest keeps the upstream filename.
- repo_id: black-forest-labs/FLUX.1-schnell
  description: FLUX.1 Schnell - Fast 4-step inference
  size_gb: 23
  essential: true
  category: image
  type: unet
  format: fp16
  vram_gb: 23
  notes: Industry-leading image generation quality
  files:
    - source: 'flux1-schnell.safetensors'
      dest: 'flux1-schnell.safetensors'
|
|
|
|
# FLUX.1 Dev; optional (essential: false), dest keeps the upstream filename.
- repo_id: black-forest-labs/FLUX.1-dev
  description: FLUX.1 Dev - Balanced quality/speed
  size_gb: 23
  essential: false
  category: image
  type: unet
  format: fp16
  vram_gb: 23
  notes: Development version with enhanced features
  files:
    - source: 'flux1-dev.safetensors'
      dest: 'flux1-dev.safetensors'
|
|
|
|
# Stable Diffusion 1.5 checkpoint used as the AnimateDiff base model.
# The original runwayml/stable-diffusion-v1-5 repository is no longer hosted
# under the runwayml namespace; the community mirror below serves the same
# v1-5-pruned-emaonly.safetensors file.
- repo_id: stable-diffusion-v1-5/stable-diffusion-v1-5
  description: SD 1.5 - For AnimateDiff
  size_gb: 4
  essential: true
  category: image
  type: checkpoints
  format: fp16
  vram_gb: 8
  notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
  files:
    - source: 'v1-5-pruned-emaonly.safetensors'
      dest: 'v1-5-pruned-emaonly.safetensors'
|
|
|
|
# SDXL base checkpoint; dest keeps the upstream filename.
- repo_id: stabilityai/stable-diffusion-xl-base-1.0
  description: SDXL Base 1.0 - Industry standard
  size_gb: 7
  essential: true
  category: image
  type: checkpoints
  format: fp16
  vram_gb: 12
  notes: Most widely used Stable Diffusion model
  files:
    - source: 'sd_xl_base_1.0.safetensors'
      dest: 'sd_xl_base_1.0.safetensors'
|
|
|
|
# SDXL refiner checkpoint; optional companion to the SDXL base entry.
- repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
  description: SDXL Refiner 1.0 - Enhances base output
  size_gb: 6
  essential: false
  category: image
  type: checkpoints
  format: fp16
  vram_gb: 12
  notes: Use after SDXL base for improved details
  files:
    - source: 'sd_xl_refiner_1.0.safetensors'
      dest: 'sd_xl_refiner_1.0.safetensors'
|
|
|
|
# SD 3.5 Large checkpoint; the same repo also supplies text encoders listed
# under support_models.
- repo_id: stabilityai/stable-diffusion-3.5-large
  description: SD 3.5 Large - Latest Stability AI
  size_gb: 18
  essential: false
  category: image
  type: checkpoints
  format: fp16
  vram_gb: 20
  notes: Newest generation Stable Diffusion
  files:
    - source: 'sd3.5_large.safetensors'
      dest: 'sd3.5_large.safetensors'
|
|
|
|
# ==========================================================================
|
|
# VIDEO GENERATION MODELS
|
|
# ==========================================================================
|
|
video_models:
|
|
# CogVideoX-5B transformer: two weight shards plus the shard index.
# Each dest gets a cogvideox-5b- prefix in place of the generic upstream name.
- repo_id: THUDM/CogVideoX-5b
  description: CogVideoX-5B - Professional text-to-video
  size_gb: 20
  essential: true
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 20
  frames: 49
  resolution: 720p
  notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
  files:
    - source: 'transformer/diffusion_pytorch_model-00001-of-00002.safetensors'
      dest: 'cogvideox-5b-transformer-00001-of-00002.safetensors'
    - source: 'transformer/diffusion_pytorch_model-00002-of-00002.safetensors'
      dest: 'cogvideox-5b-transformer-00002-of-00002.safetensors'
    - source: 'transformer/diffusion_pytorch_model.safetensors.index.json'
      dest: 'cogvideox-5b-transformer.safetensors.index.json'
|
|
|
|
# CogVideoX-5B image-to-video transformer: three weight shards plus the index.
# Each dest gets a cogvideox-5b-i2v- prefix in place of the generic upstream name.
- repo_id: THUDM/CogVideoX-5b-I2V
  description: CogVideoX-5B-I2V - Image-to-video generation
  size_gb: 20
  essential: true
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 20
  frames: 49
  resolution: 720p
  notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
  files:
    - source: 'transformer/diffusion_pytorch_model-00001-of-00003.safetensors'
      dest: 'cogvideox-5b-i2v-transformer-00001-of-00003.safetensors'
    - source: 'transformer/diffusion_pytorch_model-00002-of-00003.safetensors'
      dest: 'cogvideox-5b-i2v-transformer-00002-of-00003.safetensors'
    - source: 'transformer/diffusion_pytorch_model-00003-of-00003.safetensors'
      dest: 'cogvideox-5b-i2v-transformer-00003-of-00003.safetensors'
    - source: 'transformer/diffusion_pytorch_model.safetensors.index.json'
      dest: 'cogvideox-5b-i2v-transformer.safetensors.index.json'
|
|
|
|
# Stable Video Diffusion, 14-frame variant; dest keeps the upstream filename.
- repo_id: stabilityai/stable-video-diffusion-img2vid
  description: SVD - 14 frame image-to-video
  size_gb: 8
  essential: true
  category: video
  type: checkpoints
  format: fp16
  vram_gb: 20
  frames: 14
  resolution: 576x1024
  notes: Convert images to short video clips
  files:
    - source: 'svd.safetensors'
      dest: 'svd.safetensors'
|
|
|
|
# Stable Video Diffusion XT, 25-frame variant; optional (essential: false).
- repo_id: stabilityai/stable-video-diffusion-img2vid-xt
  description: SVD-XT - 25 frame image-to-video
  size_gb: 8
  essential: false
  category: video
  type: checkpoints
  format: fp16
  vram_gb: 20
  frames: 25
  resolution: 576x1024
  notes: Extended frame count version
  files:
    - source: 'svd_xt.safetensors'
      dest: 'svd_xt.safetensors'
|
|
|
|
# HunyuanVideo - Original (720p, T2V/I2V)
|
|
# HunyuanVideo text-to-video; dest drops the split_files/diffusion_models/ prefix.
- repo_id: Comfy-Org/HunyuanVideo_repackaged
  description: HunyuanVideo T2V - 720p text-to-video with MLLM encoders
  size_gb: 20
  essential: true
  category: video
  type: diffusion_models
  format: bf16
  vram_gb: 24
  frames: 129
  resolution: 720p
  notes: 5-second T2V generation with Chinese/English support, DiT architecture with 3D VAE
  files:
    - source: 'split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors'
      dest: 'hunyuan_video_t2v_720p_bf16.safetensors'
|
|
|
|
# HunyuanVideo image-to-video v1 (concat); dest drops the split_files/ prefix.
- repo_id: Comfy-Org/HunyuanVideo_repackaged
  description: HunyuanVideo I2V v1 - 720p image-to-video (concat method)
  size_gb: 20
  essential: true
  category: video
  type: diffusion_models
  format: bf16
  vram_gb: 24
  frames: 129
  resolution: 720p
  notes: Static image to video with concat conditioning, better motion fluidity
  files:
    - source: 'split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors'
      dest: 'hunyuan_video_image_to_video_720p_bf16.safetensors'
|
|
|
|
# HunyuanVideo image-to-video v2 (replace); dest drops the split_files/ prefix.
- repo_id: Comfy-Org/HunyuanVideo_repackaged
  description: HunyuanVideo I2V v2 - 720p image-to-video (replace method)
  size_gb: 20
  essential: true
  category: video
  type: diffusion_models
  format: bf16
  vram_gb: 24
  frames: 129
  resolution: 720p
  notes: Updated I2V with replace conditioning, better image guidance adherence
  files:
    - source: 'split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors'
      dest: 'hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors'
|
|
|
|
# HunyuanVideo 1.5 - Latest generation (720p/1080p, T2V/I2V)
|
|
# HunyuanVideo 1.5 text-to-video.
# source now carries the split_files/diffusion_models/ prefix, matching the
# path style of the Comfy-Org/HunyuanVideo_repackaged entries; dest is unchanged.
# frames is a range, so it is quoted to make the string type explicit
# (other entries use plain integers).
- repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
  description: HunyuanVideo 1.5 T2V - 720p text-to-video (8.3B parameters)
  size_gb: 18
  essential: true
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 24
  frames: '129-257'
  resolution: 720p
  notes: 5-10 second T2V with Qwen 2.5 VL encoder, requires 24GB VRAM
  files:
    - source: 'split_files/diffusion_models/hunyuanvideo1.5_720p_t2v_fp16.safetensors'
      dest: 'hunyuanvideo1.5_720p_t2v_fp16.safetensors'
|
|
|
|
# HunyuanVideo 1.5 super-resolution model; optional (essential: false).
# source now carries the split_files/diffusion_models/ prefix, matching the
# path style of the Comfy-Org/HunyuanVideo_repackaged entries; dest is unchanged.
# frames is a range, so it is quoted to make the string type explicit.
- repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
  description: HunyuanVideo 1.5 SR - 1080p super-resolution (distilled)
  size_gb: 18
  essential: false
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 24
  frames: '129-257'
  resolution: 1080p
  notes: Upscales 720p to 1080p with distilled model for faster generation
  files:
    - source: 'split_files/diffusion_models/hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors'
      dest: 'hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors'
|
|
|
|
# Wan2.2 5B - Hybrid text+image to video (low VRAM)
|
|
# Wan2.2 TI2V 5B.
# source now carries the repo's split_files/diffusion_models/ prefix (weights
# in this repackaged repo are not stored at the repo root); dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 TI2V 5B - Hybrid text+image to video (8GB VRAM)
  size_gb: 10
  essential: true
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 8
  frames: 81
  resolution: 640x640
  notes: Efficient 5B model with native offloading, dual-expert architecture
  files:
    - source: 'split_files/diffusion_models/wan2.2_ti2v_5B_fp16.safetensors'
      dest: 'wan2.2_ti2v_5B_fp16.safetensors'
|
|
|
|
# Wan2.2 14B T2V - Dual-expert text-to-video
|
|
# Wan2.2 T2V high-noise expert (paired with the low-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 T2V High Noise 14B - Text-to-video high noise expert (FP8)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Dual-expert T2V high noise denoising, FP8 quantized for 24GB GPU
  files:
    - source: 'split_files/diffusion_models/wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 T2V low-noise expert (paired with the high-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 T2V Low Noise 14B - Text-to-video low noise expert (FP8)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Dual-expert T2V low noise refinement, FP8 quantized for 24GB GPU
  files:
    - source: 'split_files/diffusion_models/wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 14B I2V - Image-to-video with content consistency
|
|
# Wan2.2 I2V high-noise expert (paired with the low-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 I2V High Noise 14B - Image-to-video high noise expert (FP16)
  size_gb: 28
  essential: true
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Dual-expert I2V high noise denoising with content consistency
  files:
    - source: 'split_files/diffusion_models/wan2.2_i2v_high_noise_14B_fp16.safetensors'
      dest: 'wan2.2_i2v_high_noise_14B_fp16.safetensors'
|
|
|
|
# Wan2.2 I2V low-noise expert (paired with the high-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 I2V Low Noise 14B - Image-to-video low noise expert (FP16)
  size_gb: 28
  essential: true
  category: video
  type: diffusion_models
  format: fp16
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Dual-expert I2V low noise refinement with content consistency
  files:
    - source: 'split_files/diffusion_models/wan2.2_i2v_low_noise_14B_fp16.safetensors'
      dest: 'wan2.2_i2v_low_noise_14B_fp16.safetensors'
|
|
|
|
# Wan2.2 14B Animate - Video-to-video character animation
|
|
# Wan2.2 Animate; resolution is a free-text constraint rather than WxH.
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Animate 14B - Video-to-video character animation (BF16)
  size_gb: 28
  essential: true
  category: video
  type: diffusion_models
  format: bf16
  vram_gb: 24
  frames: 81
  resolution: multiples of 16
  notes: V2V animation with Mix/Move modes, requires CLIP Vision H for reference image
  files:
    - source: 'split_files/diffusion_models/wan2.2_animate_14B_bf16.safetensors'
      dest: 'wan2.2_animate_14B_bf16.safetensors'
|
|
|
|
# Wan2.2 14B S2V - Sound-to-video synchronization
|
|
# Wan2.2 S2V, FP8 variant (the essential one; a BF16 variant is listed separately).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 S2V 14B - Sound-to-video with audio sync (FP8)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Transforms static images + audio into synchronized videos, uses Wav2Vec2 audio encoder
  files:
    - source: 'split_files/diffusion_models/wan2.2_s2v_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_s2v_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 S2V, BF16 variant; optional (essential: false).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 S2V 14B - Sound-to-video with audio sync (BF16 quality)
  size_gb: 28
  essential: false
  category: video
  type: diffusion_models
  format: bf16
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Higher quality BF16 version of S2V for better output quality
  files:
    - source: 'split_files/diffusion_models/wan2.2_s2v_14B_bf16.safetensors'
      dest: 'wan2.2_s2v_14B_bf16.safetensors'
|
|
|
|
# Wan2.2 14B Fun Inpaint - Start-end frame controlled generation
|
|
# Wan2.2 Fun Inpaint high-noise expert (paired with the low-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Fun Inpaint High Noise 14B - Start-end frame transition (FP8)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Generates transition between start and end frames with high noise denoising
  files:
    - source: 'split_files/diffusion_models/wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 Fun Inpaint low-noise expert (paired with the high-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Fun Inpaint Low Noise 14B - Start-end frame transition (FP8)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: Generates transition between start and end frames with low noise refinement
  files:
    - source: 'split_files/diffusion_models/wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 14B Fun Control - ControlNet-style conditioning
|
|
# Wan2.2 Fun Control high-noise expert (paired with the low-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Fun Control High Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: I2V with control conditions (Canny, Depth, OpenPose, MLSD, trajectory), requires controlnet_aux
  files:
    - source: 'split_files/diffusion_models/wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 Fun Control low-noise expert (paired with the high-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Fun Control Low Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: I2V with control conditions low noise refinement
  files:
    - source: 'split_files/diffusion_models/wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 14B Fun Camera - Camera motion control
|
|
# Wan2.2 Fun Camera high-noise expert (paired with the low-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Fun Camera High Noise 14B - Camera motion control (pan/zoom/static)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: I2V with camera motion control (pan, zoom, static), 108s with LoRA / 536s without
  files:
    - source: 'split_files/diffusion_models/wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# Wan2.2 Fun Camera low-noise expert (paired with the high-noise entry).
# source now carries the repo's split_files/diffusion_models/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 Fun Camera Low Noise 14B - Camera motion control (pan/zoom/static)
  size_gb: 14
  essential: true
  category: video
  type: diffusion_models
  format: fp8_scaled
  vram_gb: 24
  frames: 81
  resolution: 640x640
  notes: I2V with camera motion control low noise refinement
  files:
    - source: 'split_files/diffusion_models/wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors'
      dest: 'wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors'
|
|
|
|
# ==========================================================================
|
|
# AUDIO GENERATION MODELS
|
|
# ==========================================================================
|
|
audio_models:
|
|
# MusicGen Small; dest prefixes the generic upstream filename with the model
# name so the three MusicGen sizes do not share a filename.
- repo_id: facebook/musicgen-small
  description: MusicGen Small - Fast generation
  size_gb: 3
  essential: false
  category: audio
  type: musicgen
  format: fp32
  vram_gb: 4
  duration_seconds: 30
  notes: Fastest music generation, lower quality
  files:
    - source: 'pytorch_model.bin'
      dest: 'musicgen-small-pytorch_model.bin'
|
|
|
|
# MusicGen Medium (the essential MusicGen size); dest is prefixed with the
# model name.
- repo_id: facebook/musicgen-medium
  description: MusicGen Medium - Balanced quality
  size_gb: 11
  essential: true
  category: audio
  type: musicgen
  format: fp32
  vram_gb: 8
  duration_seconds: 30
  notes: Best balance of speed and quality
  files:
    - source: 'pytorch_model.bin'
      dest: 'musicgen-medium-pytorch_model.bin'
|
|
|
|
# MusicGen Large: two weight shards plus the shard index, each prefixed with
# the model name at dest.
- repo_id: facebook/musicgen-large
  description: MusicGen Large - Highest quality
  size_gb: 22
  essential: false
  category: audio
  type: musicgen
  format: fp32
  vram_gb: 16
  duration_seconds: 30
  notes: Best quality, slower generation
  files:
    - source: 'pytorch_model-00001-of-00002.bin'
      dest: 'musicgen-large-pytorch_model-00001-of-00002.bin'
    - source: 'pytorch_model-00002-of-00002.bin'
      dest: 'musicgen-large-pytorch_model-00002-of-00002.bin'
    - source: 'pytorch_model.bin.index.json'
      dest: 'musicgen-large-pytorch_model.bin.index.json'
|
|
|
|
# ACE Step v1 3.5B - State-of-the-art music generation
|
|
# ACE Step all-in-one checkpoint; dest drops the all_in_one/ prefix.
- repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
  description: ACE Step v1 3.5B - Fast coherent music generation with 19-language support
  size_gb: 7.7
  essential: true
  category: audio
  type: checkpoints
  format: safetensors
  vram_gb: 16
  duration_seconds: 240
  notes: 15x faster than LLM baselines, superior structural coherence, voice cloning, 19-language lyrics
  files:
    - source: 'all_in_one/ace_step_v1_3.5b.safetensors'
      dest: 'ace_step_v1_3.5b.safetensors'
|
|
|
|
# ACE Step Chinese RAP LoRA (optional)
|
|
# Optional ACE Step LoRA; dest renames the generic upstream weights file.
# This entry has no vram_gb key, unlike its siblings.
- repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
  description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop genre
  size_gb: 0.3
  essential: false
  category: audio
  type: loras
  format: safetensors
  notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
  files:
    - source: 'pytorch_lora_weights.safetensors'
      dest: 'ace-step-chinese-rap-lora.safetensors'
|
|
|
|
# ==========================================================================
|
|
# SUPPORT MODELS (CLIP, IP-Adapter, etc.)
|
|
# ==========================================================================
|
|
support_models:
|
|
# CLIP ViT-H image encoder for the SD 1.5 IP-Adapter.
# The entry previously pulled openai/clip-vit-large-patch14 (a ViT-L/14 model)
# and renamed its weights to the ViT-H filename below, which yields an encoder
# that does not match what the filename promises. The ViT-H image encoder is
# shipped in h94/IP-Adapter under models/image_encoder/.
- repo_id: h94/IP-Adapter
  description: CLIP H - For SD 1.5 IP-Adapter
  size_gb: 2.5
  essential: true
  category: support
  type: clip_vision
  format: fp32
  vram_gb: 2
  notes: Text-image understanding model for IP-Adapter
  files:
    - source: 'models/image_encoder/model.safetensors'
      dest: 'CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors'
|
|
|
|
# CLIP ViT-bigG for the SDXL IP-Adapter; dest is named after the repo.
- repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
  description: CLIP G - For SDXL IP-Adapter
  size_gb: 7
  essential: true
  category: support
  type: clip_vision
  format: fp32
  vram_gb: 4
  notes: Larger CLIP model for SDXL IP-Adapter
  files:
    - source: 'open_clip_model.safetensors'
      dest: 'CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors'
|
|
|
|
# SigLIP vision model; dest renames the generic model.safetensors after the repo.
- repo_id: google/siglip-so400m-patch14-384
  description: SigLIP - For FLUX models
  size_gb: 2
  essential: true
  category: support
  type: clip_vision
  format: fp32
  vram_gb: 2
  notes: Advanced image-text alignment
  files:
    - source: 'model.safetensors'
      dest: 'siglip-so400m-patch14-384.safetensors'
|
|
|
|
# CLIP-L and T5-XXL text encoders for FLUX, taken from the SD 3.5 repo.
# The same two files are also listed below with type text_encoders; this
# entry uses type clip.
- repo_id: stabilityai/stable-diffusion-3.5-large
  description: CLIP-L and T5-XXL - For FLUX text encoding
  size_gb: 10
  essential: true
  category: support
  type: clip
  format: fp16
  vram_gb: 4
  notes: CLIP text encoders required for FLUX models
  files:
    - source: 'text_encoders/clip_l.safetensors'
      dest: 'clip_l.safetensors'
    - source: 'text_encoders/t5xxl_fp16.safetensors'
      dest: 't5xxl_fp16.safetensors'
|
|
|
|
# FLUX autoencoder, taken from the FLUX.1-schnell repo; dest keeps the
# upstream filename.
- repo_id: black-forest-labs/FLUX.1-schnell
  description: FLUX VAE - Autoencoder for FLUX models
  size_gb: 0.5
  essential: true
  category: support
  type: vae
  format: safetensors
  vram_gb: 1
  notes: VAE autoencoder required for FLUX image decoding
  files:
    - source: 'ae.safetensors'
      dest: 'ae.safetensors'
|
|
|
|
# Real-ESRGAN 2x upscaler (.pth weights); dest keeps the upstream filename.
- repo_id: ai-forever/Real-ESRGAN
  description: RealESRGAN x2 - 2x upscaling model
  size_gb: 0.06
  essential: true
  category: support
  type: upscale_models
  format: pth
  vram_gb: 2
  notes: Fast 2x upscaling model for general purpose enhancement
  files:
    - source: 'RealESRGAN_x2.pth'
      dest: 'RealESRGAN_x2.pth'
|
|
|
|
# Real-ESRGAN 4x upscaler (.pth weights); dest keeps the upstream filename.
- repo_id: ai-forever/Real-ESRGAN
  description: RealESRGAN x4 - 4x upscaling model
  size_gb: 0.06
  essential: true
  category: support
  type: upscale_models
  format: pth
  vram_gb: 4
  notes: High-quality 4x upscaling model for detail enhancement
  files:
    - source: 'RealESRGAN_x4.pth'
      dest: 'RealESRGAN_x4.pth'
|
|
|
|
# T5-XXL text encoder for CogVideoX; dest drops the text_encoders/ prefix.
- repo_id: stabilityai/stable-diffusion-3.5-large
  description: T5-XXL FP16 - For CogVideoX text encoding
  size_gb: 9
  essential: true
  category: support
  type: text_encoders
  format: fp16
  vram_gb: 4
  notes: T5 text encoder required for CogVideoX models
  files:
    - source: 'text_encoders/t5xxl_fp16.safetensors'
      dest: 't5xxl_fp16.safetensors'
|
|
|
|
# CLIP-L text encoder; dest drops the text_encoders/ prefix.
- repo_id: stabilityai/stable-diffusion-3.5-large
  description: CLIP-L - For CogVideoX and SD3
  size_gb: 1
  essential: true
  category: support
  type: text_encoders
  format: fp32
  vram_gb: 1
  notes: CLIP-L text encoder for CogVideoX and SD3 models
  files:
    - source: 'text_encoders/clip_l.safetensors'
      dest: 'clip_l.safetensors'
|
|
|
|
# CLIP-G text encoder; optional (essential: false), dest drops the
# text_encoders/ prefix.
- repo_id: stabilityai/stable-diffusion-3.5-large
  description: CLIP-G - For SD3 models
  size_gb: 3
  essential: false
  category: support
  type: text_encoders
  format: fp32
  vram_gb: 2
  notes: CLIP-G text encoder for SD3 models
  files:
    - source: 'text_encoders/clip_g.safetensors'
      dest: 'clip_g.safetensors'
|
|
|
|
# HunyuanVideo Support Models
|
|
# HunyuanVideo VAE; dest drops the split_files/vae/ prefix.
- repo_id: Comfy-Org/HunyuanVideo_repackaged
  description: HunyuanVideo VAE - 3D VAE for video encoding/decoding (BF16)
  size_gb: 1
  essential: true
  category: support
  type: vae
  format: bf16
  vram_gb: 2
  notes: 3D VAE autoencoder for HunyuanVideo models
  files:
    - source: 'split_files/vae/hunyuan_video_vae_bf16.safetensors'
      dest: 'hunyuan_video_vae_bf16.safetensors'
|
|
|
|
# LLaVA LLaMA3 text encoder for HunyuanVideo; dest drops the
# split_files/text_encoders/ prefix.
- repo_id: Comfy-Org/HunyuanVideo_repackaged
  description: LLaVA LLaMA3 FP8 - Multimodal text encoder for HunyuanVideo
  size_gb: 8
  essential: true
  category: support
  type: text_encoders
  format: fp8_scaled
  vram_gb: 4
  notes: LLaVA LLaMA3-based text encoder with FP8 quantization
  files:
    - source: 'split_files/text_encoders/llava_llama3_fp8_scaled.safetensors'
      dest: 'llava_llama3_fp8_scaled.safetensors'
|
|
|
|
# LLaVA LLaMA3 vision encoder for HunyuanVideo I2V; dest drops the
# split_files/clip_vision/ prefix.
- repo_id: Comfy-Org/HunyuanVideo_repackaged
  description: LLaVA LLaMA3 Vision - Vision encoder for HunyuanVideo I2V
  size_gb: 2
  essential: true
  category: support
  type: clip_vision
  format: safetensors
  vram_gb: 2
  notes: Vision encoder for image-to-video conditioning
  files:
    - source: 'split_files/clip_vision/llava_llama3_vision.safetensors'
      dest: 'llava_llama3_vision.safetensors'
|
|
|
|
# HunyuanVideo 1.5 Support Models
|
|
# HunyuanVideo 1.5 VAE.
# source now carries the split_files/vae/ prefix, matching the path style of
# the Comfy-Org/HunyuanVideo_repackaged entries; dest is unchanged.
- repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
  description: HunyuanVideo 1.5 VAE - VAE for v1.5 models (FP16)
  size_gb: 1
  essential: true
  category: support
  type: vae
  format: fp16
  vram_gb: 2
  notes: VAE autoencoder for HunyuanVideo 1.5
  files:
    - source: 'split_files/vae/hunyuanvideo15_vae_fp16.safetensors'
      dest: 'hunyuanvideo15_vae_fp16.safetensors'
|
|
|
|
# Qwen 2.5 VL 7B text encoder for HunyuanVideo 1.5.
# source now carries the split_files/text_encoders/ prefix, matching the path
# style of the Comfy-Org/HunyuanVideo_repackaged entries; dest is unchanged.
- repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
  description: Qwen 2.5 VL 7B FP8 - Vision-language encoder for HunyuanVideo 1.5
  size_gb: 14
  essential: true
  category: support
  type: text_encoders
  format: fp8_scaled
  vram_gb: 8
  notes: Qwen 2.5 VL 7B text encoder with FP8 quantization
  files:
    - source: 'split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors'
      dest: 'qwen_2.5_vl_7b_fp8_scaled.safetensors'
|
|
|
|
# ByT5 Small GlyphXL text encoder for HunyuanVideo 1.5.
# source now carries the split_files/text_encoders/ prefix, matching the path
# style of the Comfy-Org/HunyuanVideo_repackaged entries; dest is unchanged.
- repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
  description: ByT5 Small GlyphXL FP16 - Glyph-aware text encoder for HunyuanVideo 1.5
  size_gb: 0.5
  essential: true
  category: support
  type: text_encoders
  format: fp16
  vram_gb: 1
  notes: ByT5 small text encoder with glyph awareness
  files:
    - source: 'split_files/text_encoders/byt5_small_glyphxl_fp16.safetensors'
      dest: 'byt5_small_glyphxl_fp16.safetensors'
|
|
|
|
# Wan2.2 Support Models
|
|
# Wan2.2 VAE (for the 5B model).
# source now carries the repo's split_files/vae/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan2.2 VAE - VAE for Wan2.2 5B models
  size_gb: 0.5
  essential: true
  category: support
  type: vae
  format: safetensors
  vram_gb: 1
  notes: VAE autoencoder for Wan2.2 5B TI2V model
  files:
    - source: 'split_files/vae/wan2.2_vae.safetensors'
      dest: 'wan2.2_vae.safetensors'
|
|
|
|
# Wan 2.1 VAE (used by the Wan2.2 14B models).
# source now carries the repo's split_files/vae/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wan 2.1 VAE - VAE for Wan2.2 14B models
  size_gb: 0.5
  essential: true
  category: support
  type: vae
  format: safetensors
  vram_gb: 1
  notes: VAE autoencoder for all Wan2.2 14B models (T2V, I2V, S2V, Animate, etc.)
  files:
    - source: 'split_files/vae/wan_2.1_vae.safetensors'
      dest: 'wan_2.1_vae.safetensors'
|
|
|
|
# UMT5-XXL text encoder; note this one comes from the Wan_2.1 repackaged repo.
# source now carries the repo's split_files/text_encoders/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged
  description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models
  size_gb: 10
  essential: true
  category: support
  type: text_encoders
  format: fp8_scaled
  vram_gb: 5
  notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized
  files:
    - source: 'split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors'
      dest: 'umt5_xxl_fp8_e4m3fn_scaled.safetensors'
|
|
|
|
# CLIP Vision H for Wan2.2 Animate.
# source now carries the split_files/clip_vision/ prefix; dest is unchanged.
# NOTE(review): this file is commonly distributed from
# Comfy-Org/Wan_2.1_ComfyUI_repackaged - confirm it also exists in the
# Wan_2.2 repo, otherwise switch repo_id.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: CLIP Vision H - Vision encoder for Wan2.2 Animate mode
  size_gb: 4
  essential: true
  category: support
  type: clip_vision
  format: safetensors
  vram_gb: 2
  notes: CLIP Vision H for reference image in Wan2.2 Animate video-to-video
  files:
    - source: 'split_files/clip_vision/clip_vision_h.safetensors'
      dest: 'clip_vision_h.safetensors'
|
|
|
|
# Wav2Vec2 audio encoder for Wan2.2 S2V.
# source now carries the repo's split_files/audio_encoders/ prefix; dest is unchanged.
# NOTE(review): type is audio_models while the upstream directory is named
# audio_encoders - confirm which target directory the linker should use.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Wav2Vec2 Large English FP16 - Audio encoder for Wan2.2 S2V
  size_gb: 1
  essential: true
  category: support
  type: audio_models
  format: fp16
  vram_gb: 2
  notes: Audio encoder for sound-to-video synchronization
  files:
    - source: 'split_files/audio_encoders/wav2vec2_large_english_fp16.safetensors'
      dest: 'wav2vec2_large_english_fp16.safetensors'
|
|
|
|
# Wan2.2 LoRA Accelerators (4-step distillation)
|
|
# Lightx2v 4-step LoRA for Wan2.2 Animate.
# source now carries the repo's split_files/loras/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Lightx2v I2V Animate LoRA - 4-step acceleration for Wan2.2 Animate
  size_gb: 0.5
  essential: true
  category: support
  type: loras
  format: bf16
  vram_gb: 1
  notes: 4-step LoRA for Wan2.2 Animate (480p, cfg distilled), 5x speedup
  files:
    - source: 'split_files/loras/lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors'
      dest: 'lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors'
|
|
|
|
# Lightx2v 4-step LoRA for the Wan2.2 T2V high-noise expert.
# source now carries the repo's split_files/loras/ prefix; dest is unchanged.
# NOTE(review): no matching T2V low-noise LoRA entry is visible in this
# section, although the I2V LoRAs come as a high/low pair - confirm whether
# one is missing.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Lightx2v T2V High Noise LoRA - 4-step acceleration for Wan2.2 T2V high noise
  size_gb: 0.5
  essential: true
  category: support
  type: loras
  format: safetensors
  vram_gb: 1
  notes: 4-step LoRA for T2V high noise expert, v1.1
  files:
    - source: 'split_files/loras/wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors'
      dest: 'wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors'
|
|
|
|
# Lightx2v 4-step LoRA for the Wan2.2 I2V high-noise expert.
# source now carries the repo's split_files/loras/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Lightx2v I2V High Noise LoRA - 4-step acceleration for Wan2.2 I2V high noise
  size_gb: 0.5
  essential: true
  category: support
  type: loras
  format: safetensors
  vram_gb: 1
  notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera high noise expert
  files:
    - source: 'split_files/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors'
      dest: 'wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors'
|
|
|
|
# Lightx2v 4-step LoRA for the Wan2.2 I2V low-noise expert.
# source now carries the repo's split_files/loras/ prefix; dest is unchanged.
- repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
  description: Lightx2v I2V Low Noise LoRA - 4-step acceleration for Wan2.2 I2V low noise
  size_gb: 0.5
  essential: true
  category: support
  type: loras
  format: safetensors
  vram_gb: 1
  notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera low noise expert
  files:
    - source: 'split_files/loras/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors'
      dest: 'wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors'
|
|
|
|
# ==========================================================================
|
|
# ANIMATEDIFF MODELS
|
|
# ==========================================================================
|
|
animatediff_models:
|
|
# AnimateDiff motion module for SD 1.5.
# format was "safetensors", but the only file listed is a .ckpt, so format
# now matches the actual container (as "pth" does for the upscalers).
# NOTE(review): filename (mm_sd_v15) does not match the linked file
# (mm_sd_v15_v2.ckpt), and no other entry has a filename key - confirm
# whether anything reads it.
- repo_id: guoyww/animatediff
  description: AnimateDiff Motion Modules
  size_gb: 2
  essential: true
  category: animatediff
  type: animatediff_models
  filename: mm_sd_v15
  format: ckpt
  vram_gb: 4
  notes: Motion modules for AnimateDiff text-to-video
  files:
    - source: 'mm_sd_v15_v2.ckpt'
      dest: 'mm_sd_v15_v2.ckpt'
|
|
|
|
# ==========================================================================
|
|
# CONTROLNET MODELS
|
|
# ==========================================================================
|
|
controlnet_models:
|
|
# SD 1.5 Canny ControlNet; dest renames the generic upstream file after the repo.
- repo_id: lllyasviel/control_v11p_sd15_canny
  description: ControlNet Canny - Edge detection control for SD 1.5
  size_gb: 1.5
  essential: false
  category: controlnet
  type: controlnet
  format: safetensors
  vram_gb: 2
  notes: Precise edge-based composition control
  files:
    - source: 'diffusion_pytorch_model.safetensors'
      dest: 'control_v11p_sd15_canny.safetensors'
|
|
|
|
# SD 1.5 Depth ControlNet.
# dest was control_v11p_sd15_depth.safetensors, which drops the "f1" from the
# repo name (control_v11f1p_sd15_depth); every other ControlNet entry names
# dest after its repo, so this one now does too.
# NOTE(review): workflows that reference the old dest filename need updating.
- repo_id: lllyasviel/control_v11f1p_sd15_depth
  description: ControlNet Depth - Depth map control for SD 1.5
  size_gb: 1.5
  essential: false
  category: controlnet
  type: controlnet
  format: safetensors
  vram_gb: 2
  notes: Depth-based spatial control
  files:
    - source: 'diffusion_pytorch_model.safetensors'
      dest: 'control_v11f1p_sd15_depth.safetensors'
|
|
|
|
# SDXL Canny ControlNet; dest renames the generic upstream file after the repo.
- repo_id: diffusers/controlnet-canny-sdxl-1.0
  description: ControlNet Canny SDXL - Edge detection for SDXL
  size_gb: 2.5
  essential: false
  category: controlnet
  type: controlnet
  format: safetensors
  vram_gb: 3
  notes: Canny edge control for SDXL models
  files:
    - source: 'diffusion_pytorch_model.safetensors'
      dest: 'controlnet-canny-sdxl-1.0.safetensors'
|
|
|
|
- repo_id: diffusers/controlnet-depth-sdxl-1.0
|
|
description: ControlNet Depth SDXL - Depth map for SDXL
|
|
size_gb: 2.5
|
|
essential: false
|
|
category: controlnet
|
|
type: controlnet
|
|
format: safetensors
|
|
vram_gb: 3
|
|
notes: Depth control for SDXL models
|
|
files:
|
|
- source: "diffusion_pytorch_model.safetensors"
|
|
dest: "controlnet-depth-sdxl-1.0.safetensors"
|
|
|
|
# ==========================================================================
|
|
# IP-ADAPTER MODELS
|
|
# ==========================================================================
|
|
ipadapter_models:
  # Every entry pulls one file from the sdxl_models/ folder of the same
  # repository (h94/IP-Adapter); `dest` drops that folder prefix.

  # Essential entries.
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Base - Style & Composition
    size_gb: 1.3
    essential: true
    category: ipadapter
    type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Basic IP-Adapter for SDXL
    files:
      - source: 'sdxl_models/ip-adapter_sdxl.safetensors'
        dest: 'ip-adapter_sdxl.safetensors'

  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
    size_gb: 0.9
    essential: true
    category: ipadapter
    type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: IP-Adapter for SDXL with VIT-H CLIP vision model
    files:
      - source: 'sdxl_models/ip-adapter_sdxl_vit-h.safetensors'
        dest: 'ip-adapter_sdxl_vit-h.safetensors'

  # Optional entries.
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Plus - High Strength Composition
    size_gb: 0.9
    essential: false
    category: ipadapter
    type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Enhanced composition control with higher strength
    files:
      - source: 'sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors'
        dest: 'ip-adapter-plus_sdxl_vit-h.safetensors'

  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Plus Face - Face-focused generation
    size_gb: 0.5
    essential: false
    category: ipadapter
    type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Specialized for face transfer and portrait generation
    files:
      - source: 'sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors'
        dest: 'ip-adapter-plus-face_sdxl_vit-h.safetensors'
|
|
|
|
# ==========================================================================
|
|
# DIFFRHYTHM MODELS (Full-length song generation)
|
|
# ==========================================================================
|
|
diffrhythm_models:
  # Generation checkpoints. The 1_2, full and base repositories each name
  # their checkpoint cfm_model.pt, so `dest` assigns the three files
  # different names.
  - repo_id: ASLP-lab/DiffRhythm-1_2
    description: DiffRhythm 1.2 - 95 second generation model
    size_gb: 2
    essential: true
    category: diffrhythm
    type: TTS/DiffRhythm
    format: pt
    vram_gb: 12
    duration_seconds: 95
    notes: Latest 95-second generation model
    files:
      - source: 'cfm_model.pt'
        dest: 'cfm_model_v1_2.pt'

  # duration_seconds 285 corresponds to the 4m45s stated in the description.
  - repo_id: ASLP-lab/DiffRhythm-full
    description: DiffRhythm Full - 4m45s full-length generation
    size_gb: 2
    essential: false
    category: diffrhythm
    type: TTS/DiffRhythm
    format: pt
    vram_gb: 16
    duration_seconds: 285
    notes: Full-length 4 minute 45 second music generation
    files:
      - source: 'cfm_model.pt'
        dest: 'cfm_full_model.pt'

  - repo_id: ASLP-lab/DiffRhythm-base
    description: DiffRhythm Base - 95 second base model
    size_gb: 2
    essential: false
    category: diffrhythm
    type: TTS/DiffRhythm
    format: pt
    vram_gb: 12
    duration_seconds: 95
    notes: Base 95-second model
    files:
      - source: 'cfm_model.pt'
        dest: 'cfm_model.pt'

  # Autoencoder component; file name is kept unchanged.
  - repo_id: ASLP-lab/DiffRhythm-vae
    description: DiffRhythm VAE - Variational autoencoder
    size_gb: 1
    essential: true
    category: diffrhythm
    type: TTS/DiffRhythm
    format: pt
    vram_gb: 2
    notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License)
    files:
      - source: 'vae_model.pt'
        dest: 'vae_model.pt'

  # Supporting encoders. Each uses its own `type` path under TTS/DiffRhythm
  # and keeps the upstream file names as-is.
  - repo_id: OpenMuQ/MuQ-MuLan-large
    description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
    size_gb: 3
    essential: true
    category: diffrhythm
    type: TTS/DiffRhythm/MuQ-MuLan-large
    format: bin
    vram_gb: 4
    notes: Music-text joint embedding for semantic understanding (English/Chinese)
    files:
      - source: 'config.json'
        dest: 'config.json'
      - source: 'pytorch_model.bin'
        dest: 'pytorch_model.bin'

  - repo_id: OpenMuQ/MuQ-large-msd-iter
    description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
    size_gb: 1.2
    essential: true
    category: diffrhythm
    type: TTS/DiffRhythm/MuQ-large-msd-iter
    format: safetensors
    vram_gb: 2
    notes: Music representation model trained on Million Song Dataset
    files:
      - source: 'config.json'
        dest: 'config.json'
      - source: 'model.safetensors'
        dest: 'model.safetensors'

  - repo_id: FacebookAI/xlm-roberta-base
    description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params)
    size_gb: 1.1
    essential: true
    category: diffrhythm
    type: TTS/DiffRhythm/xlm-roberta-base
    format: safetensors
    vram_gb: 1
    notes: Multilingual text encoding for 100 languages
    files:
      - source: 'config.json'
        dest: 'config.json'
      - source: 'model.safetensors'
        dest: 'model.safetensors'
      - source: 'sentencepiece.bpe.model'
        dest: 'sentencepiece.bpe.model'
      - source: 'tokenizer.json'
        dest: 'tokenizer.json'
      - source: 'tokenizer_config.json'
        dest: 'tokenizer_config.json'
|
|
|
|
# ============================================================================
|
|
# STORAGE & VRAM SUMMARIES
|
|
# ============================================================================
|
|
|
|
# Per-category storage figures (presumably gigabytes, matching the size_gb
# key used on model entries - confirm). Each `total` equals the sum of the
# category values listed beside it.
storage_requirements:
  essential_only:
    # FLUX Schnell + SDXL Base
    image: 30
    # CogVideoX + SVD
    video: 28
    # MusicGen Medium
    audio: 11
    # All 3 CLIP models
    support: 11
    # DiffRhythm essential models
    diffrhythm: 10
    # Total essential storage (30 + 28 + 11 + 11 + 10)
    total: 90

  all_models:
    # All image models
    image: 54
    # All video models
    video: 36
    # All audio models
    audio: 36
    # All support models
    support: 11
    # All DiffRhythm models
    diffrhythm: 12
    # Total with optional models (54 + 36 + 36 + 11 + 12)
    total: 149
|
|
total: 149 # Total with optional models
|
|
|
|
# Example model combinations for a single GPU. In every entry below,
# vram_used + remaining adds up to 24.
vram_requirements:
  # For 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Image Focus - FLUX FP16
      models:
        - FLUX.1 Schnell
      vram_used: 23
      remaining: 1

    - name: Image Focus - FLUX FP8 + SDXL
      models:
        - FLUX.1 Schnell FP8
        - SDXL Base
      vram_used: 24
      remaining: 0

    - name: Video Generation
      models:
        - CogVideoX-5B optimized
        - SDXL
      vram_used: 24
      remaining: 0

    - name: Multi-Modal
      models:
        - SDXL
        - MusicGen Medium
      vram_used: 20
      remaining: 4
|
|
|
|
# ============================================================================
|
|
# INSTALLATION PROFILES
|
|
# ============================================================================
|
|
|
|
# Named download presets. Each profile lists the model category keys it
# covers; `essential_only: true` presumably restricts the selection to
# entries marked `essential: true` (confirm against the download script).
#
# NOTE(review): no profile lists animatediff_models, controlnet_models,
# ipadapter_models or diffrhythm_models, although `complete` is described as
# "All models". Confirm whether those categories are meant to be selectable
# through a profile; storage_gb values would need updating if they are added.
installation_profiles:
  minimal:
    description: Minimal setup for testing
    categories: [support_models]
    storage_gb: 11
    estimated_time: 5-10 minutes

  essential:
    description: Essential models only (~80GB)
    categories: [image_models, video_models, audio_models, support_models]
    essential_only: true
    storage_gb: 80
    estimated_time: 1-2 hours

  image_focused:
    description: All image generation models
    categories: [image_models, support_models]
    storage_gb: 65
    estimated_time: 45-90 minutes

  video_focused:
    # This profile sets essential_only and its 69 GB figure equals the
    # essential-only image + video + support sizes (30 + 28 + 11), so the
    # description says "Essential" rather than "All".
    description: Essential video generation models
    categories: [video_models, image_models, support_models]
    essential_only: true
    storage_gb: 69
    estimated_time: 1-2 hours

  complete:
    description: All models (including optional)
    categories: [image_models, video_models, audio_models, support_models]
    storage_gb: 137
    estimated_time: 2-4 hours
|
|
|
|
# ============================================================================
|
|
# METADATA
|
|
# ============================================================================
|
|
|
|
# File-level metadata. Version and date are quoted so every loader reads
# them as plain strings; an unquoted ISO date is implicitly converted to a
# date/timestamp object by YAML 1.1 loaders.
metadata:
  version: "1.0.0"
  last_updated: "2025-11-21"
  compatible_with:
    - ComfyUI >= 0.1.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  # NOTE(review): "yourusername" looks like a template placeholder - confirm
  # the real repository URL.
  repository: https://github.com/yourusername/runpod
|