# ============================================================================
# ComfyUI Model Configuration
# ============================================================================
#
# This configuration file defines all available ComfyUI models for download.
# Models are organized by category: image, video, audio, and support models.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (image/video/audio/support)
#
# ============================================================================

# Global settings
settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600
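
# ----------------------------------------------------------------------------
# Usage sketch (kept as comments so this file remains valid YAML). A minimal,
# hypothetical example of how a provisioning script might consume the settings
# above and the model entries below via the huggingface_hub API; the file name
# "models.yaml" and the rename step are assumptions, not part of this config.
#
#   import yaml
#   from huggingface_hub import hf_hub_download
#
#   with open("models.yaml") as f:              # hypothetical file name
#       cfg = yaml.safe_load(f)
#
#   settings = cfg["settings"]
#   for entries in cfg["model_categories"].values():
#       for model in entries:
#           if not model.get("essential", False):
#               continue                         # essential-only pass
#           for item in model["files"]:
#               hf_hub_download(
#                   repo_id=model["repo_id"],
#                   filename=item["source"],
#                   cache_dir=settings["cache_dir"],
#               )                                # then copy/rename to item["dest"]
#
# Note: entries whose source is a glob (e.g. "*.safetensors") would need
# snapshot_download(..., allow_patterns=[...]) rather than hf_hub_download.
# ----------------------------------------------------------------------------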
# Model categories
model_categories:

  # ==========================================================================
  # IMAGE GENERATION MODELS
  # ==========================================================================
  image_models:

    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX.1 Schnell - Fast 4-step inference
      size_gb: 23
      essential: true
      category: image
      type: unet
      format: fp16
      vram_gb: 23
      notes: Industry-leading image generation quality
      files:
        - source: "flux1-schnell.safetensors"
          dest: "flux1-schnell.safetensors"

    - repo_id: black-forest-labs/FLUX.1-dev
      description: FLUX.1 Dev - Balanced quality/speed
      size_gb: 23
      essential: false
      category: image
      type: unet
      format: fp16
      vram_gb: 23
      notes: Development version with enhanced features
      files:
        - source: "flux1-dev.safetensors"
          dest: "flux1-dev.safetensors"

    - repo_id: runwayml/stable-diffusion-v1-5
      description: SD 1.5 - For AnimateDiff
      size_gb: 4
      essential: true
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 8
      notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
      files:
        - source: "v1-5-pruned-emaonly.safetensors"
          dest: "v1-5-pruned-emaonly.safetensors"

    - repo_id: stabilityai/stable-diffusion-xl-base-1.0
      description: SDXL Base 1.0 - Industry standard
      size_gb: 7
      essential: true
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Most widely used Stable Diffusion model
      files:
        - source: "sd_xl_base_1.0.safetensors"
          dest: "sd_xl_base_1.0.safetensors"

    - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
      description: SDXL Refiner 1.0 - Enhances base output
      size_gb: 6
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Use after SDXL base for improved details
      files:
        - source: "sd_xl_refiner_1.0.safetensors"
          dest: "sd_xl_refiner_1.0.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: SD 3.5 Large - Latest Stability AI
      size_gb: 18
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 20
      notes: Newest generation Stable Diffusion
      files:
        - source: "sd3.5_large.safetensors"
          dest: "sd3.5_large.safetensors"

    - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl
      description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects
      size_gb: 7
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious quality
      files:
        - source: "*.safetensors"
          dest: "diving-illustrious-real-asian-v50-sdxl.safetensors"

    - repo_id: playgroundai/playground-v2.5-1024px-aesthetic
      description: Playground v2.5 - 1024px aesthetic images
      size_gb: 7
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user studies
      files:
        - source: "*.safetensors"
          dest: "playground-v2.5-1024px-aesthetic.safetensors"

    - repo_id: Lykon/dreamshaper-8
      description: DreamShaper 8 - Multi-style versatile model
      size_gb: 4
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 8
      notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with strong LoRA support
      files:
        - source: "*.safetensors"
          dest: "dreamshaper-8.safetensors"

  # ==========================================================================
  # VIDEO GENERATION MODELS
  # ==========================================================================
  video_models:

    - repo_id: THUDM/CogVideoX-5b
      description: CogVideoX-5B - Professional text-to-video
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
      files:
        - source: "transformer/diffusion_pytorch_model-00001-of-00002.safetensors"
          dest: "cogvideox-5b-transformer-00001-of-00002.safetensors"
        - source: "transformer/diffusion_pytorch_model-00002-of-00002.safetensors"
          dest: "cogvideox-5b-transformer-00002-of-00002.safetensors"
        - source: "transformer/diffusion_pytorch_model.safetensors.index.json"
          dest: "cogvideox-5b-transformer.safetensors.index.json"

    - repo_id: THUDM/CogVideoX-5b-I2V
      description: CogVideoX-5B-I2V - Image-to-video generation
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
      files:
        - source: "transformer/diffusion_pytorch_model-00001-of-00003.safetensors"
          dest: "cogvideox-5b-i2v-transformer-00001-of-00003.safetensors"
        - source: "transformer/diffusion_pytorch_model-00002-of-00003.safetensors"
          dest: "cogvideox-5b-i2v-transformer-00002-of-00003.safetensors"
        - source: "transformer/diffusion_pytorch_model-00003-of-00003.safetensors"
          dest: "cogvideox-5b-i2v-transformer-00003-of-00003.safetensors"
        - source: "transformer/diffusion_pytorch_model.safetensors.index.json"
          dest: "cogvideox-5b-i2v-transformer.safetensors.index.json"

    - repo_id: stabilityai/stable-video-diffusion-img2vid
      description: SVD - 14 frame image-to-video
      size_gb: 8
      essential: true
      category: video
      type: checkpoints
      format: fp16
      vram_gb: 20
      frames: 14
      resolution: 576x1024
      notes: Convert images to short video clips
      files:
        - source: "svd.safetensors"
          dest: "svd.safetensors"

    - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
      description: SVD-XT - 25 frame image-to-video
      size_gb: 8
      essential: false
      category: video
      type: checkpoints
      format: fp16
      vram_gb: 20
      frames: 25
      resolution: 576x1024
      notes: Extended frame count version
      files:
        - source: "svd_xt.safetensors"
          dest: "svd_xt.safetensors"

    # HunyuanVideo - Original (720p, T2V/I2V)
    - repo_id: Comfy-Org/HunyuanVideo_repackaged
      description: HunyuanVideo T2V - 720p text-to-video with MLLM encoders
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: bf16
      vram_gb: 24
      frames: 129
      resolution: 720p
      notes: 5-second T2V generation with Chinese/English support, DiT architecture with 3D VAE
      files:
        - source: "split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors"
          dest: "hunyuan_video_t2v_720p_bf16.safetensors"

    - repo_id: Comfy-Org/HunyuanVideo_repackaged
      description: HunyuanVideo I2V v1 - 720p image-to-video (concat method)
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: bf16
      vram_gb: 24
      frames: 129
      resolution: 720p
      notes: Static image to video with concat conditioning, better motion fluidity
      files:
        - source: "split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors"
          dest: "hunyuan_video_image_to_video_720p_bf16.safetensors"

    - repo_id: Comfy-Org/HunyuanVideo_repackaged
      description: HunyuanVideo I2V v2 - 720p image-to-video (replace method)
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: bf16
      vram_gb: 24
      frames: 129
      resolution: 720p
      notes: Updated I2V with replace conditioning, better image guidance adherence
      files:
        - source: "split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors"
          dest: "hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors"

    # HunyuanVideo 1.5 - Latest generation (720p/1080p, T2V/I2V)
    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
      description: HunyuanVideo 1.5 T2V - 720p text-to-video (8.3B parameters)
      size_gb: 18
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 24
      frames: 129-257
      resolution: 720p
      notes: 5-10 second T2V with Qwen 2.5 VL encoder, requires 24GB VRAM
      files:
        - source: "hunyuanvideo1.5_720p_t2v_fp16.safetensors"
          dest: "hunyuanvideo1.5_720p_t2v_fp16.safetensors"

    - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
      description: HunyuanVideo 1.5 SR - 1080p super-resolution (distilled)
      size_gb: 18
      essential: false
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 24
      frames: 129-257
      resolution: 1080p
      notes: Upscales 720p to 1080p with distilled model for faster generation
      files:
        - source: "hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors"
          dest: "hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors"

    # Wan2.2 5B - Hybrid text+image to video (low VRAM)
    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 TI2V 5B - Hybrid text+image to video (8GB VRAM)
      size_gb: 10
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 8
      frames: 81
      resolution: 640x640
      notes: Efficient 5B model with native offloading, dual-expert architecture
      files:
        - source: "wan2.2_ti2v_5B_fp16.safetensors"
          dest: "wan2.2_ti2v_5B_fp16.safetensors"

    # Wan2.2 14B T2V - Dual-expert text-to-video
    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 T2V High Noise 14B - Text-to-video high noise expert (FP8)
      size_gb: 14
      essential: true
      category: video
      type: diffusion_models
      format: fp8_scaled
      vram_gb: 24
      frames: 81
      resolution: 640x640
      notes: Dual-expert T2V high noise denoising, FP8 quantized for 24GB GPU
      files:
        - source: "wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors"
          dest: "wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 T2V Low Noise 14B - Text-to-video low noise expert (FP8)
      size_gb: 14
      essential: true
      category: video
      type: diffusion_models
      format: fp8_scaled
      vram_gb: 24
      frames: 81
      resolution: 640x640
      notes: Dual-expert T2V low noise refinement, FP8 quantized for 24GB GPU
      files:
        - source: "wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors"
          dest: "wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors"

    # Wan2.2 14B I2V - Image-to-video with content consistency
    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 I2V High Noise 14B - Image-to-video high noise expert (FP16)
      size_gb: 28
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 24
      frames: 81
      resolution: 640x640
      notes: Dual-expert I2V high noise denoising with content consistency
      files:
source: "wan2.2_i2v_high_noise_14B_fp16.safetensors" dest: "wan2.2_i2v_high_noise_14B_fp16.safetensors" - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 I2V Low Noise 14B - Image-to-video low noise expert (FP16) size_gb: 28 essential: true category: video type: diffusion_models format: fp16 vram_gb: 24 frames: 81 resolution: 640x640 notes: Dual-expert I2V low noise refinement with content consistency files: - source: "wan2.2_i2v_low_noise_14B_fp16.safetensors" dest: "wan2.2_i2v_low_noise_14B_fp16.safetensors" # Wan2.2 14B Animate - Video-to-video character animation - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 Animate 14B - Video-to-video character animation (BF16) size_gb: 28 essential: true category: video type: diffusion_models format: bf16 vram_gb: 24 frames: 81 resolution: multiples of 16 notes: V2V animation with Mix/Move modes, requires CLIP Vision H for reference image files: - source: "wan2.2_animate_14B_bf16.safetensors" dest: "wan2.2_animate_14B_bf16.safetensors" # Wan2.2 14B S2V - Sound-to-video synchronization - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 S2V 14B - Sound-to-video with audio sync (FP8) size_gb: 14 essential: true category: video type: diffusion_models format: fp8_scaled vram_gb: 24 frames: 81 resolution: 640x640 notes: Transforms static images + audio into synchronized videos, uses Wav2Vec2 audio encoder files: - source: "wan2.2_s2v_14B_fp8_scaled.safetensors" dest: "wan2.2_s2v_14B_fp8_scaled.safetensors" - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 S2V 14B - Sound-to-video with audio sync (BF16 quality) size_gb: 28 essential: false category: video type: diffusion_models format: bf16 vram_gb: 24 frames: 81 resolution: 640x640 notes: Higher quality BF16 version of S2V for better output quality files: - source: "wan2.2_s2v_14B_bf16.safetensors" dest: "wan2.2_s2v_14B_bf16.safetensors" # Wan2.2 14B Fun Inpaint - Start-end frame controlled generation - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 Fun Inpaint High Noise 14B - Start-end frame transition (FP8) size_gb: 14 essential: true category: video type: diffusion_models format: fp8_scaled vram_gb: 24 frames: 81 resolution: 640x640 notes: Generates transition between start and end frames with high noise denoising files: - source: "wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors" dest: "wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors" - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 Fun Inpaint Low Noise 14B - Start-end frame transition (FP8) size_gb: 14 essential: true category: video type: diffusion_models format: fp8_scaled vram_gb: 24 frames: 81 resolution: 640x640 notes: Generates transition between start and end frames with low noise refinement files: - source: "wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors" dest: "wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors" # Wan2.2 14B Fun Control - ControlNet-style conditioning - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 Fun Control High Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory) size_gb: 14 essential: true category: video type: diffusion_models format: fp8_scaled vram_gb: 24 frames: 81 resolution: 640x640 notes: I2V with control conditions (Canny, Depth, OpenPose, MLSD, trajectory), requires controlnet_aux files: - source: "wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors" dest: "wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors" - repo_id: 
    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 Fun Control Low Noise 14B - Control conditions (Canny/Depth/Pose/MLSD/trajectory)
      size_gb: 14
      essential: true
      category: video
      type: diffusion_models
      format: fp8_scaled
      vram_gb: 24
      frames: 81
      resolution: 640x640
      notes: I2V with control conditions, low noise refinement
      files:
        - source: "wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors"
          dest: "wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors"

    # Wan2.2 14B Fun Camera - Camera motion control
    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 Fun Camera High Noise 14B - Camera motion control (pan/zoom/static)
      size_gb: 14
      essential: true
      category: video
      type: diffusion_models
      format: fp8_scaled
      vram_gb: 24
      frames: 81
      resolution: 640x640
      notes: I2V with camera motion control (pan, zoom, static), 108s with LoRA / 536s without
      files:
        - source: "wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors"
          dest: "wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wan2.2 Fun Camera Low Noise 14B - Camera motion control (pan/zoom/static)
      size_gb: 14
      essential: true
      category: video
      type: diffusion_models
      format: fp8_scaled
      vram_gb: 24
      frames: 81
      resolution: 640x640
      notes: I2V with camera motion control, low noise refinement
      files:
        - source: "wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors"
          dest: "wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors"

  # ==========================================================================
  # AUDIO GENERATION MODELS
  # ==========================================================================
  audio_models:

    - repo_id: facebook/musicgen-small
      description: MusicGen Small - Fast generation
      size_gb: 3
      essential: false
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 4
      duration_seconds: 30
      notes: Fastest music generation, lower quality
      files:
        - source: "pytorch_model.bin"
          dest: "musicgen-small-pytorch_model.bin"

    - repo_id: facebook/musicgen-medium
      description: MusicGen Medium - Balanced quality
      size_gb: 11
      essential: true
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 8
      duration_seconds: 30
      notes: Best balance of speed and quality
      files:
        - source: "pytorch_model.bin"
          dest: "musicgen-medium-pytorch_model.bin"

    - repo_id: facebook/musicgen-large
      description: MusicGen Large - Highest quality
      size_gb: 22
      essential: false
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 16
      duration_seconds: 30
      notes: Best quality, slower generation
      files:
        - source: "pytorch_model-00001-of-00002.bin"
          dest: "musicgen-large-pytorch_model-00001-of-00002.bin"
        - source: "pytorch_model-00002-of-00002.bin"
          dest: "musicgen-large-pytorch_model-00002-of-00002.bin"
        - source: "pytorch_model.bin.index.json"
          dest: "musicgen-large-pytorch_model.bin.index.json"

    # ACE Step v1 3.5B - State-of-the-art music generation
    - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
      description: ACE Step v1 3.5B - Fast coherent music generation with 19-language support
      size_gb: 7.7
      essential: true
      category: audio
      type: checkpoints
      format: safetensors
      vram_gb: 16
      duration_seconds: 240
      notes: 15x faster than LLM baselines, superior structural coherence, voice cloning, 19-language lyrics
      files:
        - source: "all_in_one/ace_step_v1_3.5b.safetensors"
          dest: "ace_step_v1_3.5b.safetensors"

    # ACE Step Chinese RAP LoRA (optional)
    - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
      description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop genre
      size_gb: 0.3
      essential: false
      category: audio
      type: loras
      format: safetensors
      notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
      files:
        - source: "pytorch_lora_weights.safetensors"
          dest: "ace-step-chinese-rap-lora.safetensors"

  # ==========================================================================
  # SUPPORT MODELS (CLIP, IP-Adapter, etc.)
  # ==========================================================================
  support_models:

    - repo_id: openai/clip-vit-large-patch14
      description: CLIP H - For SD 1.5 IP-Adapter
      size_gb: 2
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 2
      notes: Text-image understanding model for IP-Adapter
      files:
        - source: "model.safetensors"
          dest: "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors"

    - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
      description: CLIP G - For SDXL IP-Adapter
      size_gb: 7
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 4
      notes: Larger CLIP model for SDXL IP-Adapter
      files:
        - source: "open_clip_model.safetensors"
          dest: "CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors"

    - repo_id: google/siglip-so400m-patch14-384
      description: SigLIP - For FLUX models
      size_gb: 2
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 2
      notes: Advanced image-text alignment
      files:
        - source: "model.safetensors"
          dest: "siglip-so400m-patch14-384.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L and T5-XXL - For FLUX text encoding
      size_gb: 10
      essential: true
      category: support
      type: clip
      format: fp16
      vram_gb: 4
      notes: CLIP text encoders required for FLUX models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"

    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX VAE - Autoencoder for FLUX models
      size_gb: 0.5
      essential: true
      category: support
      type: vae
      format: safetensors
      vram_gb: 1
      notes: VAE autoencoder required for FLUX image decoding
      files:
        - source: "ae.safetensors"
          dest: "ae.safetensors"

    - repo_id: ai-forever/Real-ESRGAN
      description: RealESRGAN x2 - 2x upscaling model
      size_gb: 0.06
      essential: true
      category: support
      type: upscale_models
      format: pth
      vram_gb: 2
      notes: Fast 2x upscaling model for general purpose enhancement
      files:
        - source: "RealESRGAN_x2.pth"
          dest: "RealESRGAN_x2.pth"

    - repo_id: ai-forever/Real-ESRGAN
      description: RealESRGAN x4 - 4x upscaling model
      size_gb: 0.06
      essential: true
      category: support
      type: upscale_models
      format: pth
      vram_gb: 4
      notes: High-quality 4x upscaling model for detail enhancement
      files:
        - source: "RealESRGAN_x4.pth"
          dest: "RealESRGAN_x4.pth"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: T5-XXL FP16 - For CogVideoX text encoding
      size_gb: 9
      essential: true
      category: support
      type: text_encoders
      format: fp16
      vram_gb: 4
      notes: T5 text encoder required for CogVideoX models
      files:
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L - For CogVideoX and SD3
      size_gb: 1
      essential: true
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 1
      notes: CLIP-L text encoder for CogVideoX and SD3 models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"

    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-G - For SD3 models
      size_gb: 3
      essential: false
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 2
      notes: CLIP-G text encoder for SD3 models
      files:
"text_encoders/clip_g.safetensors" dest: "clip_g.safetensors" # HunyuanVideo Support Models - repo_id: Comfy-Org/HunyuanVideo_repackaged description: HunyuanVideo VAE - 3D VAE for video encoding/decoding (BF16) size_gb: 1 essential: true category: support type: vae format: bf16 vram_gb: 2 notes: 3D VAE autoencoder for HunyuanVideo models files: - source: "split_files/vae/hunyuan_video_vae_bf16.safetensors" dest: "hunyuan_video_vae_bf16.safetensors" - repo_id: Comfy-Org/HunyuanVideo_repackaged description: LLaVA LLaMA3 FP8 - Multimodal text encoder for HunyuanVideo size_gb: 8 essential: true category: support type: text_encoders format: fp8_scaled vram_gb: 4 notes: LLaVA LLaMA3-based text encoder with FP8 quantization files: - source: "split_files/text_encoders/llava_llama3_fp8_scaled.safetensors" dest: "llava_llama3_fp8_scaled.safetensors" - repo_id: Comfy-Org/HunyuanVideo_repackaged description: LLaVA LLaMA3 Vision - Vision encoder for HunyuanVideo I2V size_gb: 2 essential: true category: support type: clip_vision format: safetensors vram_gb: 2 notes: Vision encoder for image-to-video conditioning files: - source: "split_files/clip_vision/llava_llama3_vision.safetensors" dest: "llava_llama3_vision.safetensors" # HunyuanVideo 1.5 Support Models - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged description: HunyuanVideo 1.5 VAE - VAE for v1.5 models (FP16) size_gb: 1 essential: true category: support type: vae format: fp16 vram_gb: 2 notes: VAE autoencoder for HunyuanVideo 1.5 files: - source: "hunyuanvideo15_vae_fp16.safetensors" dest: "hunyuanvideo15_vae_fp16.safetensors" - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged description: Qwen 2.5 VL 7B FP8 - Vision-language encoder for HunyuanVideo 1.5 size_gb: 14 essential: true category: support type: text_encoders format: fp8_scaled vram_gb: 8 notes: Qwen 2.5 VL 7B text encoder with FP8 quantization files: - source: "qwen_2.5_vl_7b_fp8_scaled.safetensors" dest: "qwen_2.5_vl_7b_fp8_scaled.safetensors" - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged description: ByT5 Small GlyphXL FP16 - Glyph-aware text encoder for HunyuanVideo 1.5 size_gb: 0.5 essential: true category: support type: text_encoders format: fp16 vram_gb: 1 notes: ByT5 small text encoder with glyph awareness files: - source: "byt5_small_glyphxl_fp16.safetensors" dest: "byt5_small_glyphxl_fp16.safetensors" # Wan2.2 Support Models - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan2.2 VAE - VAE for Wan2.2 5B models size_gb: 0.5 essential: true category: support type: vae format: safetensors vram_gb: 1 notes: VAE autoencoder for Wan2.2 5B TI2V model files: - source: "wan2.2_vae.safetensors" dest: "wan2.2_vae.safetensors" - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged description: Wan 2.1 VAE - VAE for Wan2.2 14B models size_gb: 0.5 essential: true category: support type: vae format: safetensors vram_gb: 1 notes: VAE autoencoder for all Wan2.2 14B models (T2V, I2V, S2V, Animate, etc.) 
      files:
        - source: "wan_2.1_vae.safetensors"
          dest: "wan_2.1_vae.safetensors"

    - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged
      description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models
      size_gb: 10
      essential: true
      category: support
      type: text_encoders
      format: fp8_scaled
      vram_gb: 5
      notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized
      files:
        - source: "umt5_xxl_fp8_e4m3fn_scaled.safetensors"
          dest: "umt5_xxl_fp8_e4m3fn_scaled.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: CLIP Vision H - Vision encoder for Wan2.2 Animate mode
      size_gb: 4
      essential: true
      category: support
      type: clip_vision
      format: safetensors
      vram_gb: 2
      notes: CLIP Vision H for reference image in Wan2.2 Animate video-to-video
      files:
        - source: "clip_vision_h.safetensors"
          dest: "clip_vision_h.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Wav2Vec2 Large English FP16 - Audio encoder for Wan2.2 S2V
      size_gb: 1
      essential: true
      category: support
      type: audio_models
      format: fp16
      vram_gb: 2
      notes: Audio encoder for sound-to-video synchronization
      files:
        - source: "wav2vec2_large_english_fp16.safetensors"
          dest: "wav2vec2_large_english_fp16.safetensors"

    # Wan2.2 LoRA Accelerators (4-step distillation)
    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Lightx2v I2V Animate LoRA - 4-step acceleration for Wan2.2 Animate
      size_gb: 0.5
      essential: true
      category: support
      type: loras
      format: bf16
      vram_gb: 1
      notes: 4-step LoRA for Wan2.2 Animate (480p, cfg distilled), 5x speedup
      files:
        - source: "lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors"
          dest: "lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Lightx2v T2V High Noise LoRA - 4-step acceleration for Wan2.2 T2V high noise
      size_gb: 0.5
      essential: true
      category: support
      type: loras
      format: safetensors
      vram_gb: 1
      notes: 4-step LoRA for T2V high noise expert, v1.1
      files:
        - source: "wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors"
          dest: "wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Lightx2v I2V High Noise LoRA - 4-step acceleration for Wan2.2 I2V high noise
      size_gb: 0.5
      essential: true
      category: support
      type: loras
      format: safetensors
      vram_gb: 1
      notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera high noise expert
      files:
        - source: "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors"
          dest: "wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors"

    - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
      description: Lightx2v I2V Low Noise LoRA - 4-step acceleration for Wan2.2 I2V low noise
      size_gb: 0.5
      essential: true
      category: support
      type: loras
      format: safetensors
      vram_gb: 1
      notes: 4-step LoRA for I2V/Fun Inpaint/Fun Control/Fun Camera low noise expert
      files:
        - source: "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors"
          dest: "wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors"

  # ==========================================================================
  # ANIMATEDIFF MODELS
  # ==========================================================================
  animatediff_models:

    - repo_id: guoyww/animatediff
      description: AnimateDiff Motion Modules
      size_gb: 2
      essential: true
      category: animatediff
      type: animatediff_models
      filename: mm_sd_v15
      format: ckpt
      vram_gb: 4
      notes: Motion modules for AnimateDiff text-to-video
      files:
        - source: "mm_sd_v15_v2.ckpt"
          dest: "mm_sd_v15_v2.ckpt"
  # ==========================================================================
  # CONTROLNET MODELS
  # ==========================================================================
  controlnet_models:

    - repo_id: lllyasviel/control_v11p_sd15_canny
      description: ControlNet Canny - Edge detection control for SD 1.5
      size_gb: 1.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 2
      notes: Precise edge-based composition control
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "control_v11p_sd15_canny.safetensors"

    - repo_id: lllyasviel/control_v11f1p_sd15_depth
      description: ControlNet Depth - Depth map control for SD 1.5
      size_gb: 1.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 2
      notes: Depth-based spatial control
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "control_v11p_sd15_depth.safetensors"

    - repo_id: diffusers/controlnet-canny-sdxl-1.0
      description: ControlNet Canny SDXL - Edge detection for SDXL
      size_gb: 2.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 3
      notes: Canny edge control for SDXL models
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "controlnet-canny-sdxl-1.0.safetensors"

    - repo_id: diffusers/controlnet-depth-sdxl-1.0
      description: ControlNet Depth SDXL - Depth map for SDXL
      size_gb: 2.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 3
      notes: Depth control for SDXL models
      files:
        - source: "diffusion_pytorch_model.safetensors"
          dest: "controlnet-depth-sdxl-1.0.safetensors"

  # ==========================================================================
  # IP-ADAPTER MODELS
  # ==========================================================================
  ipadapter_models:

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Base - Style & Composition
      size_gb: 1.3
      essential: true
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Basic IP-Adapter for SDXL
      files:
        - source: "sdxl_models/ip-adapter_sdxl.safetensors"
          dest: "ip-adapter_sdxl.safetensors"

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
      size_gb: 0.9
      essential: true
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: IP-Adapter for SDXL with VIT-H CLIP vision model
      files:
        - source: "sdxl_models/ip-adapter_sdxl_vit-h.safetensors"
          dest: "ip-adapter_sdxl_vit-h.safetensors"

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Plus - High Strength Composition
      size_gb: 0.9
      essential: false
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Enhanced composition control with higher strength
      files:
        - source: "sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors"
          dest: "ip-adapter-plus_sdxl_vit-h.safetensors"

    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Plus Face - Face-focused generation
      size_gb: 0.5
      essential: false
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Specialized for face transfer and portrait generation
      files:
        - source: "sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors"
          dest: "ip-adapter-plus-face_sdxl_vit-h.safetensors"

  # ==========================================================================
  # DIFFRHYTHM MODELS (Full-length song generation)
  # ==========================================================================
  diffrhythm_models:

    - repo_id: ASLP-lab/DiffRhythm-1_2
      description: DiffRhythm 1.2 - 95 second generation model
      size_gb: 2
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 12
      duration_seconds: 95
      notes: Latest 95-second generation model
      files:
        - source: "cfm_model.pt"
          dest: "cfm_model_v1_2.pt"

    - repo_id: ASLP-lab/DiffRhythm-full
      description: DiffRhythm Full - 4m45s full-length generation
      size_gb: 2
      essential: false
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 16
      duration_seconds: 285
      notes: Full-length 4 minute 45 second music generation
      files:
        - source: "cfm_model.pt"
          dest: "cfm_full_model.pt"

    - repo_id: ASLP-lab/DiffRhythm-base
      description: DiffRhythm Base - 95 second base model
      size_gb: 2
      essential: false
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 12
      duration_seconds: 95
      notes: Base 95-second model
      files:
        - source: "cfm_model.pt"
          dest: "cfm_model.pt"

    - repo_id: ASLP-lab/DiffRhythm-vae
      description: DiffRhythm VAE - Variational autoencoder
      size_gb: 1
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 2
      notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License)
      files:
        - source: "vae_model.pt"
          dest: "vae_model.pt"

    - repo_id: OpenMuQ/MuQ-MuLan-large
      description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
      size_gb: 3
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/MuQ-MuLan-large
      format: bin
      vram_gb: 4
      notes: Music-text joint embedding for semantic understanding (English/Chinese)
      files:
        - source: "config.json"
          dest: "config.json"
        - source: "pytorch_model.bin"
          dest: "pytorch_model.bin"

    - repo_id: OpenMuQ/MuQ-large-msd-iter
      description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
      size_gb: 1.2
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/MuQ-large-msd-iter
      format: safetensors
      vram_gb: 2
      notes: Music representation model trained on Million Song Dataset
      files:
        - source: "config.json"
          dest: "config.json"
        - source: "model.safetensors"
          dest: "model.safetensors"

    - repo_id: FacebookAI/xlm-roberta-base
      description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params)
      size_gb: 1.1
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/xlm-roberta-base
      format: safetensors
      vram_gb: 1
      notes: Multilingual text encoding for 100 languages
      files:
        - source: "config.json"
          dest: "config.json"
        - source: "model.safetensors"
          dest: "model.safetensors"
        - source: "sentencepiece.bpe.model"
          dest: "sentencepiece.bpe.model"
        - source: "tokenizer.json"
          dest: "tokenizer.json"
        - source: "tokenizer_config.json"
          dest: "tokenizer_config.json"

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================
storage_requirements:
  essential_only:
    image: 30        # FLUX Schnell + SDXL Base
    video: 28        # CogVideoX + SVD
    audio: 11        # MusicGen Medium
    support: 11      # All 3 CLIP models
    diffrhythm: 10   # DiffRhythm essential models
    total: 90        # Total essential storage
  all_models:
    image: 54        # All image models
    video: 36        # All video models
    audio: 36        # All audio models
    support: 11      # All support models
    diffrhythm: 12   # All DiffRhythm models
    total: 149       # Total with optional models

vram_requirements:
  # For 24GB GPU (RTX 4090)
  simultaneous_loadable:
    - name: Image Focus - FLUX FP16
      models: [FLUX.1 Schnell]
      vram_used: 23
      remaining: 1
    - name: Image Focus - FLUX FP8 + SDXL
      models: [FLUX.1 Schnell FP8, SDXL Base]
      vram_used: 24
      remaining: 0
    - name: Video Generation
      models: [CogVideoX-5B optimized, SDXL]
      vram_used: 24
      remaining: 0
    - name: Multi-Modal
      models: [SDXL, MusicGen Medium]
      vram_used: 20
      remaining: 4
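
# The combinations above assume a single 24 GB GPU; "remaining" is simply the
# budget minus the summed vram_gb of the listed models. Illustrative check
# (hypothetical helper, kept as comments so this file stays valid YAML):
#
#   GPU_VRAM_GB = 24
#   def fits(models, budget=GPU_VRAM_GB):
#       used = sum(m["vram_gb"] for m in models)
#       return used <= budget, budget - used   # (fits?, remaining GB)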
# ============================================================================
# INSTALLATION PROFILES
# ============================================================================
installation_profiles:
  minimal:
    description: Minimal setup for testing
    categories: [support_models]
    storage_gb: 11
    estimated_time: 5-10 minutes
  essential:
    description: Essential models only (~80GB)
    categories: [image_models, video_models, audio_models, support_models]
    essential_only: true
    storage_gb: 80
    estimated_time: 1-2 hours
  image_focused:
    description: All image generation models
    categories: [image_models, support_models]
    storage_gb: 65
    estimated_time: 45-90 minutes
  video_focused:
    description: All video generation models
    categories: [video_models, image_models, support_models]
    essential_only: true
    storage_gb: 69
    estimated_time: 1-2 hours
  complete:
    description: All models (including optional)
    categories: [image_models, video_models, audio_models, support_models]
    storage_gb: 137
    estimated_time: 2-4 hours

# ============================================================================
# METADATA
# ============================================================================
metadata:
  version: 1.0.0
  last_updated: 2025-11-21
  compatible_with:
    - ComfyUI >= 0.1.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  repository: https://github.com/yourusername/runpod
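
# ----------------------------------------------------------------------------
# Profile selection sketch (comments only; hypothetical helper, not shipped
# with this config). It illustrates how the installation_profiles above are
# meant to be interpreted: pick the listed categories from model_categories
# and, when essential_only is set, keep only entries flagged essential: true.
#
#   def resolve_profile(cfg, name):
#       profile = cfg["installation_profiles"][name]
#       selected = []
#       for cat in profile["categories"]:
#           for model in cfg["model_categories"].get(cat, []):
#               if profile.get("essential_only") and not model.get("essential"):
#                   continue
#               selected.append(model)
#       return selected
#
#   # e.g. resolve_profile(cfg, "video_focused")
#   #   -> essential entries from video_models, image_models, support_models
# ----------------------------------------------------------------------------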