# runpod/models_huggingface.yaml

# Global download options.
settings:
  # Timeout in seconds: 3600 s = 1 hour.
  # NOTE(review): presumably applied per download - confirm with the consumer.
  timeout_seconds: 3600
  retry_attempts: 3
  parallel_downloads: 1
  # Absolute path of the Hugging Face cache directory.
  cache_dir: /workspace/huggingface_cache
# Model catalog grouped by category. Each entry carries a Hugging Face
# `repo_id`, descriptive metadata (`size_gb`, `vram_gb`, `essential`, ...) and a
# `files` list of `source` -> `dest` path pairs.
model_categories:
  # Image generation models (every entry has `category: image`).
  image_models:
  # FLUX.1 Schnell weights; essential. The same repo is listed again under
  # support_models for its `ae.safetensors` VAE file.
  - repo_id: black-forest-labs/FLUX.1-schnell
    description: FLUX.1 Schnell - Fast 4-step inference
    size_gb: 23
    essential: true
    category: image
    format: fp16
    vram_gb: 23
    notes: Industry-leading image generation quality
    files:
    - source: flux1-schnell.safetensors
      dest: unet/flux1-schnell.safetensors
  # FLUX.1 Dev; optional (`essential: false`).
  # NOTE(review): FLUX.1-dev is believed to be a gated repo on the Hub, so the
  # download presumably needs an access token - confirm.
  - repo_id: black-forest-labs/FLUX.1-dev
    description: FLUX.1 Dev - Balanced quality/speed
    size_gb: 23
    essential: false
    category: image
    format: fp16
    vram_gb: 23
    notes: Development version with enhanced features
    files:
    - source: flux1-dev.safetensors
      dest: unet/flux1-dev.safetensors
  # SD 1.5 base checkpoint; essential because the AnimateDiff motion modules
  # require it (see notes).
  # The original runwayml/stable-diffusion-v1-5 repo is no longer available on
  # the Hub; stable-diffusion-v1-5/stable-diffusion-v1-5 is the mirror that
  # hosts the same v1-5-pruned-emaonly.safetensors file.
  - repo_id: stable-diffusion-v1-5/stable-diffusion-v1-5
    description: SD 1.5 - For AnimateDiff
    size_gb: 4
    essential: true
    category: image
    format: fp16
    vram_gb: 8
    notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
    files:
    - source: v1-5-pruned-emaonly.safetensors
      dest: checkpoints/v1-5-pruned-emaonly.safetensors
  # SDXL base checkpoint; essential. Single file stored under checkpoints/.
  - repo_id: stabilityai/stable-diffusion-xl-base-1.0
    description: SDXL Base 1.0 - Industry standard
    size_gb: 7
    essential: true
    category: image
    format: fp16
    vram_gb: 12
    notes: Most widely used Stable Diffusion model
    files:
    - source: sd_xl_base_1.0.safetensors
      dest: checkpoints/sd_xl_base_1.0.safetensors
  # SDXL refiner; optional companion applied after the base model (see notes).
  - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
    description: SDXL Refiner 1.0 - Enhances base output
    size_gb: 6
    essential: false
    category: image
    format: fp16
    vram_gb: 12
    notes: Use after SDXL base for improved details
    files:
    - source: sd_xl_refiner_1.0.safetensors
      dest: checkpoints/sd_xl_refiner_1.0.safetensors
  # SD 3.5 Large: main checkpoint plus its three text encoders; optional.
  # The checkpoint goes to checkpoints/, while the CLIP-L, CLIP-G and T5-XXL
  # encoders go to text_encoders/ (the folder this file already uses for the
  # UMT5 encoder) so that text-encoder loaders can find them.
  # NOTE(review): this repo is believed to be gated on the Hub - confirm the
  # downloader supplies an access token.
  - repo_id: stabilityai/stable-diffusion-3.5-large
    description: SD 3.5 Large Complete - Checkpoint and text encoders
    size_gb: 31
    essential: false
    category: image
    format: mixed
    vram_gb: 20
    notes: Complete SD3.5 Large model with checkpoint and all text encoders (CLIP-L,
      CLIP-G, T5-XXL)
    files:
    - source: sd3.5_large.safetensors
      dest: checkpoints/sd3.5_large.safetensors
    - source: text_encoders/clip_l.safetensors
      dest: text_encoders/clip_l.safetensors
    - source: text_encoders/clip_g.safetensors
      dest: text_encoders/clip_g.safetensors
    - source: text_encoders/t5xxl_fp16.safetensors
      dest: text_encoders/t5xxl_fp16.safetensors
  # SDXL fine-tune (see notes); optional.
  # NOTE(review): `source` is the repo's unet/ weights only, yet `dest` is
  # under checkpoints/ - confirm the consumer can load a UNet-only file there.
  - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl
    description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects
    size_gb: 7
    essential: false
    category: image
    format: fp16
    vram_gb: 12
    notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious
      quality
    files:
    - source: unet/diffusion_pytorch_model.safetensors
      dest: checkpoints/diving-illustrious-real-asian-v50-sdxl.safetensors
  # Playground v2.5; optional. The `.fp16` infix of the source filename is
  # dropped in `dest`.
  - repo_id: playgroundai/playground-v2.5-1024px-aesthetic
    description: Playground v2.5 - 1024px aesthetic images
    size_gb: 7
    essential: false
    category: image
    format: fp16
    vram_gb: 12
    notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user
      studies
    files:
    - source: playground-v2.5-1024px-aesthetic.fp16.safetensors
      dest: checkpoints/playground-v2.5-1024px-aesthetic.safetensors
  # DreamShaper 8 (SD1.5 fine-tune per notes); optional.
  # NOTE(review): as with the John6666 entry, only unet/ weights are fetched
  # into checkpoints/ - confirm.
  - repo_id: Lykon/dreamshaper-8
    description: DreamShaper 8 - Multi-style versatile model
    size_gb: 4
    essential: false
    category: image
    format: fp16
    vram_gb: 8
    notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with
      strong LoRA support
    files:
    - source: unet/diffusion_pytorch_model.fp16.safetensors
      dest: checkpoints/dreamshaper-8.safetensors
  # Video generation models (every entry has `category: video`). Entries add
  # `frames` and `resolution` to the common metadata.
  video_models:
  # CogVideoX-5B text-to-video: two transformer shards plus their index file.
  # NOTE(review): `dest` renames the shards; a loader that goes through the
  # copied .index.json would presumably look for the original shard names -
  # confirm. `notes` also says the loader node downloads this model itself.
  - repo_id: THUDM/CogVideoX-5b
    description: CogVideoX-5B - Professional text-to-video
    size_gb: 20
    essential: true
    category: video
    format: fp16
    vram_gb: 20
    frames: 49
    resolution: 720p
    notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel
      node
    files:
    - source: transformer/diffusion_pytorch_model-00001-of-00002.safetensors
      dest: diffusion_models/cogvideox-5b-transformer-00001-of-00002.safetensors
    - source: transformer/diffusion_pytorch_model-00002-of-00002.safetensors
      dest: diffusion_models/cogvideox-5b-transformer-00002-of-00002.safetensors
    - source: transformer/diffusion_pytorch_model.safetensors.index.json
      dest: diffusion_models/cogvideox-5b-transformer.safetensors.index.json
  # CogVideoX-5B image-to-video: three transformer shards plus their index file.
  # NOTE(review): same shard-renaming question as the text-to-video entry.
  - repo_id: THUDM/CogVideoX-5b-I2V
    description: CogVideoX-5B-I2V - Image-to-video generation
    size_gb: 20
    essential: true
    category: video
    format: fp16
    vram_gb: 20
    frames: 49
    resolution: 720p
    notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
    files:
    - source: transformer/diffusion_pytorch_model-00001-of-00003.safetensors
      dest: diffusion_models/cogvideox-5b-i2v-transformer-00001-of-00003.safetensors
    - source: transformer/diffusion_pytorch_model-00002-of-00003.safetensors
      dest: diffusion_models/cogvideox-5b-i2v-transformer-00002-of-00003.safetensors
    - source: transformer/diffusion_pytorch_model-00003-of-00003.safetensors
      dest: diffusion_models/cogvideox-5b-i2v-transformer-00003-of-00003.safetensors
    - source: transformer/diffusion_pytorch_model.safetensors.index.json
      dest: diffusion_models/cogvideox-5b-i2v-transformer.safetensors.index.json
  # Stable Video Diffusion, 14-frame variant; essential.
  - repo_id: stabilityai/stable-video-diffusion-img2vid
    description: SVD - 14 frame image-to-video
    size_gb: 8
    essential: true
    category: video
    format: fp16
    vram_gb: 20
    frames: 14
    resolution: 576x1024
    notes: Convert images to short video clips
    files:
    - source: svd.safetensors
      dest: checkpoints/svd.safetensors
  # Stable Video Diffusion XT, 25-frame variant; optional.
  - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
    description: SVD-XT - 25 frame image-to-video
    size_gb: 8
    essential: false
    category: video
    format: fp16
    vram_gb: 20
    frames: 25
    resolution: 576x1024
    notes: Extended frame count version
    files:
    - source: svd_xt.safetensors
      dest: checkpoints/svd_xt.safetensors
  # HunyuanVideo bundle: three diffusion models plus VAE, text encoder and
  # CLIP-vision encoder; essential.
  # Each file goes to the `dest` folder matching its role (mirroring the
  # split_files/<role>/ layout of the source paths and the vae/, text_encoders/
  # and clip_vision/ folders used by support_models) instead of everything
  # being placed in diffusion_models/.
  - repo_id: Comfy-Org/HunyuanVideo_repackaged
    description: HunyuanVideo Complete - 720p T2V/I2V models with VAE and encoders
    size_gb: 51
    essential: true
    category: video
    format: bf16
    vram_gb: 24
    frames: 129
    resolution: 720p
    notes: Complete HunyuanVideo family - T2V, I2V v1/v2, 3D VAE, LLaVA LLaMA3 text/vision
      encoders
    files:
    # Diffusion models.
    - source: split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors
      dest: diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors
    - source: split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors
      dest: diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors
    - source: split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors
      dest: diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors
    # VAE.
    - source: split_files/vae/hunyuan_video_vae_bf16.safetensors
      dest: vae/hunyuan_video_vae_bf16.safetensors
    # Text encoder.
    - source: split_files/text_encoders/llava_llama3_fp8_scaled.safetensors
      dest: text_encoders/llava_llama3_fp8_scaled.safetensors
    # CLIP-vision encoder.
    - source: split_files/clip_vision/llava_llama3_vision.safetensors
      dest: clip_vision/llava_llama3_vision.safetensors
  # HunyuanVideo 1.5 bundle: T2V 720p and SR 1080p diffusion models, VAE, and
  # the Qwen 2.5 VL / ByT5 text encoders (see notes); essential.
  # The VAE and the two encoders are routed to vae/ and text_encoders/ rather
  # than diffusion_models/, matching the folders used elsewhere in this file.
  # NOTE(review): unlike the Comfy-Org/HunyuanVideo_repackaged entry, the
  # `source` paths carry no split_files/<role>/ prefix - confirm they match the
  # repo layout.
  - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
    description: HunyuanVideo 1.5 Complete - 720p/1080p T2V/SR with encoders
    size_gb: 51.5
    essential: true
    category: video
    format: fp16
    vram_gb: 24
    frames: 129-257
    resolution: 720p-1080p
    notes: Complete HunyuanVideo 1.5 - T2V 720p, SR 1080p, VAE, Qwen 2.5 VL, ByT5
      GlyphXL encoders
    files:
    # Diffusion models.
    - source: hunyuanvideo1.5_720p_t2v_fp16.safetensors
      dest: diffusion_models/hunyuanvideo1.5_720p_t2v_fp16.safetensors
    - source: hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors
      dest: diffusion_models/hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors
    # VAE.
    - source: hunyuanvideo15_vae_fp16.safetensors
      dest: vae/hunyuanvideo15_vae_fp16.safetensors
    # Text encoders.
    - source: qwen_2.5_vl_7b_fp8_scaled.safetensors
      dest: text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors
    - source: byt5_small_glyphxl_fp16.safetensors
      dest: text_encoders/byt5_small_glyphxl_fp16.safetensors
  # Wan2.2 bundle (see notes for the full list of variants); essential.
  # Only the diffusion models stay in diffusion_models/; the VAEs, the
  # CLIP-vision encoder, the Wav2Vec2 audio encoder and the LoRA files go to
  # vae/, clip_vision/, audio_encoders/ and loras/ respectively, so each loader
  # finds its files in the folder for that model type.
  # NOTE(review): `source` paths have no split_files/<role>/ prefix - confirm
  # they match the repo layout.
  # NOTE(review): the T2V 4-step LoRA is listed for high_noise only, while the
  # I2V LoRA has both high_noise and low_noise - confirm that is intended.
  - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
    description: Wan2.2 Complete - All video models, VAEs, and LoRAs
    size_gb: 220
    essential: true
    category: video
    format: mixed
    vram_gb: 24
    frames: 81
    resolution: 640x640
    notes: Complete Wan2.2 model family - TI2V 5B, T2V 14B, I2V 14B, Animate, S2V,
      Fun Inpaint/Control/Camera, VAEs, CLIP Vision H, Wav2Vec2, and LoRA accelerators
    files:
    # Diffusion models.
    - source: wan2.2_ti2v_5B_fp16.safetensors
      dest: diffusion_models/wan2.2_ti2v_5B_fp16.safetensors
    - source: wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_i2v_high_noise_14B_fp16.safetensors
      dest: diffusion_models/wan2.2_i2v_high_noise_14B_fp16.safetensors
    - source: wan2.2_i2v_low_noise_14B_fp16.safetensors
      dest: diffusion_models/wan2.2_i2v_low_noise_14B_fp16.safetensors
    - source: wan2.2_animate_14B_bf16.safetensors
      dest: diffusion_models/wan2.2_animate_14B_bf16.safetensors
    - source: wan2.2_s2v_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_s2v_14B_fp8_scaled.safetensors
    - source: wan2.2_s2v_14B_bf16.safetensors
      dest: diffusion_models/wan2.2_s2v_14B_bf16.safetensors
    - source: wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors
    # VAEs.
    - source: wan2.2_vae.safetensors
      dest: vae/wan2.2_vae.safetensors
    - source: wan_2.1_vae.safetensors
      dest: vae/wan_2.1_vae.safetensors
    # CLIP-vision encoder.
    - source: clip_vision_h.safetensors
      dest: clip_vision/clip_vision_h.safetensors
    # Audio encoder.
    - source: wav2vec2_large_english_fp16.safetensors
      dest: audio_encoders/wav2vec2_large_english_fp16.safetensors
    # LoRA accelerators.
    - source: lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors
      dest: loras/lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors
    - source: wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors
      dest: loras/wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors
    - source: wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors
      dest: loras/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors
    - source: wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors
      dest: loras/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors
  # Audio / music generation models (every entry has `category: audio`); most
  # entries add `duration_seconds`.
  audio_models:
  # MusicGen Small; optional.
  - repo_id: facebook/musicgen-small
    description: MusicGen Small - Fast generation
    size_gb: 3
    essential: false
    category: audio
    format: fp32
    vram_gb: 4
    duration_seconds: 30
    notes: Fastest music generation, lower quality
    files:
    - source: pytorch_model.bin
      dest: musicgen/musicgen-small-pytorch_model.bin
  # MusicGen Medium; the only MusicGen size marked essential.
  - repo_id: facebook/musicgen-medium
    description: MusicGen Medium - Balanced quality
    size_gb: 11
    essential: true
    category: audio
    format: fp32
    vram_gb: 8
    duration_seconds: 30
    notes: Best balance of speed and quality
    files:
    - source: pytorch_model.bin
      dest: musicgen/musicgen-medium-pytorch_model.bin
  # MusicGen Large; optional. Two weight shards plus their index file.
  # NOTE(review): the shards get a `musicgen-large-` prefix in `dest`; a loader
  # going through the copied index file would presumably expect the original
  # shard names - confirm.
  - repo_id: facebook/musicgen-large
    description: MusicGen Large - Highest quality
    size_gb: 22
    essential: false
    category: audio
    format: fp32
    vram_gb: 16
    duration_seconds: 30
    notes: Best quality, slower generation
    files:
    - source: pytorch_model-00001-of-00002.bin
      dest: musicgen/musicgen-large-pytorch_model-00001-of-00002.bin
    - source: pytorch_model-00002-of-00002.bin
      dest: musicgen/musicgen-large-pytorch_model-00002-of-00002.bin
    - source: pytorch_model.bin.index.json
      dest: musicgen/musicgen-large-pytorch_model.bin.index.json
  # ACE-Step all-in-one checkpoint; essential.
  - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
    description: ACE Step v1 3.5B - Fast coherent music generation with 19-language
      support
    size_gb: 7.7
    essential: true
    category: audio
    format: safetensors
    vram_gb: 16
    duration_seconds: 240
    notes: 15x faster than LLM baselines, superior structural coherence, voice cloning,
      19-language lyrics
    files:
    - source: all_in_one/ace_step_v1_3.5b.safetensors
      dest: checkpoints/ace_step_v1_3.5b.safetensors
  # Optional LoRA for ACE-Step. This entry has no `vram_gb` or
  # `duration_seconds` key.
  - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
    description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop
      genre
    size_gb: 0.3
    essential: false
    category: audio
    format: safetensors
    notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
    files:
    - source: pytorch_lora_weights.safetensors
      dest: loras/ace-step-chinese-rap-lora.safetensors
  # Supporting models: vision/text encoders, VAE and upscalers (every entry has
  # `category: support`).
  support_models:
  # CLIP ViT-H image encoder used by IP-Adapter.
  # openai/clip-vit-large-patch14 is a ViT-L/14 model, so saving it under the
  # CLIP-ViT-H-14 filename would give a mislabeled encoder. The ViT-H encoder
  # is fetched from the h94/IP-Adapter repo, which the ipadapter_models
  # entries already use.
  - repo_id: h94/IP-Adapter
    description: CLIP H - For SD 1.5 IP-Adapter
    size_gb: 2
    essential: true
    category: support
    format: fp32
    vram_gb: 2
    notes: Text-image understanding model for IP-Adapter
    files:
    - source: models/image_encoder/model.safetensors
      dest: clip_vision/CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors
  # CLIP bigG vision encoder for the SDXL IP-Adapter (see description).
  - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
    description: CLIP G - For SDXL IP-Adapter
    size_gb: 7
    essential: true
    category: support
    format: fp32
    vram_gb: 4
    notes: Larger CLIP model for SDXL IP-Adapter
    files:
    - source: open_clip_model.safetensors
      dest: clip_vision/CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors
  # SigLIP encoder; the description ties it to the FLUX models.
  - repo_id: google/siglip-so400m-patch14-384
    description: SigLIP - For FLUX models
    size_gb: 2
    essential: true
    category: support
    format: fp32
    vram_gb: 2
    notes: Advanced image-text alignment
    files:
    - source: model.safetensors
      dest: clip_vision/siglip-so400m-patch14-384.safetensors
  # FLUX VAE, taken from the same repo as the FLUX.1 Schnell image model.
  - repo_id: black-forest-labs/FLUX.1-schnell
    description: FLUX VAE - Autoencoder for FLUX models
    size_gb: 0.5
    essential: true
    category: support
    format: safetensors
    vram_gb: 1
    notes: VAE autoencoder required for FLUX image decoding
    files:
    - source: ae.safetensors
      dest: vae/ae.safetensors
  # Real-ESRGAN 2x upscaler. The x2 and x4 entries share one repo_id; each
  # fetches a different .pth file.
  - repo_id: ai-forever/Real-ESRGAN
    description: RealESRGAN x2 - 2x upscaling model
    size_gb: 0.06
    essential: true
    category: support
    format: pth
    vram_gb: 2
    notes: Fast 2x upscaling model for general purpose enhancement
    files:
    - source: RealESRGAN_x2.pth
      dest: upscale_models/RealESRGAN_x2.pth
  # Real-ESRGAN 4x upscaler.
  - repo_id: ai-forever/Real-ESRGAN
    description: RealESRGAN x4 - 4x upscaling model
    size_gb: 0.06
    essential: true
    category: support
    format: pth
    vram_gb: 4
    notes: High-quality 4x upscaling model for detail enhancement
    files:
    - source: RealESRGAN_x4.pth
      dest: upscale_models/RealESRGAN_x4.pth
  # UMT5-XXL text encoder, pulled from the Wan 2.1 repackaged repo and shared by
  # the Wan2.2 models (see notes).
  # NOTE(review): `source` has no split_files/<role>/ prefix - confirm it
  # matches the repo layout.
  - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged
    description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models
    size_gb: 10
    essential: true
    category: support
    format: fp8_scaled
    vram_gb: 5
    notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized
    files:
    - source: umt5_xxl_fp8_e4m3fn_scaled.safetensors
      dest: text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors
  # AnimateDiff motion modules (used together with the SD 1.5 checkpoint from
  # image_models, per that entry's notes).
  animatediff_models:
  # `format` is `ckpt` to match the listed file, which is a .ckpt rather than
  # a .safetensors file.
  # NOTE(review): `filename` (mm_sd_v15) is not the name of the fetched file
  # (mm_sd_v15_v2.ckpt) and no other entry has this key - confirm how the
  # consumer uses it.
  - repo_id: guoyww/animatediff
    description: AnimateDiff Motion Modules
    size_gb: 2
    essential: true
    category: animatediff
    filename: mm_sd_v15
    format: ckpt
    vram_gb: 4
    notes: Motion modules for AnimateDiff text-to-video
    files:
    - source: mm_sd_v15_v2.ckpt
      dest: animatediff_models/mm_sd_v15_v2.ckpt
  # ControlNet models; all optional. Every entry fetches the repo's
  # `diffusion_pytorch_model.safetensors` and stores it in controlnet/ under
  # the repo's own name, so the files stay distinguishable.
  controlnet_models:
  # Canny (edge) control for SD 1.5.
  - repo_id: lllyasviel/control_v11p_sd15_canny
    description: ControlNet Canny - Edge detection control for SD 1.5
    size_gb: 1.5
    essential: false
    category: controlnet
    format: safetensors
    vram_gb: 2
    notes: Precise edge-based composition control
    files:
    - source: diffusion_pytorch_model.safetensors
      dest: controlnet/control_v11p_sd15_canny.safetensors
  # Depth control for SD 1.5. `dest` keeps the `f1` of the repo name
  # (control_v11f1p), like the other entries that mirror their repo name.
  - repo_id: lllyasviel/control_v11f1p_sd15_depth
    description: ControlNet Depth - Depth map control for SD 1.5
    size_gb: 1.5
    essential: false
    category: controlnet
    format: safetensors
    vram_gb: 2
    notes: Depth-based spatial control
    files:
    - source: diffusion_pytorch_model.safetensors
      dest: controlnet/control_v11f1p_sd15_depth.safetensors
  # Canny (edge) control for SDXL.
  - repo_id: diffusers/controlnet-canny-sdxl-1.0
    description: ControlNet Canny SDXL - Edge detection for SDXL
    size_gb: 2.5
    essential: false
    category: controlnet
    format: safetensors
    vram_gb: 3
    notes: Canny edge control for SDXL models
    files:
    - source: diffusion_pytorch_model.safetensors
      dest: controlnet/controlnet-canny-sdxl-1.0.safetensors
  # Depth control for SDXL.
  - repo_id: diffusers/controlnet-depth-sdxl-1.0
    description: ControlNet Depth SDXL - Depth map for SDXL
    size_gb: 2.5
    essential: false
    category: controlnet
    format: safetensors
    vram_gb: 3
    notes: Depth control for SDXL models
    files:
    - source: diffusion_pytorch_model.safetensors
      dest: controlnet/controlnet-depth-sdxl-1.0.safetensors
  # IP-Adapter weights. All four entries use repo h94/IP-Adapter and fetch one
  # file each from its sdxl_models/ folder into ipadapter/.
  ipadapter_models:
  # Base SDXL adapter; essential.
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Base - Style & Composition
    size_gb: 1.3
    essential: true
    category: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Basic IP-Adapter for SDXL
    files:
    - source: sdxl_models/ip-adapter_sdxl.safetensors
      dest: ipadapter/ip-adapter_sdxl.safetensors
  # SDXL adapter for the ViT-H CLIP vision model (see notes); essential.
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
    size_gb: 0.9
    essential: true
    category: ipadapter
    format: safetensors
    vram_gb: 4
    notes: IP-Adapter for SDXL with VIT-H CLIP vision model
    files:
    - source: sdxl_models/ip-adapter_sdxl_vit-h.safetensors
      dest: ipadapter/ip-adapter_sdxl_vit-h.safetensors
  # "Plus" variant; optional.
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Plus - High Strength Composition
    size_gb: 0.9
    essential: false
    category: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Enhanced composition control with higher strength
    files:
    - source: sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors
      dest: ipadapter/ip-adapter-plus_sdxl_vit-h.safetensors
  # "Plus Face" variant; optional.
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Plus Face - Face-focused generation
    size_gb: 0.5
    essential: false
    category: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Specialized for face transfer and portrait generation
    files:
    - source: sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors
      dest: ipadapter/ip-adapter-plus-face_sdxl_vit-h.safetensors
  # DiffRhythm music generation stack: three model variants, the VAE, and the
  # MuQ / XLM-RoBERTa models. Everything is stored under TTS/DiffRhythm/.
  diffrhythm_models:
  # v1.2 model; essential. All three DiffRhythm variants fetch a file named
  # `cfm_model.pt`, and each is saved under a distinct name in the same folder.
  - repo_id: ASLP-lab/DiffRhythm-1_2
    description: DiffRhythm 1.2 - 95 second generation model
    size_gb: 2
    essential: true
    category: diffrhythm
    format: pt
    vram_gb: 12
    duration_seconds: 95
    notes: Latest 95-second generation model
    files:
    - source: cfm_model.pt
      dest: TTS/DiffRhythm/cfm_model_v1_2.pt
  # Full-length model; optional. 285 s = 4 min 45 s.
  - repo_id: ASLP-lab/DiffRhythm-full
    description: DiffRhythm Full - 4m45s full-length generation
    size_gb: 2
    essential: false
    category: diffrhythm
    format: pt
    vram_gb: 16
    duration_seconds: 285
    notes: Full-length 4 minute 45 second music generation
    files:
    - source: cfm_model.pt
      dest: TTS/DiffRhythm/cfm_full_model.pt
  # Base model; optional. Keeps the original `cfm_model.pt` name.
  - repo_id: ASLP-lab/DiffRhythm-base
    description: DiffRhythm Base - 95 second base model
    size_gb: 2
    essential: false
    category: diffrhythm
    format: pt
    vram_gb: 12
    duration_seconds: 95
    notes: Base 95-second model
    files:
    - source: cfm_model.pt
      dest: TTS/DiffRhythm/cfm_model.pt
  # VAE; essential. No `duration_seconds` key on this or the following entries.
  - repo_id: ASLP-lab/DiffRhythm-vae
    description: DiffRhythm VAE - Variational autoencoder
    size_gb: 1
    essential: true
    category: diffrhythm
    format: pt
    vram_gb: 2
    notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community
      License)
    files:
    - source: vae_model.pt
      dest: TTS/DiffRhythm/vae_model.pt
  # MuQ-MuLan: config plus weights, kept together in a per-model subfolder.
  - repo_id: OpenMuQ/MuQ-MuLan-large
    description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
    size_gb: 3
    essential: true
    category: diffrhythm
    format: bin
    vram_gb: 4
    notes: Music-text joint embedding for semantic understanding (English/Chinese)
    files:
    - source: config.json
      dest: TTS/DiffRhythm/MuQ-MuLan-large/config.json
    - source: pytorch_model.bin
      dest: TTS/DiffRhythm/MuQ-MuLan-large/pytorch_model.bin
  # MuQ-large-msd-iter: config plus weights in a per-model subfolder.
  - repo_id: OpenMuQ/MuQ-large-msd-iter
    description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
    size_gb: 1.2
    essential: true
    category: diffrhythm
    format: safetensors
    vram_gb: 2
    notes: Music representation model trained on Million Song Dataset
    files:
    - source: config.json
      dest: TTS/DiffRhythm/MuQ-large-msd-iter/config.json
    - source: model.safetensors
      dest: TTS/DiffRhythm/MuQ-large-msd-iter/model.safetensors
  # XLM-RoBERTa base: config, weights and tokenizer files in a per-model
  # subfolder.
  - repo_id: FacebookAI/xlm-roberta-base
    description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B
      params)
    size_gb: 1.1
    essential: true
    category: diffrhythm
    format: safetensors
    vram_gb: 1
    notes: Multilingual text encoding for 100 languages
    files:
    - source: config.json
      dest: TTS/DiffRhythm/xlm-roberta-base/config.json
    - source: model.safetensors
      dest: TTS/DiffRhythm/xlm-roberta-base/model.safetensors
    - source: sentencepiece.bpe.model
      dest: TTS/DiffRhythm/xlm-roberta-base/sentencepiece.bpe.model
    - source: tokenizer.json
      dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer.json
    - source: tokenizer_config.json
      dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer_config.json
# Disk space per category in GB, obtained by summing the `size_gb` of the
# entries in model_categories. `essential_only` counts the entries with
# `essential: true`; `all_models` counts every entry. `total` is the sum of the
# category rows above it. Recompute these whenever an entry's `size_gb` or
# `essential` flag changes.
storage_requirements:
  essential_only:
    # 23 + 4 + 7
    image: 34
    # 20 + 20 + 8 + 51 + 51.5 + 220
    video: 370.5
    # 11 + 7.7
    audio: 18.7
    # 2 + 7 + 2 + 0.5 + 0.06 + 0.06 + 10
    support: 21.62
    # 2 + 1 + 3 + 1.2 + 1.1
    diffrhythm: 8.3
    animatediff: 2
    # No controlnet entry is marked essential.
    controlnet: 0
    # 1.3 + 0.9
    ipadapter: 2.2
    total: 457.32
  all_models:
    # 23 + 23 + 4 + 7 + 6 + 31 + 7 + 7 + 4
    image: 112
    # 20 + 20 + 8 + 8 + 51 + 51.5 + 220
    video: 378.5
    # 3 + 11 + 22 + 7.7 + 0.3
    audio: 44
    # Every support entry is essential, so this equals the essential figure.
    support: 21.62
    # 2 + 2 + 2 + 1 + 3 + 1.2 + 1.1
    diffrhythm: 12.3
    animatediff: 2
    # 1.5 + 1.5 + 2.5 + 2.5
    controlnet: 8
    # 1.3 + 0.9 + 0.9 + 0.5
    ipadapter: 3.6
    total: 582.02
# Example model combinations and their VRAM use in GB. In every scenario
# `vram_used` + `remaining` equals 24.
vram_requirements:
  simultaneous_loadable:
  # FLUX alone; 23 matches the `vram_gb` of the FLUX.1 Schnell entry.
  - name: Image Focus - FLUX FP16
    models:
    - FLUX.1 Schnell
    vram_used: 23
    remaining: 1
  # NOTE(review): "FLUX.1 Schnell FP8" has no matching entry in
  # model_categories - confirm where the FP8 weights come from.
  - name: Image Focus - FLUX FP8 + SDXL
    models:
    - FLUX.1 Schnell FP8
    - SDXL Base
    vram_used: 24
    remaining: 0
  - name: Video Generation
    models:
    - CogVideoX-5B optimized
    - SDXL
    vram_used: 24
    remaining: 0
  # 20 equals the sum of the SDXL Base (12) and MusicGen Medium (8) `vram_gb`
  # values listed in model_categories.
  - name: Multi-Modal
    models:
    - SDXL
    - MusicGen Medium
    vram_used: 20
    remaining: 4
# Named install presets. `categories` lists keys of model_categories;
# `essential_only: true` presumably restricts a profile to entries marked
# `essential: true` (confirm against the consumer). `storage_gb` is the sum of
# the selected entries' `size_gb`, rounded up to a whole GB. The
# `estimated_time` values were not re-derived from the storage figures.
installation_profiles:
  minimal:
    description: Minimal setup for testing
    categories:
    - support_models
    # support 21.62
    storage_gb: 22
    estimated_time: 5-10 minutes
  essential:
    description: Essential models only (~445GB)
    categories:
    - image_models
    - video_models
    - audio_models
    - support_models
    essential_only: true
    # image 34 + video 370.5 + audio 18.7 + support 21.62 = 444.82
    storage_gb: 445
    estimated_time: 1-2 hours
  image_focused:
    description: All image generation models
    categories:
    - image_models
    - support_models
    # image 112 + support 21.62 = 133.62
    storage_gb: 134
    estimated_time: 45-90 minutes
  video_focused:
    description: All video generation models
    categories:
    - video_models
    - image_models
    - support_models
    essential_only: true
    # video 370.5 + image 34 + support 21.62 = 426.12
    storage_gb: 427
    estimated_time: 1-2 hours
  complete:
    description: All models (including optional)
    # Every category in model_categories, so "All models" holds literally.
    categories:
    - image_models
    - video_models
    - audio_models
    - support_models
    - animatediff_models
    - controlnet_models
    - ipadapter_models
    - diffrhythm_models
    # 112 + 378.5 + 44 + 21.62 + 2 + 8 + 3.6 + 12.3 = 582.02
    storage_gb: 583
    estimated_time: 2-4 hours
# File-level metadata.
metadata:
  version: 1.0.0
  # Quoted so loaders keep it as a string instead of converting it to a date.
  last_updated: '2025-11-21'
  compatible_with:
  - ComfyUI >= 0.1.0
  - Python >= 3.10
  - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  # TODO(review): "yourusername" looks like a placeholder - replace with the
  # real repository URL.
  repository: https://github.com/yourusername/runpod