# runpod/models_huggingface.yaml

# ============================================================================
# ComfyUI Model Configuration
# ============================================================================
#
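# This configuration file defines all available ComfyUI models for download.
# Models are organized by category: image, video, audio, support,
# animatediff, controlnet, ipadapter, and diffrhythm.
#
# Each model entry contains:
#   - repo_id: HuggingFace repository identifier
#   - description: Human-readable description
#   - size_gb: Approximate size in gigabytes
#   - essential: Whether this is an essential model (true/false)
#   - category: Model category (image/video/audio/support/animatediff/
#     controlnet/ipadapter/diffrhythm)
#   - type: Model type label (e.g. checkpoints, vae, clip_vision)
#   - format: Weight precision or file format (e.g. fp16, fp32, pt, pth)
#   - vram_gb: Approximate VRAM figure in gigabytes
#   - notes: Free-form remarks
#   - files: List of source/dest file name pairs
#
# Some entries carry extra fields (frames, resolution, duration_seconds,
# filename).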
#
# ============================================================================

# Global settings shared by every model entry below
settings:
  # Quoted so the path is always read as a literal string.
  cache_dir: '/workspace/huggingface_cache'
  # presumably the number of concurrent downloads — TODO confirm with consumer
  parallel_downloads: 1
  retry_attempts: 3
  # 3600 seconds = 1 hour
  timeout_seconds: 3600

# Model categories
model_categories:
  # ==========================================================================
  # IMAGE GENERATION MODELS
  # ==========================================================================
  image_models:
    # Six entries: two FLUX.1 variants, SD 1.5, SDXL base + refiner, SD 3.5.
    # Every entry fetches a single .safetensors file and keeps its name.

    # FLUX.1 Schnell (essential)
    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX.1 Schnell - Fast 4-step inference
      size_gb: 23
      essential: true
      category: image
      type: unet
      format: fp16
      vram_gb: 23
      notes: Industry-leading image generation quality
      files:
        - source: 'flux1-schnell.safetensors'
          dest: 'flux1-schnell.safetensors'

    # FLUX.1 Dev (optional)
    - repo_id: black-forest-labs/FLUX.1-dev
      description: FLUX.1 Dev - Balanced quality/speed
      size_gb: 23
      essential: false
      category: image
      type: unet
      format: fp16
      vram_gb: 23
      notes: Development version with enhanced features
      files:
        - source: 'flux1-dev.safetensors'
          dest: 'flux1-dev.safetensors'

    # Stable Diffusion 1.5 (essential)
    # NOTE(review): confirm this repo_id still resolves on the Hub.
    - repo_id: runwayml/stable-diffusion-v1-5
      description: SD 1.5 - For AnimateDiff
      size_gb: 4
      essential: true
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 8
      notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
      files:
        - source: 'v1-5-pruned-emaonly.safetensors'
          dest: 'v1-5-pruned-emaonly.safetensors'

    # SDXL base (essential)
    - repo_id: stabilityai/stable-diffusion-xl-base-1.0
      description: SDXL Base 1.0 - Industry standard
      size_gb: 7
      essential: true
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Most widely used Stable Diffusion model
      files:
        - source: 'sd_xl_base_1.0.safetensors'
          dest: 'sd_xl_base_1.0.safetensors'

    # SDXL refiner (optional)
    - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
      description: SDXL Refiner 1.0 - Enhances base output
      size_gb: 6
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 12
      notes: Use after SDXL base for improved details
      files:
        - source: 'sd_xl_refiner_1.0.safetensors'
          dest: 'sd_xl_refiner_1.0.safetensors'

    # Stable Diffusion 3.5 Large (optional)
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: SD 3.5 Large - Latest Stability AI
      size_gb: 18
      essential: false
      category: image
      type: checkpoints
      format: fp16
      vram_gb: 20
      notes: Newest generation Stable Diffusion
      files:
        - source: 'sd3.5_large.safetensors'
          dest: 'sd3.5_large.safetensors'

  # ==========================================================================
  # VIDEO GENERATION MODELS
  # ==========================================================================
  video_models:
    # Four entries: two CogVideoX-5B variants and two Stable Video Diffusion
    # variants. Video entries add `frames` and `resolution` to the usual
    # fields.
    #
    # NOTE(review): both CogVideoX notes say the model is auto-downloaded by
    # the DownloadAndLoadCogVideoModel node, yet a file is also listed here —
    # confirm the explicit download is still wanted.

    # CogVideoX-5B text-to-video (essential); transformer weights are renamed.
    - repo_id: THUDM/CogVideoX-5b
      description: CogVideoX-5B - Professional text-to-video
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
      files:
        - source: 'transformer/diffusion_pytorch_model.safetensors'
          dest: 'cogvideox-5b-transformer.safetensors'

    # CogVideoX-5B image-to-video (essential); transformer weights are renamed.
    - repo_id: THUDM/CogVideoX-5b-I2V
      description: CogVideoX-5B-I2V - Image-to-video generation
      size_gb: 20
      essential: true
      category: video
      type: diffusion_models
      format: fp16
      vram_gb: 20
      frames: 49
      resolution: 720p
      notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
      files:
        - source: 'transformer/diffusion_pytorch_model.safetensors'
          dest: 'cogvideox-5b-i2v-transformer.safetensors'

    # Stable Video Diffusion, 14 frames (essential)
    - repo_id: stabilityai/stable-video-diffusion-img2vid
      description: SVD - 14 frame image-to-video
      size_gb: 8
      essential: true
      category: video
      type: checkpoints
      format: fp16
      vram_gb: 20
      frames: 14
      resolution: 576x1024
      notes: Convert images to short video clips
      files:
        - source: 'svd.safetensors'
          dest: 'svd.safetensors'

    # Stable Video Diffusion XT, 25 frames (optional)
    - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
      description: SVD-XT - 25 frame image-to-video
      size_gb: 8
      essential: false
      category: video
      type: checkpoints
      format: fp16
      vram_gb: 20
      frames: 25
      resolution: 576x1024
      notes: Extended frame count version
      files:
        - source: 'svd_xt.safetensors'
          dest: 'svd_xt.safetensors'

  # ==========================================================================
  # AUDIO GENERATION MODELS
  # ==========================================================================
  audio_models:
    # Three MusicGen sizes. Each dest is prefixed with the model name so the
    # identically named upstream files do not share a local file name.
    # Audio entries add `duration_seconds` to the usual fields.

    # MusicGen Small (optional)
    - repo_id: facebook/musicgen-small
      description: MusicGen Small - Fast generation
      size_gb: 3
      essential: false
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 4
      duration_seconds: 30
      notes: Fastest music generation, lower quality
      files:
        - source: 'pytorch_model.bin'
          dest: 'musicgen-small-pytorch_model.bin'

    # MusicGen Medium (essential)
    - repo_id: facebook/musicgen-medium
      description: MusicGen Medium - Balanced quality
      size_gb: 11
      essential: true
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 8
      duration_seconds: 30
      notes: Best balance of speed and quality
      files:
        - source: 'pytorch_model.bin'
          dest: 'musicgen-medium-pytorch_model.bin'

    # MusicGen Large (optional): two weight shards plus their index file
    - repo_id: facebook/musicgen-large
      description: MusicGen Large - Highest quality
      size_gb: 22
      essential: false
      category: audio
      type: musicgen
      format: fp32
      vram_gb: 16
      duration_seconds: 30
      notes: Best quality, slower generation
      files:
        - source: 'pytorch_model-00001-of-00002.bin'
          dest: 'musicgen-large-pytorch_model-00001-of-00002.bin'
        - source: 'pytorch_model-00002-of-00002.bin'
          dest: 'musicgen-large-pytorch_model-00002-of-00002.bin'
        - source: 'pytorch_model.bin.index.json'
          dest: 'musicgen-large-pytorch_model.bin.index.json'

  # ==========================================================================
  # SUPPORT MODELS (CLIP, IP-Adapter, etc.)
  # ==========================================================================
  support_models:
    # Auxiliary weights used by the generation models: CLIP vision encoders,
    # text encoders, the FLUX VAE and the Real-ESRGAN upscalers.

    # CLIP ViT-H/14 image encoder.
    # The dest name and the vit-h IP-Adapter weights under ipadapter_models
    # need the ViT-H/14 encoder. openai/clip-vit-large-patch14 is ViT-L/14, so
    # the file is taken from the image_encoder folder of h94/IP-Adapter.
    - repo_id: h94/IP-Adapter
      description: CLIP H - For SD 1.5 IP-Adapter
      size_gb: 2.5
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 2
      notes: Text-image understanding model for IP-Adapter
      files:
        - source: "models/image_encoder/model.safetensors"
          dest: "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors"

    # CLIP ViT-bigG/14 image encoder
    - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
      description: CLIP G - For SDXL IP-Adapter
      size_gb: 7
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 4
      notes: Larger CLIP model for SDXL IP-Adapter
      files:
        - source: "open_clip_model.safetensors"
          dest: "CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors"

    # SigLIP vision encoder
    - repo_id: google/siglip-so400m-patch14-384
      description: SigLIP - For FLUX models
      size_gb: 2
      essential: true
      category: support
      type: clip_vision
      format: fp32
      vram_gb: 2
      notes: Advanced image-text alignment
      files:
        - source: "model.safetensors"
          dest: "siglip-so400m-patch14-384.safetensors"

    # CLIP-L + T5-XXL text encoders, stored with type `clip`.
    # NOTE(review): the same two files are listed again further down with
    # type `text_encoders` — confirm both copies are needed.
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L and T5-XXL - For FLUX text encoding
      size_gb: 10
      essential: true
      category: support
      type: clip
      format: fp16
      vram_gb: 4
      notes: CLIP text encoders required for FLUX models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"

    # FLUX VAE
    - repo_id: black-forest-labs/FLUX.1-schnell
      description: FLUX VAE - Autoencoder for FLUX models
      size_gb: 0.5
      essential: true
      category: support
      type: vae
      format: safetensors
      vram_gb: 1
      notes: VAE autoencoder required for FLUX image decoding
      files:
        - source: "ae.safetensors"
          dest: "ae.safetensors"

    # Real-ESRGAN 2x upscaler
    - repo_id: ai-forever/Real-ESRGAN
      description: RealESRGAN x2 - 2x upscaling model
      size_gb: 0.06
      essential: true
      category: support
      type: upscale_models
      format: pth
      vram_gb: 2
      notes: Fast 2x upscaling model for general purpose enhancement
      files:
        - source: "RealESRGAN_x2.pth"
          dest: "RealESRGAN_x2.pth"

    # Real-ESRGAN 4x upscaler
    - repo_id: ai-forever/Real-ESRGAN
      description: RealESRGAN x4 - 4x upscaling model
      size_gb: 0.06
      essential: true
      category: support
      type: upscale_models
      format: pth
      vram_gb: 4
      notes: High-quality 4x upscaling model for detail enhancement
      files:
        - source: "RealESRGAN_x4.pth"
          dest: "RealESRGAN_x4.pth"

    # T5-XXL text encoder, stored with type `text_encoders`
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: T5-XXL FP16 - For CogVideoX text encoding
      size_gb: 9
      essential: true
      category: support
      type: text_encoders
      format: fp16
      vram_gb: 4
      notes: T5 text encoder required for CogVideoX models
      files:
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"

    # CLIP-L text encoder, stored with type `text_encoders`.
    # format is fp16 to agree with the `clip` entry above, which lists the
    # very same source file as fp16.
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L - For CogVideoX and SD3
      size_gb: 1
      essential: true
      category: support
      type: text_encoders
      format: fp16
      vram_gb: 1
      notes: CLIP-L text encoder for CogVideoX and SD3 models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"

    # CLIP-G text encoder (optional)
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-G - For SD3 models
      size_gb: 3
      essential: false
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 2
      notes: CLIP-G text encoder for SD3 models
      files:
        - source: "text_encoders/clip_g.safetensors"
          dest: "clip_g.safetensors"

  # ==========================================================================
  # ANIMATEDIFF MODELS
  # ==========================================================================
  animatediff_models:
    # Motion module used together with the SD 1.5 checkpoint from
    # image_models.
    - repo_id: guoyww/animatediff
      description: AnimateDiff Motion Modules
      size_gb: 2
      essential: true
      category: animatediff
      type: animatediff_models
      # NOTE(review): no other entry has a `filename` key, and this value
      # lacks the `_v2` suffix of the file listed below — confirm it is read.
      filename: mm_sd_v15
      # The only listed file is a .ckpt, so the format is ckpt; other entries
      # likewise use pth/pt/bin for non-safetensors files.
      format: ckpt
      vram_gb: 4
      notes: Motion modules for AnimateDiff text-to-video
      files:
        - source: "mm_sd_v15_v2.ckpt"
          dest: "mm_sd_v15_v2.ckpt"

  # ==========================================================================
  # CONTROLNET MODELS
  # ==========================================================================
  controlnet_models:
    # Four optional ControlNets (canny + depth, for SD 1.5 and for SDXL).
    # Every repo ships its weights as diffusion_pytorch_model.safetensors, so
    # each dest is renamed to keep the local files distinct.

    # SD 1.5 canny
    - repo_id: lllyasviel/control_v11p_sd15_canny
      description: ControlNet Canny - Edge detection control for SD 1.5
      size_gb: 1.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 2
      notes: Precise edge-based composition control
      files:
        - source: 'diffusion_pytorch_model.safetensors'
          dest: 'control_v11p_sd15_canny.safetensors'

    # SD 1.5 depth
    # NOTE(review): the repo is control_v11f1p_sd15_depth but the dest drops
    # the "f1" — confirm whether the shorter local name is intentional.
    - repo_id: lllyasviel/control_v11f1p_sd15_depth
      description: ControlNet Depth - Depth map control for SD 1.5
      size_gb: 1.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 2
      notes: Depth-based spatial control
      files:
        - source: 'diffusion_pytorch_model.safetensors'
          dest: 'control_v11p_sd15_depth.safetensors'

    # SDXL canny
    - repo_id: diffusers/controlnet-canny-sdxl-1.0
      description: ControlNet Canny SDXL - Edge detection for SDXL
      size_gb: 2.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 3
      notes: Canny edge control for SDXL models
      files:
        - source: 'diffusion_pytorch_model.safetensors'
          dest: 'controlnet-canny-sdxl-1.0.safetensors'

    # SDXL depth
    - repo_id: diffusers/controlnet-depth-sdxl-1.0
      description: ControlNet Depth SDXL - Depth map for SDXL
      size_gb: 2.5
      essential: false
      category: controlnet
      type: controlnet
      format: safetensors
      vram_gb: 3
      notes: Depth control for SDXL models
      files:
        - source: 'diffusion_pytorch_model.safetensors'
          dest: 'controlnet-depth-sdxl-1.0.safetensors'

  # ==========================================================================
  # IP-ADAPTER MODELS
  # ==========================================================================
  ipadapter_models:
    # Four SDXL IP-Adapter weights, all from the sdxl_models/ folder of
    # h94/IP-Adapter. Each dest is the source file name without the folder.

    # Base adapter (essential)
    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Base - Style & Composition
      size_gb: 1.3
      essential: true
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Basic IP-Adapter for SDXL
      files:
        - source: 'sdxl_models/ip-adapter_sdxl.safetensors'
          dest: 'ip-adapter_sdxl.safetensors'

    # ViT-H adapter (essential)
    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
      size_gb: 0.9
      essential: true
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: IP-Adapter for SDXL with VIT-H CLIP vision model
      files:
        - source: 'sdxl_models/ip-adapter_sdxl_vit-h.safetensors'
          dest: 'ip-adapter_sdxl_vit-h.safetensors'

    # Plus adapter (optional)
    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Plus - High Strength Composition
      size_gb: 0.9
      essential: false
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Enhanced composition control with higher strength
      files:
        - source: 'sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors'
          dest: 'ip-adapter-plus_sdxl_vit-h.safetensors'

    # Plus Face adapter (optional)
    - repo_id: h94/IP-Adapter
      description: IP-Adapter SDXL Plus Face - Face-focused generation
      size_gb: 0.5
      essential: false
      category: ipadapter
      type: ipadapter
      format: safetensors
      vram_gb: 4
      notes: Specialized for face transfer and portrait generation
      files:
        - source: 'sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors'
          dest: 'ip-adapter-plus-face_sdxl_vit-h.safetensors'

  # ==========================================================================
  # DIFFRHYTHM MODELS (Full-length song generation)
  # ==========================================================================
  diffrhythm_models:
    # Seven entries: three DiffRhythm generation models, the DiffRhythm VAE,
    # and three supporting encoders (MuQ-MuLan, MuQ, XLM-RoBERTa).
    # The three generation models all ship cfm_model.pt, so two of them are
    # renamed on download. The encoders reuse generic names (config.json,
    # model.safetensors) but each has its own `type` value.

    # DiffRhythm 1.2 (essential); saved as cfm_model_v1_2.pt
    - repo_id: ASLP-lab/DiffRhythm-1_2
      description: DiffRhythm 1.2 - 95 second generation model
      size_gb: 2
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 12
      duration_seconds: 95
      notes: Latest 95-second generation model
      files:
        - source: 'cfm_model.pt'
          dest: 'cfm_model_v1_2.pt'

    # DiffRhythm Full (optional); 285 s = 4 min 45 s; saved as
    # cfm_full_model.pt
    - repo_id: ASLP-lab/DiffRhythm-full
      description: DiffRhythm Full - 4m45s full-length generation
      size_gb: 2
      essential: false
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 16
      duration_seconds: 285
      notes: Full-length 4 minute 45 second music generation
      files:
        - source: 'cfm_model.pt'
          dest: 'cfm_full_model.pt'

    # DiffRhythm Base (optional); keeps the upstream name cfm_model.pt
    - repo_id: ASLP-lab/DiffRhythm-base
      description: DiffRhythm Base - 95 second base model
      size_gb: 2
      essential: false
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 12
      duration_seconds: 95
      notes: Base 95-second model
      files:
        - source: 'cfm_model.pt'
          dest: 'cfm_model.pt'

    # DiffRhythm VAE (essential)
    - repo_id: ASLP-lab/DiffRhythm-vae
      description: DiffRhythm VAE - Variational autoencoder
      size_gb: 1
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm
      format: pt
      vram_gb: 2
      notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community License)
      files:
        - source: 'vae_model.pt'
          dest: 'vae_model.pt'

    # MuQ-MuLan music-text embedding (essential)
    - repo_id: OpenMuQ/MuQ-MuLan-large
      description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
      size_gb: 3
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/MuQ-MuLan-large
      format: bin
      vram_gb: 4
      notes: Music-text joint embedding for semantic understanding (English/Chinese)
      files:
        - source: 'config.json'
          dest: 'config.json'
        - source: 'pytorch_model.bin'
          dest: 'pytorch_model.bin'

    # MuQ music representation model (essential)
    - repo_id: OpenMuQ/MuQ-large-msd-iter
      description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
      size_gb: 1.2
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/MuQ-large-msd-iter
      format: safetensors
      vram_gb: 2
      notes: Music representation model trained on Million Song Dataset
      files:
        - source: 'config.json'
          dest: 'config.json'
        - source: 'model.safetensors'
          dest: 'model.safetensors'

    # XLM-RoBERTa text encoder (essential): config, weights, tokenizer files
    - repo_id: FacebookAI/xlm-roberta-base
      description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B params)
      size_gb: 1.1
      essential: true
      category: diffrhythm
      type: TTS/DiffRhythm/xlm-roberta-base
      format: safetensors
      vram_gb: 1
      notes: Multilingual text encoding for 100 languages
      files:
        - source: 'config.json'
          dest: 'config.json'
        - source: 'model.safetensors'
          dest: 'model.safetensors'
        - source: 'sentencepiece.bpe.model'
          dest: 'sentencepiece.bpe.model'
        - source: 'tokenizer.json'
          dest: 'tokenizer.json'
        - source: 'tokenizer_config.json'
          dest: 'tokenizer_config.json'

# ============================================================================
# STORAGE & VRAM SUMMARIES
# ============================================================================

# Disk-space summary in GB, per category, for two selections.
# NOTE(review): several figures are lower than the sum of `size_gb` of the
# matching entries under model_categories, and the animatediff, controlnet
# and ipadapter categories are not represented — confirm whether these
# numbers are stale.
storage_requirements:
  essential_only:
    # FLUX Schnell (23) + SDXL Base (7).
    # NOTE(review): SD 1.5 (4) is also essential but not counted — confirm.
    image: 30
    # CogVideoX (20) + SVD (8).
    # NOTE(review): CogVideoX-5b-I2V (20) is also essential — confirm.
    video: 28
    # MusicGen Medium
    audio: 11
    # All 3 CLIP models (2 + 7 + 2)
    support: 11
    # DiffRhythm essential models
    diffrhythm: 10
    # Total essential storage (30 + 28 + 11 + 11 + 10)
    total: 90

  all_models:
    # All image models
    # NOTE(review): the size_gb values under image_models sum to 81 — confirm.
    image: 54
    # All video models
    # NOTE(review): the size_gb values under video_models sum to 56 — confirm.
    video: 36
    # All audio models (3 + 11 + 22)
    audio: 36
    # All support models
    support: 11
    # All DiffRhythm models
    diffrhythm: 12
    # Total with optional models (54 + 36 + 36 + 11 + 12)
    total: 149

# Model combinations and their VRAM figures.
vram_requirements:
  # For 24GB GPU (RTX 4090); vram_used + remaining is 24 in every entry.
  simultaneous_loadable:
    - name: Image Focus - FLUX FP16
      models:
        - FLUX.1 Schnell
      vram_used: 23
      remaining: 1

    - name: Image Focus - FLUX FP8 + SDXL
      models:
        - FLUX.1 Schnell FP8
        - SDXL Base
      vram_used: 24
      remaining: 0

    - name: Video Generation
      models:
        - CogVideoX-5B optimized
        - SDXL
      vram_used: 24
      remaining: 0

    - name: Multi-Modal
      models:
        - SDXL
        - MusicGen Medium
      vram_used: 20
      remaining: 4

# ============================================================================
# INSTALLATION PROFILES
# ============================================================================

# Named selections of model categories.
# Each storage_gb equals the sum of the storage_requirements figures for the
# categories listed (essential_only figures where essential_only is true,
# all_models figures otherwise).
# No profile lists animatediff_models, controlnet_models, ipadapter_models
# or diffrhythm_models.
installation_profiles:
  minimal:
    description: Minimal setup for testing
    categories:
      - support_models
    storage_gb: 11
    estimated_time: '5-10 minutes'

  essential:
    description: Essential models only (~80GB)
    categories:
      - image_models
      - video_models
      - audio_models
      - support_models
    essential_only: true
    # 30 + 28 + 11 + 11
    storage_gb: 80
    estimated_time: '1-2 hours'

  image_focused:
    description: All image generation models
    categories:
      - image_models
      - support_models
    # 54 + 11
    storage_gb: 65
    estimated_time: '45-90 minutes'

  video_focused:
    description: All video generation models
    categories:
      - video_models
      - image_models
      - support_models
    essential_only: true
    # 28 + 30 + 11
    storage_gb: 69
    estimated_time: '1-2 hours'

  complete:
    description: All models (including optional)
    categories:
      - image_models
      - video_models
      - audio_models
      - support_models
    # 54 + 36 + 36 + 11
    storage_gb: 137
    estimated_time: '2-4 hours'

# ============================================================================
# METADATA
# ============================================================================

# Descriptive information about this configuration file.
metadata:
  version: 1.0.0
  # Quoted so YAML 1.1 loaders keep the value as a string instead of
  # converting it to a date object.
  last_updated: "2025-11-21"
  compatible_with:
    - ComfyUI >= 0.1.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  # NOTE(review): "yourusername" looks like a placeholder — confirm the URL.
  repository: https://github.com/yourusername/runpod