refactor: remove type field from models_huggingface.yaml and include type in dest paths

- Prepended ComfyUI model type folder (checkpoints/, clip/, vae/, etc.) to all dest paths - Removed separate 'type' field from all model entries - Consolidated SD3.5 duplicate entries (5 → 1) - Simplified model configuration by embedding directory structure directly in destination paths This change eliminates the need to parse the 'type' field separately in artifact_huggingface_download.sh, making the configuration more explicit and easier to understand. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
feat: add three new SDXL/SD1.5 image generation models
2025-11-25 19:19:42 +01:00 · 2025-11-25 17:29:04 +01:00 · 2025-11-25 15:47:09 +01:00 · 2025-11-25 12:33:00 +01:00 · 2025-11-25 10:43:39 +01:00 · 2025-11-25 09:46:24 +01:00
35 changed files with 35746 additions and 727 deletions
--- a/arty.yml
+++ b/arty.yml
@@ -63,11 +63,41 @@ references:
    description: "MusicGen and Stable Audio integration"
    essential: false
  - url: https://github.com/billwuhao/ComfyUI_DiffRhythm.git
    into: $COMFYUI_ROOT/custom_nodes/ComfyUI_DiffRhythm
    description: "DiffRhythm - Full-length song generation (up to 4m45s) with text/audio conditioning"
    essential: false
  - url: https://github.com/billwuhao/ComfyUI_ACE-Step.git
    into: $COMFYUI_ROOT/custom_nodes/ComfyUI_ACE-Step
    description: "ACE Step - State-of-the-art music generation with 19-language support, voice cloning, and superior coherence"
    essential: false
  - url: https://github.com/ssitu/ComfyUI_UltimateSDUpscale.git
    into: $COMFYUI_ROOT/custom_nodes/ComfyUI_UltimateSDUpscale
    description: "Ultimate SD Upscale for high-quality image upscaling"
    essential: false
  - url: https://github.com/kijai/ComfyUI-KJNodes.git
    into: $COMFYUI_ROOT/custom_nodes/ComfyUI-KJNodes
    description: "Kijai optimizations for HunyuanVideo and Wan2.2 (FP8 scaling, video helpers, model loading)"
    essential: true
  - url: https://github.com/Fannovel16/comfyui_controlnet_aux.git
    into: $COMFYUI_ROOT/custom_nodes/comfyui_controlnet_aux
    description: "ControlNet preprocessors (Canny, Depth, OpenPose, MLSD) for Wan2.2 Fun Control"
    essential: true
  - url: https://github.com/city96/ComfyUI-GGUF.git
    into: $COMFYUI_ROOT/custom_nodes/ComfyUI-GGUF
    description: "GGUF quantization support for memory-efficient model loading"
    essential: false
  - url: https://github.com/11cafe/comfyui-workspace-manager.git
    into: $COMFYUI_ROOT/custom_nodes/comfyui-workspace-manager
    description: "Workspace manager for ComfyUI - workflow/model organization (obsolete but requested)"
    essential: false
 # Environment profiles for selective repository management
 envs:
  # RunPod environment variables
@@ -78,37 +108,6 @@ envs:
    LOGS_DIR: /workspace/logs
    BIN_DIR: /workspace/bin
  # Production: Only essential components
  prod:
    - $AI_ROOT
    - $COMFYUI_ROOT
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Manager
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-VideoHelperSuite
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-AnimateDiff-Evolved
    - $COMFYUI_ROOT/custom_nodes/ComfyUI_IPAdapter_plus
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Impact-Pack
  # Development: All repositories including optional nodes
  dev:
    - $AI_ROOT
    - $COMFYUI_ROOT
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Manager
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-VideoHelperSuite
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-AnimateDiff-Evolved
    - $COMFYUI_ROOT/custom_nodes/ComfyUI_IPAdapter_plus
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Impact-Pack
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-CogVideoXWrapper
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Inspire-Pack
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Advanced-ControlNet
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-3D-Pack
    - $COMFYUI_ROOT/custom_nodes/comfyui-sound-lab
  # Minimal: Only orchestrator and ComfyUI base
  minimal:
    - $AI_ROOT
    - $COMFYUI_ROOT
    - $COMFYUI_ROOT/custom_nodes/ComfyUI-Manager
 # Deployment scripts for RunPod instances
 scripts:
  #
@@ -165,11 +164,23 @@ scripts:
      htop \
      tmux \
      net-tools \
-      davfs2
+      davfs2 \
      ffmpeg \
      libavcodec-dev \
      libavformat-dev \
      libavutil-dev \
      libswscale-dev
    echo ""
    echo "✓ System packages installed successfully"
    # Verify FFmpeg installation
    if ffmpeg -version > /dev/null 2>&1; then
      echo "✓ FFmpeg installed: $(ffmpeg -version | head -1 | cut -d ' ' -f3)"
    else
      echo "❌ WARNING: FFmpeg not found"
    fi
  setup/python-env: |
    echo "========================================="
    echo "  Setting Up Python Environment"
@@ -279,43 +290,67 @@ scripts:
    echo "========================================="
    echo ""
    # Install system dependencies
    echo "Installing system dependencies..."
    sudo apt-get update -qq
    sudo apt-get install -y -qq espeak-ng
    echo "✓ System dependencies installed (espeak-ng)"
    echo ""
    cd $COMFYUI_ROOT/custom_nodes
    # ComfyUI Manager
-    echo "[1/5] Installing ComfyUI-Manager..."
+    echo "[1/6] Installing ComfyUI-Manager..."
    if [ ! -d "ComfyUI-Manager" ]; then
      git clone https://github.com/ltdrdata/ComfyUI-Manager.git
    fi
    [ -f "ComfyUI-Manager/requirements.txt" ] && sudo pip3 install -r ComfyUI-Manager/requirements.txt
    # VideoHelperSuite
-    echo "[2/5] Installing ComfyUI-VideoHelperSuite..."
+    echo "[2/6] Installing ComfyUI-VideoHelperSuite..."
    if [ ! -d "ComfyUI-VideoHelperSuite" ]; then
      git clone https://github.com/Kosinkadink/ComfyUI-VideoHelperSuite.git
    fi
    [ -f "ComfyUI-VideoHelperSuite/requirements.txt" ] && sudo pip3 install -r ComfyUI-VideoHelperSuite/requirements.txt
    # AnimateDiff-Evolved
-    echo "[3/5] Installing ComfyUI-AnimateDiff-Evolved..."
+    echo "[3/6] Installing ComfyUI-AnimateDiff-Evolved..."
    if [ ! -d "ComfyUI-AnimateDiff-Evolved" ]; then
      git clone https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved.git
    fi
    [ -f "ComfyUI-AnimateDiff-Evolved/requirements.txt" ] && sudo pip3 install -r ComfyUI-AnimateDiff-Evolved/requirements.txt
    # IPAdapter Plus
-    echo "[4/5] Installing ComfyUI_IPAdapter_plus..."
+    echo "[4/6] Installing ComfyUI_IPAdapter_plus..."
    if [ ! -d "ComfyUI_IPAdapter_plus" ]; then
      git clone https://github.com/cubiq/ComfyUI_IPAdapter_plus.git
    fi
    [ -f "ComfyUI_IPAdapter_plus/requirements.txt" ] && sudo pip3 install -r ComfyUI_IPAdapter_plus/requirements.txt
    # Impact-Pack
-    echo "[5/5] Installing ComfyUI-Impact-Pack..."
+    echo "[5/6] Installing ComfyUI-Impact-Pack..."
    if [ ! -d "ComfyUI-Impact-Pack" ]; then
      git clone https://github.com/ltdrdata/ComfyUI-Impact-Pack.git
    fi
    [ -f "ComfyUI-Impact-Pack/requirements.txt" ] && sudo pip3 install -r ComfyUI-Impact-Pack/requirements.txt
    # DiffRhythm
    echo "[6/6] Installing ComfyUI_DiffRhythm..."
    if [ ! -d "ComfyUI_DiffRhythm" ]; then
      git clone https://github.com/billwuhao/ComfyUI_DiffRhythm.git
    fi
    if [ -f "ComfyUI_DiffRhythm/requirements.txt" ]; then
      cd $COMFYUI_ROOT
      source venv/bin/activate
      pip install -r custom_nodes/ComfyUI_DiffRhythm/requirements.txt
      deactivate
      cd custom_nodes
    fi
    # Create DiffRhythm model directories
    echo "Creating DiffRhythm model directories..."
    mkdir -p $COMFYUI_ROOT/models/TTS/DiffRhythm/{MuQ-large-msd-iter,MuQ-MuLan-large,xlm-roberta-base,eval-model}
    # Fix numpy version for vLLM compatibility
    echo "Fixing numpy version..."
    sudo pip3 install 'numpy<2.0.0' --force-reinstall
@@ -327,6 +362,144 @@ scripts:
    echo "  - AnimateDiff-Evolved: Video generation"
    echo "  - IPAdapter_plus: Style transfer"
    echo "  - Impact-Pack: Face enhancement"
    echo "  - DiffRhythm: Full-length song generation"
  models/diffrhythm-eval: |
    # Downloads eval.yaml and eval.safetensors from the ASLP-lab/DiffRhythm
    # Hugging Face space into $COMFYUI_ROOT/models/TTS/DiffRhythm/eval-model.
    # Exits 1 when the target directory cannot be entered, a transfer fails,
    # or a downloaded file is empty.
    echo "========================================="
    echo "  Downloading DiffRhythm Eval Model"
    echo "========================================="
    echo ""
    EVAL_DIR="$COMFYUI_ROOT/models/TTS/DiffRhythm/eval-model"
    EVAL_URL="https://huggingface.co/spaces/ASLP-lab/DiffRhythm/resolve/main/pretrained"
    # Create eval-model directory; stop if it cannot be entered so nothing is
    # written into whatever the current directory happens to be
    mkdir -p "$EVAL_DIR"
    cd "$EVAL_DIR" || exit 1
    # --fail (-f) makes curl return non-zero on HTTP errors instead of saving
    # the server's error page under the model file name
    DOWNLOAD_OK=1
    # Download eval.yaml (129 bytes)
    echo "Downloading eval.yaml..."
    curl -fL -o eval.yaml "$EVAL_URL/eval.yaml" || DOWNLOAD_OK=0
    # Download eval.safetensors (101 MB)
    echo "Downloading eval.safetensors (101 MB)..."
    curl -fL -o eval.safetensors "$EVAL_URL/eval.safetensors" || DOWNLOAD_OK=0
    # Verify files: both transfers succeeded and both files are non-empty
    if [ "$DOWNLOAD_OK" -eq 1 ] && [ -s "eval.yaml" ] && [ -s "eval.safetensors" ]; then
      echo ""
      echo "✓ DiffRhythm eval-model files downloaded successfully"
      echo "  - eval.yaml: $(du -h eval.yaml | cut -f1)"
      echo "  - eval.safetensors: $(du -h eval.safetensors | cut -f1)"
    else
      echo "❌ ERROR: Failed to download eval-model files"
      exit 1
    fi
  setup/comfyui-acestep: |
    # Clones ComfyUI_ACE-Step into $COMFYUI_ROOT/custom_nodes (skipped when the
    # directory already exists) and installs its requirements.txt inside the
    # ComfyUI venv. Exits 1 if a directory change, the clone, the venv
    # activation or the pip install fails, so success is only printed when the
    # node is actually usable.
    echo "========================================="
    echo "  Installing ACE Step Custom Node"
    echo "========================================="
    echo ""
    cd "$COMFYUI_ROOT/custom_nodes" || exit 1
    # Clone repository if not exists
    if [ ! -d "ComfyUI_ACE-Step" ]; then
      echo "Cloning ComfyUI_ACE-Step repository..."
      git clone https://github.com/billwuhao/ComfyUI_ACE-Step.git || exit 1
    else
      echo "ComfyUI_ACE-Step already exists, skipping clone"
    fi
    # Install dependencies in ComfyUI venv
    echo ""
    echo "Installing ACE Step dependencies..."
    cd "$COMFYUI_ROOT" || exit 1
    # Same guard as the DiffRhythm install step: only call pip when the node
    # ships a requirements file
    if [ -f "custom_nodes/ComfyUI_ACE-Step/requirements.txt" ]; then
      # Without an active venv pip would install into the system interpreter
      source venv/bin/activate || { echo "❌ ERROR: Could not activate $COMFYUI_ROOT/venv"; exit 1; }
      if ! pip install -r custom_nodes/ComfyUI_ACE-Step/requirements.txt; then
        deactivate
        echo "❌ ERROR: Failed to install ACE Step dependencies"
        exit 1
      fi
      deactivate
    else
      echo "No requirements.txt found, skipping dependency installation"
    fi
    echo ""
    echo "✓ ACE Step custom node installed successfully"
    echo "  Note: Download models separately using:"
    echo "  bash /workspace/bin/artifact_huggingface_download.sh download -c models_huggingface.yaml --category audio_models"
  setup/pivoine-nodes: |
    # Replaces $NODES_DEST with a symlink pointing at $NODES_SRC.
    # Exits 1 (before touching the destination) when the source directory is
    # missing, and exits 1 when the symlink cannot be created.
    echo "========================================="
    echo "  Linking Pivoine Custom Nodes"
    echo "========================================="
    echo ""
    NODES_SRC="/workspace/ai/comfyui/nodes"
    NODES_DEST="/workspace/ComfyUI/custom_nodes/ComfyUI_Pivoine"
    # Check the source first: removing the destination and then linking to a
    # missing directory would destroy a working install and leave a dangling link
    if [ ! -d "$NODES_SRC" ]; then
      echo "✗ Error: Pivoine nodes not found at $NODES_SRC"
      exit 1
    fi
    # Remove existing symlink or directory if present
    # (rm -rf on a real directory deletes its contents, not just a link)
    if [ -L "$NODES_DEST" ] || [ -d "$NODES_DEST" ]; then
      echo "Removing existing: $NODES_DEST"
      rm -rf "$NODES_DEST"
    fi
    # Create symlink
    if ! ln -s "$NODES_SRC" "$NODES_DEST"; then
      echo "✗ Error: Failed to link $NODES_DEST"
      exit 1
    fi
    echo ""
    echo "✓ Pivoine custom nodes linked"
    echo "  Source: $NODES_SRC"
    echo "  Linked: $NODES_DEST"
    echo ""
    echo "Available Pivoine nodes:"
    echo "  🌸 PivoineDiffRhythmRun - DiffRhythm with chunked disabled"
    echo ""
    echo "Category: 🌸Pivoine/Audio"
  fix/diffrhythm-patch: |
    # Applies $PATCH_FILE to the DiffRhythm custom node with `patch -p1`.
    # Exits 0 when __init__.py already contains "PatchedLlamaConfig" or the
    # patch applies cleanly; exits 1 when the node directory or patch file is
    # missing, or when patch reports a failure.
    echo "========================================="
    echo "  Apply DiffRhythm LlamaConfig Patch"
    echo "========================================="
    echo ""
    echo "Issue: Tensor dimension mismatch (32 vs 64) in rotary embeddings"
    echo "Solution: Patch DiffRhythm __init__.py to fix LlamaConfig"
    echo ""
    echo "References:"
    echo "  - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44"
    echo "  - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48"
    echo ""
    DIFF_RHYTHM_DIR="/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm"
    PATCH_FILE="/workspace/ai/comfyui/patches/diffrhythm-llamaconfig-fix.patch"
    if [ ! -d "$DIFF_RHYTHM_DIR" ]; then
      echo "✗ Error: DiffRhythm not found at $DIFF_RHYTHM_DIR"
      exit 1
    fi
    if [ ! -f "$PATCH_FILE" ]; then
      echo "✗ Error: Patch file not found at $PATCH_FILE"
      exit 1
    fi
    # Never run patch from an unexpected working directory
    cd "$DIFF_RHYTHM_DIR" || exit 1
    # The marker class name is what the patch adds, so its presence means the
    # patch has been applied before
    echo "Checking if patch already applied..."
    if grep -q "PatchedLlamaConfig" __init__.py; then
      echo "✓ Patch already applied!"
      exit 0
    fi
    echo "Applying patch..."
    # Test the command directly instead of inspecting $? afterwards: the
    # failure branch stays reachable even if the runner enables errexit
    if patch -p1 < "$PATCH_FILE"; then
      echo ""
      echo "✓ Patch applied successfully!"
      echo ""
      echo "Next steps:"
      echo "  1. Restart ComfyUI: arty services/comfyui/restart"
      echo "  2. Test DiffRhythm workflows"
    else
      echo ""
      echo "✗ Failed to apply patch"
      echo "You may need to manually apply the patch or check for conflicts"
      exit 1
    fi
  setup/comfyui-extensions-deps: |
    echo "========================================="
@@ -436,58 +609,6 @@ scripts:
    echo "To manage: supervisorctl status"
    echo "Web UI: http://localhost:9001 (admin/runpod2024)"
  setup/webdav: |
    echo "========================================="
    echo "  Setting Up WebDAV Mount (HiDrive)"
    echo "========================================="
    echo ""
    # Install davfs2 if not present
    if ! command -v mount.davfs >/dev/null 2>&1; then
      echo "Installing davfs2..."
      DEBIAN_FRONTEND=noninteractive apt update && DEBIAN_FRONTEND=noninteractive apt install -y davfs2
    fi
    # Create mount point
    echo "Creating mount point..."
    mkdir -p /mnt/hidrive
    # Create davfs2 secrets file
    echo "Configuring WebDAV credentials..."
    mkdir -p /etc/davfs2
    echo "https://webdav.hidrive.ionos.com/ valknar MwRTW4hR.eRbipQ" | tee /etc/davfs2/secrets > /dev/null
    chmod 600 /etc/davfs2/secrets
    # Configure davfs2
    sed -i 's/# use_locks       1/use_locks       0/' /etc/davfs2/davfs2.conf 2>/dev/null || true
    # Mount WebDAV
    echo "Mounting HiDrive WebDAV..."
    if mount -t davfs https://webdav.hidrive.ionos.com/ /mnt/hidrive; then
      echo "✓ HiDrive mounted successfully"
    else
      echo "⚠ Warning: Mount failed, you may need to mount manually"
      echo "  Try: mount -t davfs https://webdav.hidrive.ionos.com/ /mnt/hidrive"
    fi
    # Create ComfyUI output directory
    echo "Creating ComfyUI output directory..."
    mkdir -p /mnt/hidrive/users/valknar/Pictures/AI/ComfyUI
    # Create symlink in ComfyUI
    echo "Creating symlink in ComfyUI..."
    ln -sf /mnt/hidrive/users/valknar/Pictures/AI/ComfyUI $COMFYUI_ROOT/output_hidrive
    echo ""
    echo "✓ WebDAV setup complete"
    echo ""
    echo "Mount point: /mnt/hidrive"
    echo "ComfyUI output: /mnt/hidrive/users/valknar/Pictures/AI/ComfyUI"
    echo "ComfyUI symlink: $COMFYUI_ROOT/output_hidrive"
    echo ""
    echo "To unmount: umount /mnt/hidrive"
    echo "To remount: mount -t davfs https://webdav.hidrive.ionos.com/ /mnt/hidrive"
  #
  # Utility Scripts
  #
@@ -575,53 +696,6 @@ scripts:
    echo "  3. Name: multi-modal-ai-v2.0"
    echo "  4. Save and test deployment"
  #
  # Orchestration Scripts
  #
  install/minimal: |
    echo "========================================="
    echo "  Minimal Installation"
    echo "========================================="
    echo ""
    echo "Installing: System + Python + ComfyUI + Supervisor"
    echo ""
    arty run setup/system-packages && \
    arty run setup/python-env && \
    arty run setup/comfyui-base && \
    arty run setup/supervisor
    echo ""
    echo "✓ Minimal installation complete"
    echo ""
    echo "Next steps:"
    echo "  1. Download models: Use Ansible playbook"
    echo "  2. Link models: arty run models/link-comfyui"
    echo "  3. Start services: arty run services/start"
  install/essential: |
    echo "========================================="
    echo "  Essential Installation"
    echo "========================================="
    echo ""
    echo "Installing: System + Python + ComfyUI + Nodes + Supervisor"
    echo ""
    arty run setup/system-packages && \
    arty run setup/python-env && \
    arty run setup/comfyui-base && \
    arty run setup/comfyui-nodes && \
    arty run setup/supervisor
    echo ""
    echo "✓ Essential installation complete"
    echo ""
    echo "Next steps:"
    echo "  1. Download models: ansible-playbook playbook.yml --tags comfyui-essential"
    echo "  2. Link models: arty run models/link-comfyui"
    echo "  3. Link workflows: arty run workflows/link-comfyui"
    echo "  4. Start services: arty run services/start"
  install/full: |
    echo "========================================="
    echo "  Full Installation"
@@ -647,39 +721,6 @@ scripts:
    echo "  4. Configure Tailscale (see instructions above)"
    echo "  5. Start services: arty run services/start"
  #
  # Legacy Setup (deprecated - use install/* instead)
  #
  setup/full-legacy: |
    cd $AI_ROOT
    cp .env.example .env
    echo "⚠ DEPRECATED: Use 'arty run install/full' instead"
    echo "Edit .env and set HF_TOKEN, then run: ansible-playbook playbook.yml"
  setup/essential-legacy: |
    cd $AI_ROOT
    cp .env.example .env
    echo "⚠ DEPRECATED: Use 'arty run install/essential' instead"
    echo "Edit .env and set HF_TOKEN, then run: ansible-playbook playbook.yml --tags comfyui-essential"
  # Model linking (run after models are downloaded)
  models/link-comfyui: |
    cd $COMFYUI_ROOT/models/diffusers
    ln -sf $HF_CACHE/models--black-forest-labs--FLUX.1-schnell FLUX.1-schnell
    ln -sf $HF_CACHE/models--black-forest-labs--FLUX.1-dev FLUX.1-dev
    ln -sf $HF_CACHE/models--stabilityai--stable-diffusion-xl-base-1.0 stable-diffusion-xl-base-1.0
    ln -sf $HF_CACHE/models--stabilityai--stable-diffusion-xl-refiner-1.0 stable-diffusion-xl-refiner-1.0
    ln -sf $HF_CACHE/models--stabilityai--stable-diffusion-3.5-large stable-diffusion-3.5-large
    cd $COMFYUI_ROOT/models/clip_vision
    ln -sf $HF_CACHE/models--openai--clip-vit-large-patch14 clip-vit-large-patch14
    ln -sf $HF_CACHE/models--laion--CLIP-ViT-bigG-14-laion2B-39B-b160k CLIP-ViT-bigG-14
    ln -sf $HF_CACHE/models--google--siglip-so400m-patch14-384 siglip-so400m-patch14-384
    cd $COMFYUI_ROOT/models/diffusion_models
    ln -sf $HF_CACHE/models--THUDM--CogVideoX-5b CogVideoX-5b
    ln -sf $HF_CACHE/models--stabilityai--stable-video-diffusion-img2vid stable-video-diffusion-img2vid
    ln -sf $HF_CACHE/models--stabilityai--stable-video-diffusion-img2vid-xt stable-video-diffusion-img2vid-xt
    echo "Models linked to ComfyUI"
  # Workflow linking (link production workflows with category prefixes)
  workflows/link-comfyui: |
    # Create ComfyUI user workflows directory
@@ -774,38 +815,65 @@ scripts:
  # Service Management (Supervisor-based)
  #
  # All services
-  services/start: supervisorctl -c /workspace/supervisord.conf start ai-services:*
+  services/start: supervisorctl -c /workspace/supervisord.conf start all
-  services/stop: supervisorctl -c /workspace/supervisord.conf stop ai-services:*
+  services/stop: supervisorctl -c /workspace/supervisord.conf stop all
-  services/restart: supervisorctl -c /workspace/supervisord.conf restart ai-services:*
+  services/restart: supervisorctl -c /workspace/supervisord.conf restart all
  services/status: supervisorctl -c /workspace/supervisord.conf status
-  # ComfyUI service
+  # ComfyUI services group
-  services/comfyui/start: supervisorctl -c /workspace/supervisord.conf start ai-services:comfyui
+  services/comfyui-group/start: supervisorctl -c /workspace/supervisord.conf start comfyui-services:*
-  services/comfyui/stop: supervisorctl -c /workspace/supervisord.conf stop ai-services:comfyui
+  services/comfyui-group/stop: supervisorctl -c /workspace/supervisord.conf stop comfyui-services:*
-  services/comfyui/restart: supervisorctl -c /workspace/supervisord.conf restart ai-services:comfyui
+  services/comfyui-group/restart: supervisorctl -c /workspace/supervisord.conf restart comfyui-services:*
-  services/comfyui/status: supervisorctl -c /workspace/supervisord.conf status ai-services:comfyui
+  services/comfyui-group/status: supervisorctl -c /workspace/supervisord.conf status comfyui-services:*
  # services/comfyui/logs is defined once, under "# ComfyUI service" below, against
  # comfyui-services:comfyui; the stale duplicate key targeting ai-services:comfyui was removed
-  # Orchestrator service
+  # vLLM services group
-  services/orchestrator/start: supervisorctl -c /workspace/supervisord.conf start ai-services:orchestrator
+  services/vllm-group/start: supervisorctl -c /workspace/supervisord.conf start vllm-services:*
-  services/orchestrator/stop: supervisorctl -c /workspace/supervisord.conf stop ai-services:orchestrator
+  services/vllm-group/stop: supervisorctl -c /workspace/supervisord.conf stop vllm-services:*
-  services/orchestrator/restart: supervisorctl -c /workspace/supervisord.conf restart ai-services:orchestrator
+  services/vllm-group/restart: supervisorctl -c /workspace/supervisord.conf restart vllm-services:*
-  services/orchestrator/status: supervisorctl -c /workspace/supervisord.conf status ai-services:orchestrator
+  services/vllm-group/status: supervisorctl -c /workspace/supervisord.conf status vllm-services:*
-  services/orchestrator/logs: supervisorctl -c /workspace/supervisord.conf tail -f ai-services:orchestrator
+
  # ComfyUI service
  services/comfyui/start: supervisorctl -c /workspace/supervisord.conf start comfyui-services:comfyui
  services/comfyui/stop: supervisorctl -c /workspace/supervisord.conf stop comfyui-services:comfyui
  services/comfyui/restart: supervisorctl -c /workspace/supervisord.conf restart comfyui-services:comfyui
  services/comfyui/status: supervisorctl -c /workspace/supervisord.conf status comfyui-services:comfyui
  services/comfyui/logs: supervisorctl -c /workspace/supervisord.conf tail -f comfyui-services:comfyui
  # WebDAV Sync service
-  services/webdav-sync/start: supervisorctl -c /workspace/supervisord.conf start ai-services:webdav-sync
+  services/webdav-sync/start: supervisorctl -c /workspace/supervisord.conf start comfyui-services:webdav-sync
-  services/webdav-sync/stop: supervisorctl -c /workspace/supervisord.conf stop ai-services:webdav-sync
+  services/webdav-sync/stop: supervisorctl -c /workspace/supervisord.conf stop comfyui-services:webdav-sync
-  services/webdav-sync/restart: supervisorctl -c /workspace/supervisord.conf restart ai-services:webdav-sync
+  services/webdav-sync/restart: supervisorctl -c /workspace/supervisord.conf restart comfyui-services:webdav-sync
-  services/webdav-sync/status: supervisorctl -c /workspace/supervisord.conf status ai-services:webdav-sync
+  services/webdav-sync/status: supervisorctl -c /workspace/supervisord.conf status comfyui-services:webdav-sync
-  services/webdav-sync/logs: supervisorctl -c /workspace/supervisord.conf tail -f ai-services:webdav-sync
+  services/webdav-sync/logs: supervisorctl -c /workspace/supervisord.conf tail -f comfyui-services:webdav-sync
  # vLLM Qwen service
  services/vllm-qwen/start: supervisorctl -c /workspace/supervisord.conf start vllm-services:vllm-qwen
  services/vllm-qwen/stop: supervisorctl -c /workspace/supervisord.conf stop vllm-services:vllm-qwen
  services/vllm-qwen/restart: supervisorctl -c /workspace/supervisord.conf restart vllm-services:vllm-qwen
  services/vllm-qwen/status: supervisorctl -c /workspace/supervisord.conf status vllm-services:vllm-qwen
  services/vllm-qwen/logs: supervisorctl -c /workspace/supervisord.conf tail -f vllm-services:vllm-qwen
  # vLLM Llama service
  services/vllm-llama/start: supervisorctl -c /workspace/supervisord.conf start vllm-services:vllm-llama
  services/vllm-llama/stop: supervisorctl -c /workspace/supervisord.conf stop vllm-services:vllm-llama
  services/vllm-llama/restart: supervisorctl -c /workspace/supervisord.conf restart vllm-services:vllm-llama
  services/vllm-llama/status: supervisorctl -c /workspace/supervisord.conf status vllm-services:vllm-llama
  services/vllm-llama/logs: supervisorctl -c /workspace/supervisord.conf tail -f vllm-services:vllm-llama
  # vLLM Embedding service
  services/vllm-embedding/start: supervisorctl -c /workspace/supervisord.conf start vllm-services:vllm-embedding
  services/vllm-embedding/stop: supervisorctl -c /workspace/supervisord.conf stop vllm-services:vllm-embedding
  services/vllm-embedding/restart: supervisorctl -c /workspace/supervisord.conf restart vllm-services:vllm-embedding
  services/vllm-embedding/status: supervisorctl -c /workspace/supervisord.conf status vllm-services:vllm-embedding
  services/vllm-embedding/logs: supervisorctl -c /workspace/supervisord.conf tail -f vllm-services:vllm-embedding
  #
  # Health Checks
  #
  health/orchestrator: curl http://localhost:9000/health
  health/comfyui: curl http://localhost:8188
-  health/vllm: curl http://localhost:8000/health
+  health/vllm-qwen: curl http://localhost:8000/health
  health/vllm-llama: curl http://localhost:8001/health
  health/vllm-embedding: curl http://localhost:8002/health
  #
  # System Checks
--- a/comfyui/patches/diffrhythm-llamaconfig-fix.patch
+++ b/comfyui/patches/diffrhythm-llamaconfig-fix.patch
@@ -0,0 +1,56 @@
 diff --git a/__init__.py b/__init__.py
 index 1234567..abcdefg 100644
 --- a/__init__.py
 +++ b/__init__.py
@@ -1,3 +1,51 @@
 +"""
 +DiffRhythm ComfyUI Node with LlamaConfig Patch
 +
 +PATCH: Fixes "The size of tensor a (32) must match the size of tensor b (64)" error
 +in DiffRhythm's rotary position embeddings by patching LlamaConfig initialization.
 +
 +Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
 +num_key_value_heads when creating LlamaConfig, causing transformers 4.49.0+
 +to incorrectly infer head_dim = 32 instead of 64.
 +
 +Solution: Patch LlamaConfig globally before importing DiffRhythmNode.
 +
 +Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
 +Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
 +
 +Patch author: valknar@pivoine.art
 +"""
 +
 +# CRITICAL: Patch LlamaConfig BEFORE importing DiffRhythmNode
 +from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
 +
 +class PatchedLlamaConfig(_OriginalLlamaConfig):
 +    """
 +    Patched LlamaConfig that automatically adds missing attention head parameters.
 +
 +    Standard Llama architecture assumptions:
 +    - head_dim = 64 (fixed)
 +    - num_attention_heads = hidden_size // head_dim
 +    - num_key_value_heads = num_attention_heads // 4 (for GQA)
 +    """
 +    def __init__(self, *args, **kwargs):
 +        # If hidden_size is provided but num_attention_heads is not, calculate it
 +        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
 +            hidden_size = kwargs['hidden_size']
 +            kwargs['num_attention_heads'] = hidden_size // 64
 +
 +        # If num_key_value_heads is not provided, use GQA configuration
 +        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
 +            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
 +
 +        super().__init__(*args, **kwargs)
 +
 +# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
 +import transformers.models.llama
 +transformers.models.llama.LlamaConfig = PatchedLlamaConfig
 +import transformers.models.llama.modeling_llama
 +transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
 +
 from .DiffRhythmNode import  NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
 __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
--- a/comfyui/requirements.txt
+++ b/comfyui/requirements.txt
@@ -1,7 +1,7 @@
 torch
 torchvision
 torchaudio
-transformers
+transformers==4.49.0
 diffusers>=0.31.0
 accelerate
 safetensors
@@ -19,3 +19,4 @@ insightface
 onnxruntime
 pyyaml
 imageio-ffmpeg
 torchcodec
--- a/comfyui/workflows/image-to-video/i2v_hunyuan-i2v-v1-robot.webp
+++ b/comfyui/workflows/image-to-video/i2v_hunyuan-i2v-v1-robot.webp
--- a/comfyui/workflows/image-to-video/i2v_hunyuan-i2v-v2-fennec.webp
+++ b/comfyui/workflows/image-to-video/i2v_hunyuan-i2v-v2-fennec.webp
--- a/comfyui/workflows/image-to-video/i2v_hunyuan-t2v-kitchen.webp
+++ b/comfyui/workflows/image-to-video/i2v_hunyuan-t2v-kitchen.webp
--- a/comfyui/workflows/image-to-video/i2v_hunyuan15-i2v-720p.json
+++ b/comfyui/workflows/image-to-video/i2v_hunyuan15-i2v-720p.json
--- a/comfyui/workflows/image-to-video/i2v_hunyuan15-t2v-720p.json
+++ b/comfyui/workflows/image-to-video/i2v_hunyuan15-t2v-720p.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-animate.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-animate.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-flf2v.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-flf2v.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-fun-camera.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-fun-camera.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-fun-control.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-fun-control.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-i2v.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-i2v.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-s2v.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-s2v.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-14b-t2v.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-14b-t2v.json
--- a/comfyui/workflows/image-to-video/i2v_wan22-5b-ti2v.json
+++ b/comfyui/workflows/image-to-video/i2v_wan22-5b-ti2v.json
@@ -0,0 +1,733 @@
 {
  "id": "91f6bbe2-ed41-4fd6-bac7-71d5b5864ecb",
  "revision": 0,
  "last_node_id": 59,
  "last_link_id": 108,
  "nodes": [
    {
      "id": 37,
      "type": "UNETLoader",
      "pos": [
        -30,
        50
      ],
      "size": [
        346.7470703125,
        82
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "slot_index": 0,
          "links": [
            94
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "UNETLoader",
        "models": [
          {
            "name": "wan2.2_ti2v_5B_fp16.safetensors",
            "url": "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/diffusion_models/wan2.2_ti2v_5B_fp16.safetensors",
            "directory": "diffusion_models"
          }
        ]
      },
      "widgets_values": [
        "wan2.2_ti2v_5B_fp16.safetensors",
        "default"
      ]
    },
    {
      "id": 38,
      "type": "CLIPLoader",
      "pos": [
        -30,
        190
      ],
      "size": [
        350,
        110
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "slot_index": 0,
          "links": [
            74,
            75
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "CLIPLoader",
        "models": [
          {
            "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
            "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors",
            "directory": "text_encoders"
          }
        ]
      },
      "widgets_values": [
        "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
        "wan",
        "default"
      ]
    },
    {
      "id": 39,
      "type": "VAELoader",
      "pos": [
        -30,
        350
      ],
      "size": [
        350,
        60
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "VAE",
          "type": "VAE",
          "slot_index": 0,
          "links": [
            76,
            105
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "VAELoader",
        "models": [
          {
            "name": "wan2.2_vae.safetensors",
            "url": "https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/vae/wan2.2_vae.safetensors",
            "directory": "vae"
          }
        ]
      },
      "widgets_values": [
        "wan2.2_vae.safetensors"
      ]
    },
    {
      "id": 8,
      "type": "VAEDecode",
      "pos": [
        1190,
        150
      ],
      "size": [
        210,
        46
      ],
      "flags": {},
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "link": 35
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 76
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "slot_index": 0,
          "links": [
            107
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "VAEDecode"
      },
      "widgets_values": []
    },
    {
      "id": 57,
      "type": "CreateVideo",
      "pos": [
        1200,
        240
      ],
      "size": [
        270,
        78
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 107
        },
        {
          "name": "audio",
          "shape": 7,
          "type": "AUDIO",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "VIDEO",
          "type": "VIDEO",
          "links": [
            108
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "CreateVideo"
      },
      "widgets_values": [
        24
      ]
    },
    {
      "id": 58,
      "type": "SaveVideo",
      "pos": [
        1200,
        370
      ],
      "size": [
        660,
        450
      ],
      "flags": {},
      "order": 12,
      "mode": 0,
      "inputs": [
        {
          "name": "video",
          "type": "VIDEO",
          "link": 108
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "SaveVideo"
      },
      "widgets_values": [
        "video/ComfyUI",
        "auto",
        "auto"
      ]
    },
    {
      "id": 55,
      "type": "Wan22ImageToVideoLatent",
      "pos": [
        380,
        540
      ],
      "size": [
        271.9126892089844,
        150
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "vae",
          "type": "VAE",
          "link": 105
        },
        {
          "name": "start_image",
          "shape": 7,
          "type": "IMAGE",
          "link": 106
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": [
            104
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "Wan22ImageToVideoLatent"
      },
      "widgets_values": [
        1280,
        704,
        121,
        1
      ]
    },
    {
      "id": 56,
      "type": "LoadImage",
      "pos": [
        0,
        540
      ],
      "size": [
        274.080078125,
        314
      ],
      "flags": {},
      "order": 3,
      "mode": 4,
      "inputs": [],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            106
          ]
        },
        {
          "name": "MASK",
          "type": "MASK",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "LoadImage"
      },
      "widgets_values": [
        "example.png",
        "image"
      ]
    },
    {
      "id": 7,
      "type": "CLIPTextEncode",
      "pos": [
        380,
        260
      ],
      "size": [
        425.27801513671875,
        180.6060791015625
      ],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 75
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            52
          ]
        }
      ],
      "title": "CLIP Text Encode (Negative Prompt)",
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 6,
      "type": "CLIPTextEncode",
      "pos": [
        380,
        50
      ],
      "size": [
        422.84503173828125,
        164.31304931640625
      ],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 74
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "slot_index": 0,
          "links": [
            46
          ]
        }
      ],
      "title": "CLIP Text Encode (Positive Prompt)",
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "CLIPTextEncode"
      },
      "widgets_values": [
        "Low contrast. In a retro 1970s-style subway station, a street musician plays in dim colors and rough textures. He wears an old jacket, playing guitar with focus. Commuters hurry by, and a small crowd gathers to listen. The camera slowly moves right, capturing the blend of music and city noise, with old subway signs and mottled walls in the background."
      ],
      "color": "#232",
      "bgcolor": "#353"
    },
    {
      "id": 3,
      "type": "KSampler",
      "pos": [
        850,
        130
      ],
      "size": [
        315,
        262
      ],
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 95
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 46
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 52
        },
        {
          "name": "latent_image",
          "type": "LATENT",
          "link": 104
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "slot_index": 0,
          "links": [
            35
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "KSampler"
      },
      "widgets_values": [
        898471028164125,
        "randomize",
        20,
        5,
        "uni_pc",
        "simple",
        1
      ]
    },
    {
      "id": 48,
      "type": "ModelSamplingSD3",
      "pos": [
        850,
        20
      ],
      "size": [
        210,
        58
      ],
      "flags": {
        "collapsed": false
      },
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 94
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "slot_index": 0,
          "links": [
            95
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.45",
        "Node name for S&R": "ModelSamplingSD3"
      },
      "widgets_values": [
        8
      ]
    },
    {
      "id": 59,
      "type": "MarkdownNote",
      "pos": [
        -550,
        10
      ],
      "size": [
        480,
        340
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "title": "Model Links",
      "properties": {},
      "widgets_values": [
        "[Tutorial](https://docs.comfy.org/tutorials/video/wan/wan2_2) \n\n**Diffusion Model**\n- [wan2.2_ti2v_5B_fp16.safetensors](https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/diffusion_models/wan2.2_ti2v_5B_fp16.safetensors)\n\n**VAE**\n- [wan2.2_vae.safetensors](https://huggingface.co/Comfy-Org/Wan_2.2_ComfyUI_Repackaged/resolve/main/split_files/vae/wan2.2_vae.safetensors)\n\n**Text Encoder**   \n- [umt5_xxl_fp8_e4m3fn_scaled.safetensors](https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors)\n\n\nFile save location\n\n```\nComfyUI/\n├───📂 models/\n│   ├───📂 diffusion_models/\n│   │   └───wan2.2_ti2v_5B_fp16.safetensors\n│   ├───📂 text_encoders/\n│   │   └─── umt5_xxl_fp8_e4m3fn_scaled.safetensors \n│   └───📂 vae/\n│       └── wan2.2_vae.safetensors\n```\n"
      ],
      "color": "#432",
      "bgcolor": "#653"
    }
  ],
  "links": [
    [
      35,
      3,
      0,
      8,
      0,
      "LATENT"
    ],
    [
      46,
      6,
      0,
      3,
      1,
      "CONDITIONING"
    ],
    [
      52,
      7,
      0,
      3,
      2,
      "CONDITIONING"
    ],
    [
      74,
      38,
      0,
      6,
      0,
      "CLIP"
    ],
    [
      75,
      38,
      0,
      7,
      0,
      "CLIP"
    ],
    [
      76,
      39,
      0,
      8,
      1,
      "VAE"
    ],
    [
      94,
      37,
      0,
      48,
      0,
      "MODEL"
    ],
    [
      95,
      48,
      0,
      3,
      0,
      "MODEL"
    ],
    [
      104,
      55,
      0,
      3,
      3,
      "LATENT"
    ],
    [
      105,
      39,
      0,
      55,
      0,
      "VAE"
    ],
    [
      106,
      56,
      0,
      55,
      1,
      "IMAGE"
    ],
    [
      107,
      8,
      0,
      57,
      0,
      "IMAGE"
    ],
    [
      108,
      57,
      0,
      58,
      0,
      "VIDEO"
    ]
  ],
  "groups": [
    {
      "id": 1,
      "title": "Step1 - Load models",
      "bounding": [
        -50,
        -20,
        400,
        453.6000061035156
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 2,
      "title": "Step3 - Prompt",
      "bounding": [
        370,
        -20,
        448.27801513671875,
        473.2060852050781
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 3,
      "title": "For i2v, use Ctrl + B to enable",
      "bounding": [
        -50,
        450,
        400,
        420
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 4,
      "title": "Video Size & length",
      "bounding": [
        370,
        470,
        291.9127197265625,
        233.60000610351562
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    }
  ],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.46462425349300085,
      "offset": [
        847.5372059811432,
        288.7938392118285
      ]
    },
    "frontendVersion": "1.27.10",
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
 }
--- a/comfyui/workflows/image-to-video/wan22-animate-original-video.mp4
+++ b/comfyui/workflows/image-to-video/wan22-animate-original-video.mp4
--- a/comfyui/workflows/image-to-video/wan22-animate-ref-image.png
+++ b/comfyui/workflows/image-to-video/wan22-animate-ref-image.png
--- a/comfyui/workflows/image-to-video/wan22-flf2v-end.png
+++ b/comfyui/workflows/image-to-video/wan22-flf2v-end.png
--- a/comfyui/workflows/image-to-video/wan22-flf2v-start.png
+++ b/comfyui/workflows/image-to-video/wan22-flf2v-start.png
--- a/comfyui/workflows/image-to-video/wan22-fun-camera-input.jpg
+++ b/comfyui/workflows/image-to-video/wan22-fun-camera-input.jpg
--- a/comfyui/workflows/image-to-video/wan22-i2v-input.jpg
+++ b/comfyui/workflows/image-to-video/wan22-i2v-input.jpg
--- a/comfyui/workflows/text-to-music/acestep-m2m-editing-v1.json
+++ b/comfyui/workflows/text-to-music/acestep-m2m-editing-v1.json
@@ -0,0 +1,865 @@
 {
  "id": "88ac5dad-efd7-40bb-84fe-fbaefdee1fa9",
  "revision": 0,
  "last_node_id": 75,
  "last_link_id": 138,
  "nodes": [
    {
      "id": 49,
      "type": "LatentApplyOperationCFG",
      "pos": [
        940,
        -160
      ],
      "size": [
        290,
        50
      ],
      "flags": {
        "collapsed": false
      },
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 113
        },
        {
          "name": "operation",
          "type": "LATENT_OPERATION",
          "link": 114
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            121
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "LatentApplyOperationCFG"
      },
      "widgets_values": []
    },
    {
      "id": 40,
      "type": "CheckpointLoaderSimple",
      "pos": [
        180,
        -160
      ],
      "size": [
        370,
        98
      ],
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            115
          ]
        },
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            80
          ]
        },
        {
          "name": "VAE",
          "type": "VAE",
          "links": [
            83,
            137
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "CheckpointLoaderSimple",
        "models": [
          {
            "name": "ace_step_v1_3.5b.safetensors",
            "url": "https://huggingface.co/Comfy-Org/ACE-Step_ComfyUI_repackaged/resolve/main/all_in_one/ace_step_v1_3.5b.safetensors?download=true",
            "directory": "checkpoints"
          }
        ]
      },
      "widgets_values": [
        "ace_step_v1_3.5b.safetensors"
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 48,
      "type": "MarkdownNote",
      "pos": [
        -460,
        -200
      ],
      "size": [
        610,
        820
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "title": "About ACE Step and Multi-language Input",
      "properties": {},
      "widgets_values": [
        "[Tutorial](http://docs.comfy.org/tutorials/audio/ace-step/ace-step-v1) | [教程](http://docs.comfy.org/zh-CN/tutorials/audio/ace-step/ace-step-v1)\n\n\n### Model Download\n\nDownload the following model and save it to the **ComfyUI/models/checkpoints** folder.\n[ace_step_v1_3.5b.safetensors](https://huggingface.co/Comfy-Org/ACE-Step_ComfyUI_repackaged/blob/main/all_in_one/ace_step_v1_3.5b.safetensors)\n\n\n### Multilingual Support\n\nCurrently, the implementation of multi-language support for ACE-Step V1 is achieved by uniformly converting different languages into English characters. At present, in ComfyUI, we haven't implemented the step of converting multi-languages into English. This is because if we need to implement the corresponding conversion, we have to add additional core dependencies of ComfyUI, which may lead to uncertain dependency conflicts.\n\nSo, currently, if you need to input multi-language text, you have to manually convert it into English characters to complete this process. Then, at the beginning of the corresponding `lyrics`, input the abbreviation of the corresponding language code.\n\nFor example, for Chinese, use `[zh]`, for Japanese use `[ja]`, for Korean use `[ko]`, and so on. For specific language input, please check the examples in the instructions. \n\nFor example, Chinese `[zh]`, Japanese `[ja]`, Korean `[ko]`, etc.\n\nExample:\n\n```\n[verse]\n\n[zh]wo3zou3guo4shen1ye4de5jie1dao4\n[zh]leng3feng1chui1luan4si1nian4de5piao4liang4wai4tao4\n[zh]ni3de5wei1xiao4xiang4xing1guang1hen3xuan4yao4\n[zh]zhao4liang4le5wo3gu1du2de5mei3fen1mei3miao3\n\n[chorus]\n\n[verse]\n[ko]hamkke si-kkeuleo-un sesang-ui sodong-eul pihae\n[ko]honja ogsang-eseo dalbich-ui eolyeompus-ileul balaboda\n[ko]niga salang-eun lideum-i ganghan eum-ag gatdago malhaess-eo\n[ko]han ta han tamada ma-eum-ui ondoga eolmana heojeonhanji ijge hae\n\n[bridge]\n[es]cantar mi anhelo por ti sin ocultar\n[es]como poesía y pintura, lleno de anhelo indescifrable\n[es]tu sombra es tan terca como el viento, inborrable\n[es]persiguiéndote en vuelo, brilla como cruzar una mar de nubes\n\n[chorus]\n[fr]que tu sois le vent qui souffle sur ma main\n[fr]un contact chaud comme la douce pluie printanière\n[fr]que tu sois le vent qui s'entoure de mon corps\n[fr]un amour profond qui ne s'éloignera jamais\n\n```\n\n---\n\n### 模型下载\n\n下载下面的模型并保存到 **ComfyUI/models/checkpoints** 文件夹下\n[ace_step_v1_3.5b.safetensors](https://huggingface.co/Comfy-Org/ACE-Step_ComfyUI_repackaged/blob/main/all_in_one/ace_step_v1_3.5b.safetensors)\n\n\n### 多语言支持\n\n目前 ACE-Step V1 多语言的实现是通过将不同语言统一转换为英文字符来实现的，目前在 ComfyUI 中我们并没有实现多语言转换为英文的这一步骤。因为如果需要实现对应转换，则需要增加额外的 ComfyUI 核心依赖，这将可能带来不确定的依赖冲突。\n\n所以目前如果你需要输入多语言，则需要手动转换为英文字符来实现这一过程，然后在对应 `lyrics` 开头输入对应语言代码的缩写。\n\n比如中文`[zh]` 日语 `[ja]` 韩语 `[ko]` 等，具体语言输入请查看说明中的示例\n\n"
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 18,
      "type": "VAEDecodeAudio",
      "pos": [
        1080,
        270
      ],
      "size": [
        150.93612670898438,
        46
      ],
      "flags": {
        "collapsed": false
      },
      "order": 13,
      "mode": 0,
      "inputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "link": 122
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 83
        }
      ],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [
            126,
            127,
            128
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "VAEDecodeAudio"
      },
      "widgets_values": []
    },
    {
      "id": 60,
      "type": "SaveAudio",
      "pos": [
        1260,
        40
      ],
      "size": [
        610,
        112
      ],
      "flags": {},
      "order": 15,
      "mode": 4,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 127
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "SaveAudio"
      },
      "widgets_values": [
        "audio/ComfyUI"
      ]
    },
    {
      "id": 61,
      "type": "SaveAudioOpus",
      "pos": [
        1260,
        220
      ],
      "size": [
        610,
        136
      ],
      "flags": {},
      "order": 16,
      "mode": 4,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 128
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "SaveAudioOpus"
      },
      "widgets_values": [
        "audio/ComfyUI",
        "128k"
      ]
    },
    {
      "id": 44,
      "type": "ConditioningZeroOut",
      "pos": [
        600,
        70
      ],
      "size": [
        197.712890625,
        26
      ],
      "flags": {
        "collapsed": true
      },
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "link": 108
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            120
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "ConditioningZeroOut"
      },
      "widgets_values": []
    },
    {
      "id": 51,
      "type": "ModelSamplingSD3",
      "pos": [
        590,
        -40
      ],
      "size": [
        330,
        60
      ],
      "flags": {
        "collapsed": false
      },
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 115
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            113
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "ModelSamplingSD3"
      },
      "widgets_values": [
        5.000000000000001
      ]
    },
    {
      "id": 50,
      "type": "LatentOperationTonemapReinhard",
      "pos": [
        590,
        -160
      ],
      "size": [
        330,
        58
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "LATENT_OPERATION",
          "type": "LATENT_OPERATION",
          "links": [
            114
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "LatentOperationTonemapReinhard"
      },
      "widgets_values": [
        1.0000000000000002
      ]
    },
    {
      "id": 17,
      "type": "EmptyAceStepLatentAudio",
      "pos": [
        180,
        50
      ],
      "size": [
        370,
        82
      ],
      "flags": {},
      "order": 3,
      "mode": 4,
      "inputs": [],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": []
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "EmptyAceStepLatentAudio"
      },
      "widgets_values": [
        120,
        1
      ]
    },
    {
      "id": 68,
      "type": "VAEEncodeAudio",
      "pos": [
        180,
        180
      ],
      "size": [
        370,
        46
      ],
      "flags": {},
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 136
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 137
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": [
            138
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "VAEEncodeAudio"
      },
      "widgets_values": []
    },
    {
      "id": 64,
      "type": "LoadAudio",
      "pos": [
        180,
        340
      ],
      "size": [
        370,
        140
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [
            136
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "LoadAudio"
      },
      "widgets_values": [
        "audio_ace_step_1_t2a_song-1.mp3",
        null,
        null
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 52,
      "type": "KSampler",
      "pos": [
        940,
        -40
      ],
      "size": [
        290,
        262
      ],
      "flags": {},
      "order": 12,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 121
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 117
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 120
        },
        {
          "name": "latent_image",
          "type": "LATENT",
          "link": 138
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "slot_index": 0,
          "links": [
            122
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "KSampler"
      },
      "widgets_values": [
        938549746349002,
        "randomize",
        50,
        5,
        "euler",
        "simple",
        0.30000000000000004
      ]
    },
    {
      "id": 59,
      "type": "SaveAudioMP3",
      "pos": [
        1260,
        -160
      ],
      "size": [
        610,
        136
      ],
      "flags": {},
      "order": 14,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 126
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "SaveAudioMP3"
      },
      "widgets_values": [
        "audio/ComfyUI",
        "V0"
      ]
    },
    {
      "id": 73,
      "type": "Note",
      "pos": [
        1260,
        410
      ],
      "size": [
        610,
        90
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "properties": {},
      "widgets_values": [
        "These nodes can save audio in different formats. Currently, only the MP3 node is enabled; the other two are set to Bypass. You can enable them as needed.\n\n这些节点可以将 audio  保存成不同格式，目前仅启用了 MP3 节点，其余节点为 Bypass 模式，你可以按你的需要来启用"
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 14,
      "type": "TextEncodeAceStepAudio",
      "pos": [
        590,
        120
      ],
      "size": [
        340,
        500
      ],
      "flags": {},
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 80
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            108,
            117
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "TextEncodeAceStepAudio"
      },
      "widgets_values": [
        "anime, cute female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted",
        "[verse]\nフワフワ　オミミガ\nユレルヨ　カゼノナカ\nキラキラ　アオイメ\nミツメル　セカイヲ\n\n[verse]\nフワフワ　シッポハ\nオオキク　ユレルヨ\nキンイロ　カミノケ\nナビクヨ　カゼノナカ\n\n[verse]\nコンフィーユーアイノ\nマモリビト\nピンクノ　セーターデ\nエガオヲ　クレルヨ\n\nアオイロ　スカートト\nクロイコート　キンノモヨウ\nヤサシイ　ヒカリガ\nツツムヨ　フェネックガール\n\n[verse]\nフワフワ　オミミデ\nキコエル　ココロノ　コエ\nダイスキ　フェネックガール\nイツデモ　ソバニイルヨ",
        0.9900000000000002
      ]
    },
    {
      "id": 75,
      "type": "MarkdownNote",
      "pos": [
        950,
        410
      ],
      "size": [
        280,
        210
      ],
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "title": "About Repainting",
      "properties": {},
      "widgets_values": [
        "Providing the lyrics of the original song or the modified lyrics is very important for the output of repainting or editing. \n\nAdjust the value of the **denoise** parameter in KSampler. The larger the value, the lower the similarity between the output audio and the original audio.\n\n提供原始歌曲的歌词或者修改后的歌词对于音频编辑的输出是非常重要的，调整 KSampler 中的  denoise 参数的数值，数值越大输出的音频与原始音频相似度越低"
      ],
      "color": "#432",
      "bgcolor": "#653"
    }
  ],
  "links": [
    [
      80,
      40,
      1,
      14,
      0,
      "CLIP"
    ],
    [
      83,
      40,
      2,
      18,
      1,
      "VAE"
    ],
    [
      108,
      14,
      0,
      44,
      0,
      "CONDITIONING"
    ],
    [
      113,
      51,
      0,
      49,
      0,
      "MODEL"
    ],
    [
      114,
      50,
      0,
      49,
      1,
      "LATENT_OPERATION"
    ],
    [
      115,
      40,
      0,
      51,
      0,
      "MODEL"
    ],
    [
      117,
      14,
      0,
      52,
      1,
      "CONDITIONING"
    ],
    [
      120,
      44,
      0,
      52,
      2,
      "CONDITIONING"
    ],
    [
      121,
      49,
      0,
      52,
      0,
      "MODEL"
    ],
    [
      122,
      52,
      0,
      18,
      0,
      "LATENT"
    ],
    [
      126,
      18,
      0,
      59,
      0,
      "AUDIO"
    ],
    [
      127,
      18,
      0,
      60,
      0,
      "AUDIO"
    ],
    [
      128,
      18,
      0,
      61,
      0,
      "AUDIO"
    ],
    [
      136,
      64,
      0,
      68,
      0,
      "AUDIO"
    ],
    [
      137,
      40,
      2,
      68,
      1,
      "VAE"
    ],
    [
      138,
      68,
      0,
      52,
      3,
      "LATENT"
    ]
  ],
  "groups": [
    {
      "id": 1,
      "title": "Load model here",
      "bounding": [
        170,
        -230,
        390,
        180
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 4,
      "title": "Latent",
      "bounding": [
        170,
        -30,
        390,
        280
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 5,
      "title": "Adjust the vocal volume",
      "bounding": [
        580,
        -230,
        350,
        140
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 6,
      "title": "For repainting",
      "bounding": [
        170,
        270,
        390,
        223.60000610351562
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 7,
      "title": "Output",
      "bounding": [
        1250,
        -230,
        630,
        760
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    }
  ],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.6830134553650705,
      "offset": [
        785.724285521853,
        434.02395631202546
      ]
    },
    "frontendVersion": "1.19.9",
    "node_versions": {
      "comfy-core": "0.3.34",
      "ace-step": "06f751d65491c9077fa2bc9b06d2c6f2a90e4c56"
    },
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
 }
--- a/comfyui/workflows/text-to-music/acestep-m2m-input.mp3
+++ b/comfyui/workflows/text-to-music/acestep-m2m-input.mp3
--- a/comfyui/workflows/text-to-music/acestep-m2m-output.mp3
+++ b/comfyui/workflows/text-to-music/acestep-m2m-output.mp3
--- a/comfyui/workflows/text-to-music/acestep-official-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/acestep-official-t2m-v1.json
@@ -0,0 +1,841 @@
 {
  "id": "88ac5dad-efd7-40bb-84fe-fbaefdee1fa9",
  "revision": 0,
  "last_node_id": 73,
  "last_link_id": 137,
  "nodes": [
    {
      "id": 49,
      "type": "LatentApplyOperationCFG",
      "pos": [
        940,
        -160
      ],
      "size": [
        290,
        50
      ],
      "flags": {
        "collapsed": false
      },
      "order": 9,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 113
        },
        {
          "name": "operation",
          "type": "LATENT_OPERATION",
          "link": 114
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            121
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "LatentApplyOperationCFG"
      },
      "widgets_values": []
    },
    {
      "id": 64,
      "type": "LoadAudio",
      "pos": [
        180,
        340
      ],
      "size": [
        370,
        140
      ],
      "flags": {},
      "order": 0,
      "mode": 4,
      "inputs": [],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [
            136
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "LoadAudio"
      },
      "widgets_values": [
        "ace_step_example.flac",
        null,
        null
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 68,
      "type": "VAEEncodeAudio",
      "pos": [
        180,
        180
      ],
      "size": [
        370,
        46
      ],
      "flags": {},
      "order": 8,
      "mode": 4,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 136
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 137
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": null
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "VAEEncodeAudio"
      },
      "widgets_values": []
    },
    {
      "id": 40,
      "type": "CheckpointLoaderSimple",
      "pos": [
        180,
        -160
      ],
      "size": [
        370,
        98
      ],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            115
          ]
        },
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            80
          ]
        },
        {
          "name": "VAE",
          "type": "VAE",
          "links": [
            83,
            137
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "CheckpointLoaderSimple",
        "models": [
          {
            "name": "ace_step_v1_3.5b.safetensors",
            "url": "https://huggingface.co/Comfy-Org/ACE-Step_ComfyUI_repackaged/resolve/main/all_in_one/ace_step_v1_3.5b.safetensors?download=true",
            "directory": "checkpoints"
          }
        ]
      },
      "widgets_values": [
        "ace_step_v1_3.5b.safetensors"
      ],
      "color": "#322",
      "bgcolor": "#533"
    },
    {
      "id": 48,
      "type": "MarkdownNote",
      "pos": [
        -460,
        -200
      ],
      "size": [
        610,
        820
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "title": "About ACE Step and Multi-language Input",
      "properties": {},
      "widgets_values": [
        "[Tutorial](http://docs.comfy.org/tutorials/audio/ace-step/ace-step-v1) | [教程](http://docs.comfy.org/zh-CN/tutorials/audio/ace-step/ace-step-v1)\n\n\n### Model Download\n\nDownload the following model and save it to the **ComfyUI/models/checkpoints** folder.\n[ace_step_v1_3.5b.safetensors](https://huggingface.co/Comfy-Org/ACE-Step_ComfyUI_repackaged/blob/main/all_in_one/ace_step_v1_3.5b.safetensors)\n\n\n### Multilingual Support\n\nCurrently, the implementation of multi-language support for ACE-Step V1 is achieved by uniformly converting different languages into English characters. At present, in ComfyUI, we haven't implemented the step of converting multi-languages into English. This is because if we need to implement the corresponding conversion, we have to add additional core dependencies of ComfyUI, which may lead to uncertain dependency conflicts.\n\nSo, currently, if you need to input multi-language text, you have to manually convert it into English characters to complete this process. Then, at the beginning of the corresponding `lyrics`, input the abbreviation of the corresponding language code.\n\nFor example, for Chinese, use `[zh]`, for Japanese use `[ja]`, for Korean use `[ko]`, and so on. For specific language input, please check the examples in the instructions. 
\n\nFor example, Chinese `[zh]`, Japanese `[ja]`, Korean `[ko]`, etc.\n\nExample:\n\n```\n[verse]\n\n[zh]wo3zou3guo4shen1ye4de5jie1dao4\n[zh]leng3feng1chui1luan4si1nian4de5piao4liang4wai4tao4\n[zh]ni3de5wei1xiao4xiang4xing1guang1hen3xuan4yao4\n[zh]zhao4liang4le5wo3gu1du2de5mei3fen1mei3miao3\n\n[chorus]\n\n[verse]\n[ko]hamkke si-kkeuleo-un sesang-ui sodong-eul pihae\n[ko]honja ogsang-eseo dalbich-ui eolyeompus-ileul balaboda\n[ko]niga salang-eun lideum-i ganghan eum-ag gatdago malhaess-eo\n[ko]han ta han tamada ma-eum-ui ondoga eolmana heojeonhanji ijge hae\n\n[bridge]\n[es]cantar mi anhelo por ti sin ocultar\n[es]como poesía y pintura, lleno de anhelo indescifrable\n[es]tu sombra es tan terca como el viento, inborrable\n[es]persiguiéndote en vuelo, brilla como cruzar una mar de nubes\n\n[chorus]\n[fr]que tu sois le vent qui souffle sur ma main\n[fr]un contact chaud comme la douce pluie printanière\n[fr]que tu sois le vent qui s'entoure de mon corps\n[fr]un amour profond qui ne s'éloignera jamais\n\n```\n\n---\n\n### 模型下载\n\n下载下面的模型并保存到 **ComfyUI/models/checkpoints** 文件夹下\n[ace_step_v1_3.5b.safetensors](https://huggingface.co/Comfy-Org/ACE-Step_ComfyUI_repackaged/blob/main/all_in_one/ace_step_v1_3.5b.safetensors)\n\n\n### 多语言支持\n\n目前 ACE-Step V1 多语言的实现是通过将不同语言统一转换为英文字符来实现的，目前在 ComfyUI 中我们并没有实现多语言转换为英文的这一步骤。因为如果需要实现对应转换，则需要增加额外的 ComfyUI 核心依赖，这将可能带来不确定的依赖冲突。\n\n所以目前如果你需要输入多语言，则需要手动转换为英文字符来实现这一过程，然后在对应 `lyrics` 开头输入对应语言代码的缩写。\n\n比如中文`[zh]` 日语 `[ja]` 韩语 `[ko]` 等，具体语言输入请查看说明中的示例\n\n"
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 18,
      "type": "VAEDecodeAudio",
      "pos": [
        1080,
        270
      ],
      "size": [
        150.93612670898438,
        46
      ],
      "flags": {
        "collapsed": false
      },
      "order": 12,
      "mode": 0,
      "inputs": [
        {
          "name": "samples",
          "type": "LATENT",
          "link": 122
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": 83
        }
      ],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [
            126,
            127,
            128
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "VAEDecodeAudio"
      },
      "widgets_values": []
    },
    {
      "id": 60,
      "type": "SaveAudio",
      "pos": [
        1260,
        40
      ],
      "size": [
        610,
        112
      ],
      "flags": {},
      "order": 14,
      "mode": 4,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 127
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "SaveAudio"
      },
      "widgets_values": [
        "audio/ComfyUI"
      ]
    },
    {
      "id": 61,
      "type": "SaveAudioOpus",
      "pos": [
        1260,
        220
      ],
      "size": [
        610,
        136
      ],
      "flags": {},
      "order": 15,
      "mode": 4,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 128
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "SaveAudioOpus"
      },
      "widgets_values": [
        "audio/ComfyUI",
        "128k"
      ]
    },
    {
      "id": 73,
      "type": "Note",
      "pos": [
        1260,
        410
      ],
      "size": [
        610,
        90
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [],
      "outputs": [],
      "properties": {},
      "widgets_values": [
        "These nodes can save audio in different formats. Currently, SaveAudio and SaveAudioOpus are set to Bypass, while SaveAudioMP3 is enabled. You can enable them as per your needs.\n\n这些节点可以将 audio 保存成不同格式，目前 SaveAudio 和 SaveAudioOpus 为 Bypass 模式，SaveAudioMP3 已启用，你可以按你的需要来启用"
      ],
      "color": "#432",
      "bgcolor": "#653"
    },
    {
      "id": 44,
      "type": "ConditioningZeroOut",
      "pos": [
        600,
        70
      ],
      "size": [
        197.712890625,
        26
      ],
      "flags": {
        "collapsed": true
      },
      "order": 10,
      "mode": 0,
      "inputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "link": 108
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            120
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "ConditioningZeroOut"
      },
      "widgets_values": []
    },
    {
      "id": 51,
      "type": "ModelSamplingSD3",
      "pos": [
        590,
        -40
      ],
      "size": [
        330,
        60
      ],
      "flags": {
        "collapsed": false
      },
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 115
        }
      ],
      "outputs": [
        {
          "name": "MODEL",
          "type": "MODEL",
          "links": [
            113
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "ModelSamplingSD3"
      },
      "widgets_values": [
        5.000000000000001
      ]
    },
    {
      "id": 50,
      "type": "LatentOperationTonemapReinhard",
      "pos": [
        590,
        -160
      ],
      "size": [
        330,
        58
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "LATENT_OPERATION",
          "type": "LATENT_OPERATION",
          "links": [
            114
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "LatentOperationTonemapReinhard"
      },
      "widgets_values": [
        1.0000000000000002
      ]
    },
    {
      "id": 52,
      "type": "KSampler",
      "pos": [
        940,
        -40
      ],
      "size": [
        290,
        262
      ],
      "flags": {},
      "order": 11,
      "mode": 0,
      "inputs": [
        {
          "name": "model",
          "type": "MODEL",
          "link": 121
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 117
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 120
        },
        {
          "name": "latent_image",
          "type": "LATENT",
          "link": 119
        }
      ],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "slot_index": 0,
          "links": [
            122
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "KSampler"
      },
      "widgets_values": [
        468254064217846,
        "randomize",
        50,
        5,
        "euler",
        "simple",
        1
      ]
    },
    {
      "id": 14,
      "type": "TextEncodeAceStepAudio",
      "pos": [
        590,
        120
      ],
      "size": [
        340,
        500
      ],
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 80
        }
      ],
      "outputs": [
        {
          "name": "CONDITIONING",
          "type": "CONDITIONING",
          "links": [
            108,
            117
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "TextEncodeAceStepAudio"
      },
      "widgets_values": [
        "anime, soft female vocals, kawaii pop, j-pop, childish, piano, guitar, synthesizer, fast, happy, cheerful, lighthearted\t\n",
        "[inst]\n\n[verse]\nふわふわ　おみみが\nゆれるよ　かぜのなか\nきらきら　あおいめ\nみつめる　せかいを\n\n[verse]\nふわふわ　しっぽは\nおおきく　ゆれるよ\nきんいろ　かみのけ\nなびくよ　かぜのなか\n\n[verse]\nコンフィーユーアイの\nまもりびと\nピンクの　セーターで\nえがおを　くれるよ\n\nあおいろ　スカートと\nくろいコート　きんのもよう\nやさしい　ひかりが\nつつむよ　フェネックガール\n\n[verse]\nふわふわ　おみみで\nきこえる　こころの　こえ\nだいすき　フェネックガール\nいつでも　そばにいるよ\n\n\n",
        0.9900000000000002
      ]
    },
    {
      "id": 17,
      "type": "EmptyAceStepLatentAudio",
      "pos": [
        180,
        50
      ],
      "size": [
        370,
        82
      ],
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "LATENT",
          "type": "LATENT",
          "links": [
            119
          ]
        }
      ],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.32",
        "Node name for S&R": "EmptyAceStepLatentAudio"
      },
      "widgets_values": [
        120,
        1
      ]
    },
    {
      "id": 59,
      "type": "SaveAudioMP3",
      "pos": [
        1260,
        -160
      ],
      "size": [
        610,
        136
      ],
      "flags": {},
      "order": 13,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 126
        }
      ],
      "outputs": [],
      "properties": {
        "cnr_id": "comfy-core",
        "ver": "0.3.34",
        "Node name for S&R": "SaveAudioMP3"
      },
      "widgets_values": [
        "audio/ComfyUI",
        "V0"
      ]
    }
  ],
  "links": [
    [
      80,
      40,
      1,
      14,
      0,
      "CLIP"
    ],
    [
      83,
      40,
      2,
      18,
      1,
      "VAE"
    ],
    [
      108,
      14,
      0,
      44,
      0,
      "CONDITIONING"
    ],
    [
      113,
      51,
      0,
      49,
      0,
      "MODEL"
    ],
    [
      114,
      50,
      0,
      49,
      1,
      "LATENT_OPERATION"
    ],
    [
      115,
      40,
      0,
      51,
      0,
      "MODEL"
    ],
    [
      117,
      14,
      0,
      52,
      1,
      "CONDITIONING"
    ],
    [
      119,
      17,
      0,
      52,
      3,
      "LATENT"
    ],
    [
      120,
      44,
      0,
      52,
      2,
      "CONDITIONING"
    ],
    [
      121,
      49,
      0,
      52,
      0,
      "MODEL"
    ],
    [
      122,
      52,
      0,
      18,
      0,
      "LATENT"
    ],
    [
      126,
      18,
      0,
      59,
      0,
      "AUDIO"
    ],
    [
      127,
      18,
      0,
      60,
      0,
      "AUDIO"
    ],
    [
      128,
      18,
      0,
      61,
      0,
      "AUDIO"
    ],
    [
      136,
      64,
      0,
      68,
      0,
      "AUDIO"
    ],
    [
      137,
      40,
      2,
      68,
      1,
      "VAE"
    ]
  ],
  "groups": [
    {
      "id": 1,
      "title": "Load model here",
      "bounding": [
        170,
        -230,
        390,
        180
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 4,
      "title": "Latent",
      "bounding": [
        170,
        -30,
        390,
        280
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 5,
      "title": "Adjust the vocal volume",
      "bounding": [
        580,
        -230,
        350,
        140
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 6,
      "title": "For repainting",
      "bounding": [
        170,
        270,
        390,
        223.60000610351562
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    },
    {
      "id": 7,
      "title": "Output",
      "bounding": [
        1250,
        -230,
        630,
        760
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    }
  ],
  "config": {},
  "extra": {
    "ds": {
      "scale": 1,
      "offset": [
        -147.02717343600432,
        384.62272311479
      ]
    },
    "frontendVersion": "1.19.9",
    "node_versions": {
      "comfy-core": "0.3.34",
      "ace-step": "06f751d65491c9077fa2bc9b06d2c6f2a90e4c56"
    },
    "VHS_latentpreview": false,
    "VHS_latentpreviewrate": 0,
    "VHS_MetadataImage": true,
    "VHS_KeepIntermediate": true
  },
  "version": 0.4
 }
--- a/comfyui/workflows/text-to-music/acestep-t2m-output.flac
+++ b/comfyui/workflows/text-to-music/acestep-t2m-output.flac
--- a/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
@@ -0,0 +1,130 @@
 {
  "last_node_id": 3,
  "last_link_id": 2,
  "nodes": [
    {
      "id": 1,
      "type": "DiffRhythmRun",
      "pos": [100, 100],
      "size": [400, 400],
      "flags": {},
      "order": 0,
      "mode": 0,
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [1, 2]
        }
      ],
      "properties": {
        "Node name for S&R": "DiffRhythmRun"
      },
      "widgets_values": [
        "cfm_full_model.pt",
        "Cinematic orchestral piece with soaring strings, powerful brass, and emotional piano melodies building to an epic crescendo",
        true,
        "euler",
        30,
        4,
        "quality",
        123,
        "randomize",
        false,
        "[-1, 20], [60, -1]"
      ],
      "title": "DiffRhythm Full-Length Text-to-Music (4m45s)"
    },
    {
      "id": 2,
      "type": "PreviewAudio",
      "pos": [600, 100],
      "size": [300, 100],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 1
        }
      ],
      "properties": {
        "Node name for S&R": "PreviewAudio"
      },
      "title": "Preview Audio"
    },
    {
      "id": 3,
      "type": "SaveAudio",
      "pos": [600, 250],
      "size": [300, 100],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 2
        }
      ],
      "properties": {
        "Node name for S&R": "SaveAudio"
      },
      "widgets_values": [
        "diffrhythm_full_output"
      ],
      "title": "Save Audio"
    }
  ],
  "links": [
    [1, 1, 0, 2, 0, "AUDIO"],
    [2, 1, 0, 3, 0, "AUDIO"]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "workflow_info": {
      "name": "DiffRhythm Full-Length Text-to-Music v1",
      "description": "Full-length music generation using DiffRhythm Full (4 minutes 45 seconds)",
      "version": "1.0.0",
      "author": "valknar@pivoine.art",
      "category": "text-to-music",
      "tags": ["diffrhythm", "music-generation", "text-to-music", "full-length", "4m45s"],
      "requirements": {
        "custom_nodes": ["ComfyUI_DiffRhythm"],
        "models": ["ASLP-lab/DiffRhythm-full", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
        "vram_min": "16GB",
        "vram_recommended": "20GB",
        "system_deps": ["espeak-ng"]
      },
      "usage": {
        "model": "cfm_full_model.pt (DiffRhythm Full - 4m45s/285s generation)",
        "style_prompt": "Detailed text description of the desired full-length music composition",
        "unload_model": "Boolean to unload model after generation (default: true)",
        "odeint_method": "ODE solver: euler, midpoint, rk4, implicit_adams (default: euler)",
        "steps": "Number of diffusion steps: 1-100 (default: 30)",
        "cfg": "Classifier-free guidance scale: 1-10 (default: 4)",
        "quality_or_speed": "Generation mode: quality or speed (default: quality for full-length)",
        "seed": "Random seed for reproducibility (default: 123)",
        "edit": "Enable segment editing mode (default: false)",
        "edit_segments": "Segments to edit when edit=true (default: [-1, 20], [60, -1])"
      },
      "performance": {
        "generation_time": "~60-90 seconds on RTX 4090",
        "vram_usage": "~16GB during generation",
        "note": "Significantly faster than real-time music generation"
      },
      "notes": [
        "This workflow uses DiffRhythm Full for 4 minute 45 second music generation",
        "Best for complete song compositions with intro, development, and outro",
        "All parameters except model and style_prompt are optional",
        "Supports complex, multi-part compositions",
        "Can optionally connect MultiLineLyricsDR node for lyrics input"
      ]
    }
  },
  "version": 0.4
 }
--- a/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
@@ -0,0 +1,164 @@
 {
  "last_node_id": 4,
  "last_link_id": 3,
  "nodes": [
    {
      "id": 1,
      "type": "LoadAudio",
      "pos": [100, 100],
      "size": [300, 100],
      "flags": {},
      "order": 0,
      "mode": 0,
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [1]
        }
      ],
      "properties": {
        "Node name for S&R": "LoadAudio"
      },
      "widgets_values": [
        "reference_audio.wav"
      ],
      "title": "Load Reference Audio"
    },
    {
      "id": 2,
      "type": "DiffRhythmRun",
      "pos": [500, 100],
      "size": [400, 450],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "style_audio_or_edit_song",
          "type": "AUDIO",
          "link": 1
        }
      ],
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [2, 3]
        }
      ],
      "properties": {
        "Node name for S&R": "DiffRhythmRun"
      },
      "widgets_values": [
        "cfm_model_v1_2.pt",
        "Energetic rock music with driving guitar riffs and powerful drums",
        true,
        "euler",
        30,
        5,
        "speed",
        456,
        "randomize",
        false,
        "[-1, 20], [60, -1]"
      ],
      "title": "DiffRhythm Reference-Based Generation"
    },
    {
      "id": 3,
      "type": "PreviewAudio",
      "pos": [1000, 100],
      "size": [300, 100],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 2
        }
      ],
      "properties": {
        "Node name for S&R": "PreviewAudio"
      },
      "title": "Preview Generated Audio"
    },
    {
      "id": 4,
      "type": "SaveAudio",
      "pos": [1000, 250],
      "size": [300, 100],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 3
        }
      ],
      "properties": {
        "Node name for S&R": "SaveAudio"
      },
      "widgets_values": [
        "diffrhythm_reference_output"
      ],
      "title": "Save Audio"
    }
  ],
  "links": [
    [1, 1, 0, 2, 0, "AUDIO"],
    [2, 2, 0, 3, 0, "AUDIO"],
    [3, 2, 0, 4, 0, "AUDIO"]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "workflow_info": {
      "name": "DiffRhythm Reference-Based Generation v1",
      "description": "Generate new music based on a reference audio file while following text prompt guidance",
      "version": "1.0.0",
      "author": "valknar@pivoine.art",
      "category": "text-to-music",
      "tags": ["diffrhythm", "music-generation", "reference-based", "style-transfer"],
      "requirements": {
        "custom_nodes": ["ComfyUI_DiffRhythm"],
        "models": ["ASLP-lab/DiffRhythm-1_2", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
        "vram_min": "14GB",
        "vram_recommended": "18GB",
        "system_deps": ["espeak-ng"]
      },
      "usage": {
        "reference_audio": "Path to reference audio file (WAV, MP3, or other supported formats)",
        "model": "cfm_model_v1_2.pt (DiffRhythm 1.2)",
        "style_prompt": "Text description guiding the style and characteristics of generated music",
        "unload_model": "Boolean to unload model after generation (default: true)",
        "odeint_method": "ODE solver: euler, midpoint, rk4, implicit_adams (default: euler)",
        "steps": "Number of diffusion steps: 1-100 (default: 30)",
        "cfg": "Classifier-free guidance scale: 1-10 (default: 5 for reference-based)",
        "quality_or_speed": "Generation mode: quality or speed (default: speed)",
        "seed": "Random seed for reproducibility (default: 456)",
        "edit": "Enable segment editing mode (default: false)",
        "edit_segments": "Segments to edit when edit=true (default: [-1, 20], [60, -1])"
      },
      "use_cases": [
        "Style transfer: Apply the style of the reference music to a new prompt",
        "Variations: Create variations of existing compositions",
        "Genre transformation: Transform music into a different genre while keeping its structure",
        "Mood adaptation: Change the mood/emotion while maintaining musical elements"
      ],
      "notes": [
        "This workflow combines reference audio with text prompt guidance",
        "The reference audio is connected to the style_audio_or_edit_song input",
        "Higher cfg values (7-10) = closer adherence to both prompt and reference",
        "Lower cfg values (2-4) = more creative interpretation",
        "Reference audio should ideally be of similar duration to the target (95s for cfm_model_v1_2.pt)",
        "Can use any format supported by ComfyUI's LoadAudio node"
      ]
    }
  },
  "version": 0.4
 }
--- a/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
@@ -0,0 +1,125 @@
 {
  "last_node_id": 3,
  "last_link_id": 2,
  "nodes": [
    {
      "id": 1,
      "type": "DiffRhythmRun",
      "pos": [100, 100],
      "size": [400, 400],
      "flags": {},
      "order": 0,
      "mode": 0,
      "outputs": [
        {
          "name": "AUDIO",
          "type": "AUDIO",
          "links": [1, 2]
        }
      ],
      "properties": {
        "Node name for S&R": "DiffRhythmRun"
      },
      "widgets_values": [
        "cfm_model_v1_2.pt",
        "Upbeat electronic dance music with energetic beats and synthesizer melodies",
        true,
        "euler",
        30,
        4,
        "speed",
        42,
        "randomize",
        false,
        "[-1, 20], [60, -1]"
      ],
      "title": "DiffRhythm Text-to-Music (95s)"
    },
    {
      "id": 2,
      "type": "PreviewAudio",
      "pos": [600, 100],
      "size": [300, 100],
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 1
        }
      ],
      "properties": {
        "Node name for S&R": "PreviewAudio"
      },
      "title": "Preview Audio"
    },
    {
      "id": 3,
      "type": "SaveAudio",
      "pos": [600, 250],
      "size": [300, 100],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [
        {
          "name": "audio",
          "type": "AUDIO",
          "link": 2
        }
      ],
      "properties": {
        "Node name for S&R": "SaveAudio"
      },
      "widgets_values": [
        "diffrhythm_output"
      ],
      "title": "Save Audio"
    }
  ],
  "links": [
    [1, 1, 0, 2, 0, "AUDIO"],
    [2, 1, 0, 3, 0, "AUDIO"]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "workflow_info": {
      "name": "DiffRhythm Simple Text-to-Music v1",
      "description": "Basic text-to-music generation using DiffRhythm 1.2 (95 seconds)",
      "version": "1.0.0",
      "author": "valknar@pivoine.art",
      "category": "text-to-music",
      "tags": ["diffrhythm", "music-generation", "text-to-music", "95s"],
      "requirements": {
        "custom_nodes": ["ComfyUI_DiffRhythm"],
        "models": ["ASLP-lab/DiffRhythm-1_2", "ASLP-lab/DiffRhythm-vae", "OpenMuQ/MuQ-MuLan-large", "OpenMuQ/MuQ-large-msd-iter", "FacebookAI/xlm-roberta-base"],
        "vram_min": "12GB",
        "vram_recommended": "16GB",
        "system_deps": ["espeak-ng"]
      },
      "usage": {
        "model": "cfm_model_v1_2.pt (DiffRhythm 1.2 - 95s generation)",
        "style_prompt": "Text description of the desired music style, mood, and instruments",
        "unload_model": "Boolean to unload model after generation (default: true)",
        "odeint_method": "ODE solver: euler, midpoint, rk4, implicit_adams (default: euler)",
        "steps": "Number of diffusion steps: 1-100 (default: 30)",
        "cfg": "Classifier-free guidance scale: 1-10 (default: 4)",
        "quality_or_speed": "Generation mode: quality or speed (default: speed)",
        "seed": "Random seed for reproducibility (default: 42)",
        "edit": "Enable segment editing mode (default: false)",
        "edit_segments": "Segments to edit when edit=true (default: [-1, 20], [60, -1])"
      },
      "notes": [
        "This workflow uses DiffRhythm 1.2 for 95-second music generation",
        "All parameters except model and style_prompt are optional",
        "Supports English and Chinese text prompts",
        "Generation time: ~30-60 seconds on RTX 4090",
        "Can optionally connect MultiLineLyricsDR node for lyrics input"
      ]
    }
  },
  "version": 0.4
 }
--- a/comfyui/workflows/text-to-music/reference_audio.wav
+++ b/comfyui/workflows/text-to-music/reference_audio.wav
--- a/models_huggingface.yaml
+++ b/models_huggingface.yaml
@@ -1,573 +1,709 @@
 # ============================================================================
 # ComfyUI Model Configuration
 # ============================================================================
 #
 # This configuration file defines all available ComfyUI models for download.
 # Models are organized by category: image, video, audio, and support models.
 #
 # Each model entry contains:
 #   - repo_id: HuggingFace repository identifier
 #   - description: Human-readable description
 #   - size_gb: Approximate size in gigabytes
 #   - essential: Whether this is an essential model (true/false)
 #   - category: Model category (image/video/audio/support)
 #
 # ============================================================================
 # Global settings
 settings:
  cache_dir: /workspace/huggingface_cache
  parallel_downloads: 1
  retry_attempts: 3
  timeout_seconds: 3600
 # Model categories
 model_categories:
  # ==========================================================================
  # IMAGE GENERATION MODELS
  # ==========================================================================
  image_models:
  - repo_id: black-forest-labs/FLUX.1-schnell
    description: FLUX.1 Schnell - Fast 4-step inference
    size_gb: 23
    essential: true
    category: image
      type: unet
    format: fp16
    vram_gb: 23
    notes: Industry-leading image generation quality
    files:
-        - source: "flux1-schnell.safetensors"
+    - source: flux1-schnell.safetensors
-          dest: "flux1-schnell.safetensors"
+      dest: unet/flux1-schnell.safetensors
  - repo_id: black-forest-labs/FLUX.1-dev
    description: FLUX.1 Dev - Balanced quality/speed
    size_gb: 23
    essential: false
    category: image
      type: unet
    format: fp16
    vram_gb: 23
    notes: Development version with enhanced features
    files:
-        - source: "flux1-dev.safetensors"
+    - source: flux1-dev.safetensors
-          dest: "flux1-dev.safetensors"
+      dest: unet/flux1-dev.safetensors
  - repo_id: runwayml/stable-diffusion-v1-5
    description: SD 1.5 - For AnimateDiff
    size_gb: 4
    essential: true
    category: image
      type: checkpoints
    format: fp16
    vram_gb: 8
    notes: Stable Diffusion 1.5 required for AnimateDiff motion modules
    files:
-        - source: "v1-5-pruned-emaonly.safetensors"
+    - source: v1-5-pruned-emaonly.safetensors
-          dest: "v1-5-pruned-emaonly.safetensors"
+      dest: checkpoints/v1-5-pruned-emaonly.safetensors
  - repo_id: stabilityai/stable-diffusion-xl-base-1.0
    description: SDXL Base 1.0 - Industry standard
    size_gb: 7
    essential: true
    category: image
      type: checkpoints
    format: fp16
    vram_gb: 12
    notes: Most widely used Stable Diffusion model
    files:
-        - source: "sd_xl_base_1.0.safetensors"
+    - source: sd_xl_base_1.0.safetensors
-          dest: "sd_xl_base_1.0.safetensors"
+      dest: checkpoints/sd_xl_base_1.0.safetensors
  - repo_id: stabilityai/stable-diffusion-xl-refiner-1.0
    description: SDXL Refiner 1.0 - Enhances base output
    size_gb: 6
    essential: false
    category: image
      type: checkpoints
    format: fp16
    vram_gb: 12
    notes: Use after SDXL base for improved details
    files:
-        - source: "sd_xl_refiner_1.0.safetensors"
+    - source: sd_xl_refiner_1.0.safetensors
-          dest: "sd_xl_refiner_1.0.safetensors"
+      dest: checkpoints/sd_xl_refiner_1.0.safetensors
  - repo_id: stabilityai/stable-diffusion-3.5-large
-      description: SD 3.5 Large - Latest Stability AI
+    description: SD 3.5 Large Complete - Checkpoint and text encoders
-      size_gb: 18
+    size_gb: 31
    essential: false
    category: image
-      type: checkpoints
+    format: mixed
      format: fp16
    vram_gb: 20
-      notes: Newest generation Stable Diffusion
+    notes: Complete SD3.5 Large model with checkpoint and all text encoders (CLIP-L,
      CLIP-G, T5-XXL)
    files:
-        - source: "sd3.5_large.safetensors"
+    - source: sd3.5_large.safetensors
-          dest: "sd3.5_large.safetensors"
+      dest: checkpoints/sd3.5_large.safetensors
-
+    - source: text_encoders/clip_l.safetensors
-  # ==========================================================================
+      dest: checkpoints/clip_l.safetensors
-  # VIDEO GENERATION MODELS
+    - source: text_encoders/clip_g.safetensors
-  # ==========================================================================
+      dest: checkpoints/clip_g.safetensors
    - source: text_encoders/t5xxl_fp16.safetensors
      dest: checkpoints/t5xxl_fp16.safetensors
  - repo_id: John6666/diving-illustrious-real-asian-v50-sdxl
    description: Diving Illustrious Real Asian v5.0 - Photorealistic Asian subjects
    size_gb: 7
    essential: false
    category: image
    format: fp16
    vram_gb: 12
    notes: SDXL fine-tune specializing in photorealistic Asian subjects with illustrious
      quality
    files:
    - source: unet/diffusion_pytorch_model.safetensors
      dest: checkpoints/diving-illustrious-real-asian-v50-sdxl.safetensors
  - repo_id: playgroundai/playground-v2.5-1024px-aesthetic
    description: Playground v2.5 - 1024px aesthetic images
    size_gb: 7
    essential: false
    category: image
    format: fp16
    vram_gb: 12
    notes: Highly aesthetic 1024x1024 outputs, outperforms SDXL and DALL-E 3 in user
      studies
    files:
    - source: playground-v2.5-1024px-aesthetic.fp16.safetensors
      dest: checkpoints/playground-v2.5-1024px-aesthetic.safetensors
  - repo_id: Lykon/dreamshaper-8
    description: DreamShaper 8 - Multi-style versatile model
    size_gb: 4
    essential: false
    category: image
    format: fp16
    vram_gb: 8
    notes: Versatile SD1.5 fine-tune balancing photorealistic and anime styles with
      strong LoRA support
    files:
    - source: unet/diffusion_pytorch_model.fp16.safetensors
      dest: checkpoints/dreamshaper-8.safetensors
  video_models:
  - repo_id: THUDM/CogVideoX-5b
    description: CogVideoX-5B - Professional text-to-video
    size_gb: 20
    essential: true
    category: video
      type: diffusion_models
    format: fp16
    vram_gb: 20
    frames: 49
    resolution: 720p
-      notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel node
+    notes: State-of-the-art text-to-video generation, auto-downloaded by DownloadAndLoadCogVideoModel
      node
    files:
-        - source: "transformer/diffusion_pytorch_model.safetensors"
+    - source: transformer/diffusion_pytorch_model-00001-of-00002.safetensors
-          dest: "cogvideox-5b-transformer.safetensors"
+      dest: diffusion_models/cogvideox-5b-transformer-00001-of-00002.safetensors
-
+    - source: transformer/diffusion_pytorch_model-00002-of-00002.safetensors
      dest: diffusion_models/cogvideox-5b-transformer-00002-of-00002.safetensors
    - source: transformer/diffusion_pytorch_model.safetensors.index.json
      dest: diffusion_models/cogvideox-5b-transformer.safetensors.index.json
  - repo_id: THUDM/CogVideoX-5b-I2V
    description: CogVideoX-5B-I2V - Image-to-video generation
    size_gb: 20
    essential: true
    category: video
      type: diffusion_models
    format: fp16
    vram_gb: 20
    frames: 49
    resolution: 720p
    notes: Image-to-video model, auto-downloaded by DownloadAndLoadCogVideoModel node
    files:
-        - source: "transformer/diffusion_pytorch_model.safetensors"
+    - source: transformer/diffusion_pytorch_model-00001-of-00003.safetensors
-          dest: "cogvideox-5b-i2v-transformer.safetensors"
+      dest: diffusion_models/cogvideox-5b-i2v-transformer-00001-of-00003.safetensors
-
+    - source: transformer/diffusion_pytorch_model-00002-of-00003.safetensors
      dest: diffusion_models/cogvideox-5b-i2v-transformer-00002-of-00003.safetensors
    - source: transformer/diffusion_pytorch_model-00003-of-00003.safetensors
      dest: diffusion_models/cogvideox-5b-i2v-transformer-00003-of-00003.safetensors
    - source: transformer/diffusion_pytorch_model.safetensors.index.json
      dest: diffusion_models/cogvideox-5b-i2v-transformer.safetensors.index.json
  - repo_id: stabilityai/stable-video-diffusion-img2vid
    description: SVD - 14 frame image-to-video
    size_gb: 8
    essential: true
    category: video
      type: checkpoints
    format: fp16
    vram_gb: 20
    frames: 14
    resolution: 576x1024
    notes: Convert images to short video clips
    files:
-        - source: "svd.safetensors"
+    - source: svd.safetensors
-          dest: "svd.safetensors"
+      dest: checkpoints/svd.safetensors
  - repo_id: stabilityai/stable-video-diffusion-img2vid-xt
    description: SVD-XT - 25 frame image-to-video
    size_gb: 8
    essential: false
    category: video
      type: checkpoints
    format: fp16
    vram_gb: 20
    frames: 25
    resolution: 576x1024
    notes: Extended frame count version
    files:
-        - source: "svd_xt.safetensors"
+    - source: svd_xt.safetensors
-          dest: "svd_xt.safetensors"
+      dest: checkpoints/svd_xt.safetensors
-
+  - repo_id: Comfy-Org/HunyuanVideo_repackaged
-  # ==========================================================================
+    description: HunyuanVideo Complete - 720p T2V/I2V models with VAE and encoders
-  # AUDIO GENERATION MODELS
+    size_gb: 51
-  # ==========================================================================
+    essential: true
    category: video
    format: bf16
    vram_gb: 24
    frames: 129
    resolution: 720p
    notes: Complete HunyuanVideo family - T2V, I2V v1/v2, 3D VAE, LLaVA LLaMA3 text/vision
      encoders
    files:
    - source: split_files/diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors
      dest: diffusion_models/hunyuan_video_t2v_720p_bf16.safetensors
    - source: split_files/diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors
      dest: diffusion_models/hunyuan_video_image_to_video_720p_bf16.safetensors
    - source: split_files/diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors
      dest: diffusion_models/hunyuan_video_v2_replace_image_to_video_720p_bf16.safetensors
    - source: split_files/vae/hunyuan_video_vae_bf16.safetensors
      dest: diffusion_models/hunyuan_video_vae_bf16.safetensors
    - source: split_files/text_encoders/llava_llama3_fp8_scaled.safetensors
      dest: diffusion_models/llava_llama3_fp8_scaled.safetensors
    - source: split_files/clip_vision/llava_llama3_vision.safetensors
      dest: diffusion_models/llava_llama3_vision.safetensors
  - repo_id: Comfy-Org/HunyuanVideo_1.5_repackaged
    description: HunyuanVideo 1.5 Complete - 720p/1080p T2V/SR with encoders
    size_gb: 51.5
    essential: true
    category: video
    format: fp16
    vram_gb: 24
    frames: 129-257
    resolution: 720p-1080p
    notes: Complete HunyuanVideo 1.5 - T2V 720p, SR 1080p, VAE, Qwen 2.5 VL, ByT5
      GlyphXL encoders
    files:
    - source: hunyuanvideo1.5_720p_t2v_fp16.safetensors
      dest: diffusion_models/hunyuanvideo1.5_720p_t2v_fp16.safetensors
    - source: hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors
      dest: diffusion_models/hunyuanvideo1.5_1080p_sr_distilled_fp16.safetensors
    - source: hunyuanvideo15_vae_fp16.safetensors
      dest: diffusion_models/hunyuanvideo15_vae_fp16.safetensors
    - source: qwen_2.5_vl_7b_fp8_scaled.safetensors
      dest: diffusion_models/qwen_2.5_vl_7b_fp8_scaled.safetensors
    - source: byt5_small_glyphxl_fp16.safetensors
      dest: diffusion_models/byt5_small_glyphxl_fp16.safetensors
  - repo_id: Comfy-Org/Wan_2.2_ComfyUI_Repackaged
    description: Wan2.2 Complete - All video models, VAEs, and LoRAs
    size_gb: 220
    essential: true
    category: video
    format: mixed
    vram_gb: 24
    frames: 81
    resolution: 640x640
    notes: Complete Wan2.2 model family - TI2V 5B, T2V 14B, I2V 14B, Animate, S2V,
      Fun Inpaint/Control/Camera, VAEs, CLIP Vision H, Wav2Vec2, and LoRA accelerators
    files:
    - source: wan2.2_ti2v_5B_fp16.safetensors
      dest: diffusion_models/wan2.2_ti2v_5B_fp16.safetensors
    - source: wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_t2v_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_t2v_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_i2v_high_noise_14B_fp16.safetensors
      dest: diffusion_models/wan2.2_i2v_high_noise_14B_fp16.safetensors
    - source: wan2.2_i2v_low_noise_14B_fp16.safetensors
      dest: diffusion_models/wan2.2_i2v_low_noise_14B_fp16.safetensors
    - source: wan2.2_animate_14B_bf16.safetensors
      dest: diffusion_models/wan2.2_animate_14B_bf16.safetensors
    - source: wan2.2_s2v_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_s2v_14B_fp8_scaled.safetensors
    - source: wan2.2_s2v_14B_bf16.safetensors
      dest: diffusion_models/wan2.2_s2v_14B_bf16.safetensors
    - source: wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_inpaint_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_inpaint_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_control_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_control_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_camera_high_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors
      dest: diffusion_models/wan2.2_fun_camera_low_noise_14B_fp8_scaled.safetensors
    - source: wan2.2_vae.safetensors
      dest: diffusion_models/wan2.2_vae.safetensors
    - source: wan_2.1_vae.safetensors
      dest: diffusion_models/wan_2.1_vae.safetensors
    - source: clip_vision_h.safetensors
      dest: diffusion_models/clip_vision_h.safetensors
    - source: wav2vec2_large_english_fp16.safetensors
      dest: diffusion_models/wav2vec2_large_english_fp16.safetensors
    - source: lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors
      dest: diffusion_models/lightx2v_I2V_14B_480p_cfg_step_distill_rank64_bf16.safetensors
    - source: wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors
      dest: diffusion_models/wan2.2_t2v_lightx2v_4steps_lora_v1.1_high_noise.safetensors
    - source: wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors
      dest: diffusion_models/wan2.2_i2v_lightx2v_4steps_lora_v1_high_noise.safetensors
    - source: wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors
      dest: diffusion_models/wan2.2_i2v_lightx2v_4steps_lora_v1_low_noise.safetensors
  audio_models:
  - repo_id: facebook/musicgen-small
    description: MusicGen Small - Fast generation
    size_gb: 3
    essential: false
    category: audio
      type: musicgen
    format: fp32
    vram_gb: 4
    duration_seconds: 30
    notes: Fastest music generation, lower quality
    files:
-        - source: "pytorch_model.bin"
+    - source: pytorch_model.bin
-          dest: "musicgen-small-pytorch_model.bin"
+      dest: musicgen/musicgen-small-pytorch_model.bin
  - repo_id: facebook/musicgen-medium
    description: MusicGen Medium - Balanced quality
    size_gb: 11
    essential: true
    category: audio
      type: musicgen
    format: fp32
    vram_gb: 8
    duration_seconds: 30
    notes: Best balance of speed and quality
    files:
-        - source: "pytorch_model.bin"
+    - source: pytorch_model.bin
-          dest: "musicgen-medium-pytorch_model.bin"
+      dest: musicgen/musicgen-medium-pytorch_model.bin
  - repo_id: facebook/musicgen-large
    description: MusicGen Large - Highest quality
    size_gb: 22
    essential: false
    category: audio
      type: musicgen
    format: fp32
    vram_gb: 16
    duration_seconds: 30
    notes: Best quality, slower generation
    files:
-        - source: "pytorch_model-00001-of-00002.bin"
+    - source: pytorch_model-00001-of-00002.bin
-          dest: "musicgen-large-pytorch_model-00001-of-00002.bin"
+      dest: musicgen/musicgen-large-pytorch_model-00001-of-00002.bin
-        - source: "pytorch_model-00002-of-00002.bin"
+    - source: pytorch_model-00002-of-00002.bin
-          dest: "musicgen-large-pytorch_model-00002-of-00002.bin"
+      dest: musicgen/musicgen-large-pytorch_model-00002-of-00002.bin
-        - source: "pytorch_model.bin.index.json"
+    - source: pytorch_model.bin.index.json
-          dest: "musicgen-large-pytorch_model.bin.index.json"
+      dest: musicgen/musicgen-large-pytorch_model.bin.index.json
-
+  - repo_id: Comfy-Org/ACE-Step_ComfyUI_repackaged
-  # ==========================================================================
+    description: ACE Step v1 3.5B - Fast coherent music generation with 19-language
-  # SUPPORT MODELS (CLIP, IP-Adapter, etc.)
+      support
-  # ==========================================================================
+    size_gb: 7.7
    essential: true
    category: audio
    format: safetensors
    vram_gb: 16
    duration_seconds: 240
    notes: 15x faster than LLM baselines, superior structural coherence, voice cloning,
      19-language lyrics
    files:
    - source: all_in_one/ace_step_v1_3.5b.safetensors
      dest: checkpoints/ace_step_v1_3.5b.safetensors
  - repo_id: ACE-Step/ACE-Step-v1-chinese-rap-LoRA
    description: ACE Step Chinese RAP LoRA - Enhanced Chinese pronunciation and hip-hop
      genre
    size_gb: 0.3
    essential: false
    category: audio
    format: safetensors
    notes: Improves Chinese pronunciation accuracy and hip-hop/electronic genre adherence
    files:
    - source: pytorch_lora_weights.safetensors
      dest: loras/ace-step-chinese-rap-lora.safetensors
  support_models:
  - repo_id: openai/clip-vit-large-patch14
    description: CLIP H - For SD 1.5 IP-Adapter
    size_gb: 2
    essential: true
    category: support
      type: clip_vision
    format: fp32
    vram_gb: 2
    notes: Text-image understanding model for IP-Adapter
    files:
-        - source: "model.safetensors"
+    - source: model.safetensors
-          dest: "CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors"
+      dest: clip_vision/CLIP-ViT-H-14-laion2B-s32B-b79K.safetensors
  - repo_id: laion/CLIP-ViT-bigG-14-laion2B-39B-b160k
    description: CLIP G - For SDXL IP-Adapter
    size_gb: 7
    essential: true
    category: support
      type: clip_vision
    format: fp32
    vram_gb: 4
    notes: Larger CLIP model for SDXL IP-Adapter
    files:
-        - source: "open_clip_model.safetensors"
+    - source: open_clip_model.safetensors
-          dest: "CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors"
+      dest: clip_vision/CLIP-ViT-bigG-14-laion2B-39B-b160k.safetensors
  - repo_id: google/siglip-so400m-patch14-384
    description: SigLIP - For FLUX models
    size_gb: 2
    essential: true
    category: support
      type: clip_vision
    format: fp32
    vram_gb: 2
    notes: Advanced image-text alignment
    files:
-        - source: "model.safetensors"
+    - source: model.safetensors
-          dest: "siglip-so400m-patch14-384.safetensors"
+      dest: clip_vision/siglip-so400m-patch14-384.safetensors
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L and T5-XXL - For FLUX text encoding
      size_gb: 10
      essential: true
      category: support
      type: clip
      format: fp16
      vram_gb: 4
      notes: CLIP text encoders required for FLUX models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"
        - source: "text_encoders/t5xxl_fp16.safetensors"
          dest: "t5xxl_fp16.safetensors"
  - repo_id: black-forest-labs/FLUX.1-schnell
    description: FLUX VAE - Autoencoder for FLUX models
    size_gb: 0.5
    essential: true
    category: support
      type: vae
    format: safetensors
    vram_gb: 1
    notes: VAE autoencoder required for FLUX image decoding
    files:
-        - source: "ae.safetensors"
+    - source: ae.safetensors
-          dest: "ae.safetensors"
+      dest: vae/ae.safetensors
  - repo_id: ai-forever/Real-ESRGAN
    description: RealESRGAN x2 - 2x upscaling model
    size_gb: 0.06
    essential: true
    category: support
      type: upscale_models
    format: pth
    vram_gb: 2
    notes: Fast 2x upscaling model for general purpose enhancement
    files:
-        - source: "RealESRGAN_x2.pth"
+    - source: RealESRGAN_x2.pth
-          dest: "RealESRGAN_x2.pth"
+      dest: upscale_models/RealESRGAN_x2.pth
  - repo_id: ai-forever/Real-ESRGAN
    description: RealESRGAN x4 - 4x upscaling model
    size_gb: 0.06
    essential: true
    category: support
      type: upscale_models
    format: pth
    vram_gb: 4
    notes: High-quality 4x upscaling model for detail enhancement
    files:
-        - source: "RealESRGAN_x4.pth"
+    - source: RealESRGAN_x4.pth
-          dest: "RealESRGAN_x4.pth"
+      dest: upscale_models/RealESRGAN_x4.pth
-
+  - repo_id: Comfy-Org/Wan_2.1_ComfyUI_repackaged
-    - repo_id: stabilityai/stable-diffusion-3.5-large
+    description: UMT5-XXL FP8 - Text encoder for all Wan2.2 models
-      description: T5-XXL FP16 - For CogVideoX text encoding
+    size_gb: 10
      size_gb: 9
    essential: true
    category: support
-      type: text_encoders
+    format: fp8_scaled
-      format: fp16
+    vram_gb: 5
-      vram_gb: 4
+    notes: Shared text encoder for all Wan2.2 models (5B and 14B), FP8 quantized
      notes: T5 text encoder required for CogVideoX models
    files:
-        - source: "text_encoders/t5xxl_fp16.safetensors"
+    - source: umt5_xxl_fp8_e4m3fn_scaled.safetensors
-          dest: "t5xxl_fp16.safetensors"
+      dest: text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-L - For CogVideoX and SD3
      size_gb: 1
      essential: true
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 1
      notes: CLIP-L text encoder for CogVideoX and SD3 models
      files:
        - source: "text_encoders/clip_l.safetensors"
          dest: "clip_l.safetensors"
    - repo_id: stabilityai/stable-diffusion-3.5-large
      description: CLIP-G - For SD3 models
      size_gb: 3
      essential: false
      category: support
      type: text_encoders
      format: fp32
      vram_gb: 2
      notes: CLIP-G text encoder for SD3 models
      files:
        - source: "text_encoders/clip_g.safetensors"
          dest: "clip_g.safetensors"
  # ==========================================================================
  # ANIMATEDIFF MODELS
  # ==========================================================================
  animatediff_models:
  - repo_id: guoyww/animatediff
    description: AnimateDiff Motion Modules
    size_gb: 2
    essential: true
    category: animatediff
      type: animatediff_models
    filename: mm_sd_v15
    format: safetensors
    vram_gb: 4
    notes: Motion modules for AnimateDiff text-to-video
    files:
-        - source: "mm_sd_v15_v2.ckpt"
+    - source: mm_sd_v15_v2.ckpt
-          dest: "mm_sd_v15_v2.ckpt"
+      dest: animatediff_models/mm_sd_v15_v2.ckpt
  # ==========================================================================
  # CONTROLNET MODELS
  # ==========================================================================
  controlnet_models:
  - repo_id: lllyasviel/control_v11p_sd15_canny
    description: ControlNet Canny - Edge detection control for SD 1.5
    size_gb: 1.5
    essential: false
    category: controlnet
      type: controlnet
    format: safetensors
    vram_gb: 2
    notes: Precise edge-based composition control
    files:
-        - source: "diffusion_pytorch_model.safetensors"
+    - source: diffusion_pytorch_model.safetensors
-          dest: "control_v11p_sd15_canny.safetensors"
+      dest: controlnet/control_v11p_sd15_canny.safetensors
  - repo_id: lllyasviel/control_v11f1p_sd15_depth
    description: ControlNet Depth - Depth map control for SD 1.5
    size_gb: 1.5
    essential: false
    category: controlnet
      type: controlnet
    format: safetensors
    vram_gb: 2
    notes: Depth-based spatial control
    files:
-        - source: "diffusion_pytorch_model.safetensors"
+    - source: diffusion_pytorch_model.safetensors
-          dest: "control_v11p_sd15_depth.safetensors"
+      dest: controlnet/control_v11p_sd15_depth.safetensors
  - repo_id: diffusers/controlnet-canny-sdxl-1.0
    description: ControlNet Canny SDXL - Edge detection for SDXL
    size_gb: 2.5
    essential: false
    category: controlnet
      type: controlnet
    format: safetensors
    vram_gb: 3
    notes: Canny edge control for SDXL models
    files:
-        - source: "diffusion_pytorch_model.safetensors"
+    - source: diffusion_pytorch_model.safetensors
-          dest: "controlnet-canny-sdxl-1.0.safetensors"
+      dest: controlnet/controlnet-canny-sdxl-1.0.safetensors
  - repo_id: diffusers/controlnet-depth-sdxl-1.0
    description: ControlNet Depth SDXL - Depth map for SDXL
    size_gb: 2.5
    essential: false
    category: controlnet
      type: controlnet
    format: safetensors
    vram_gb: 3
    notes: Depth control for SDXL models
    files:
-        - source: "diffusion_pytorch_model.safetensors"
+    - source: diffusion_pytorch_model.safetensors
-          dest: "controlnet-depth-sdxl-1.0.safetensors"
+      dest: controlnet/controlnet-depth-sdxl-1.0.safetensors
  # ==========================================================================
  # IP-ADAPTER MODELS
  # ==========================================================================
  ipadapter_models:
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Base - Style & Composition
    size_gb: 1.3
    essential: true
    category: ipadapter
      type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Basic IP-Adapter for SDXL
    files:
-        - source: "sdxl_models/ip-adapter_sdxl.safetensors"
+    - source: sdxl_models/ip-adapter_sdxl.safetensors
-          dest: "ip-adapter_sdxl.safetensors"
+      dest: ipadapter/ip-adapter_sdxl.safetensors
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL VIT-H - For CLIP-ViT-H
    size_gb: 0.9
    essential: true
    category: ipadapter
      type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: IP-Adapter for SDXL with VIT-H CLIP vision model
    files:
-        - source: "sdxl_models/ip-adapter_sdxl_vit-h.safetensors"
+    - source: sdxl_models/ip-adapter_sdxl_vit-h.safetensors
-          dest: "ip-adapter_sdxl_vit-h.safetensors"
+      dest: ipadapter/ip-adapter_sdxl_vit-h.safetensors
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Plus - High Strength Composition
    size_gb: 0.9
    essential: false
    category: ipadapter
      type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Enhanced composition control with higher strength
    files:
-        - source: "sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors"
+    - source: sdxl_models/ip-adapter-plus_sdxl_vit-h.safetensors
-          dest: "ip-adapter-plus_sdxl_vit-h.safetensors"
+      dest: ipadapter/ip-adapter-plus_sdxl_vit-h.safetensors
  - repo_id: h94/IP-Adapter
    description: IP-Adapter SDXL Plus Face - Face-focused generation
    size_gb: 0.5
    essential: false
    category: ipadapter
      type: ipadapter
    format: safetensors
    vram_gb: 4
    notes: Specialized for face transfer and portrait generation
    files:
-        - source: "sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors"
+    - source: sdxl_models/ip-adapter-plus-face_sdxl_vit-h.safetensors
-          dest: "ip-adapter-plus-face_sdxl_vit-h.safetensors"
+      dest: ipadapter/ip-adapter-plus-face_sdxl_vit-h.safetensors
-
+  diffrhythm_models:
-# ============================================================================
+  - repo_id: ASLP-lab/DiffRhythm-1_2
-# STORAGE & VRAM SUMMARIES
+    description: DiffRhythm 1.2 - 95 second generation model
-# ============================================================================
+    size_gb: 2
-
+    essential: true
    category: diffrhythm
    format: pt
    vram_gb: 12
    duration_seconds: 95
    notes: Latest 95-second generation model
    files:
    - source: cfm_model.pt
      dest: TTS/DiffRhythm/cfm_model_v1_2.pt
  - repo_id: ASLP-lab/DiffRhythm-full
    description: DiffRhythm Full - 4m45s full-length generation
    size_gb: 2
    essential: false
    category: diffrhythm
    format: pt
    vram_gb: 16
    duration_seconds: 285
    notes: Full-length 4 minute 45 second music generation
    files:
    - source: cfm_model.pt
      dest: TTS/DiffRhythm/cfm_full_model.pt
  - repo_id: ASLP-lab/DiffRhythm-base
    description: DiffRhythm Base - 95 second base model
    size_gb: 2
    essential: false
    category: diffrhythm
    format: pt
    vram_gb: 12
    duration_seconds: 95
    notes: Base 95-second model
    files:
    - source: cfm_model.pt
      dest: TTS/DiffRhythm/cfm_model.pt
  - repo_id: ASLP-lab/DiffRhythm-vae
    description: DiffRhythm VAE - Variational autoencoder
    size_gb: 1
    essential: true
    category: diffrhythm
    format: pt
    vram_gb: 2
    notes: VAE component fine-tuned from Stable Audio Open (Stability AI Community
      License)
    files:
    - source: vae_model.pt
      dest: TTS/DiffRhythm/vae_model.pt
  - repo_id: OpenMuQ/MuQ-MuLan-large
    description: MuQ-MuLan-large - Music-text joint embedding (~700M parameters)
    size_gb: 3
    essential: true
    category: diffrhythm
    format: bin
    vram_gb: 4
    notes: Music-text joint embedding for semantic understanding (English/Chinese)
    files:
    - source: config.json
      dest: TTS/DiffRhythm/MuQ-MuLan-large/config.json
    - source: pytorch_model.bin
      dest: TTS/DiffRhythm/MuQ-MuLan-large/pytorch_model.bin
  - repo_id: OpenMuQ/MuQ-large-msd-iter
    description: MuQ-large-msd-iter - Music representation learning (~300M parameters)
    size_gb: 1.2
    essential: true
    category: diffrhythm
    format: safetensors
    vram_gb: 2
    notes: Music representation model trained on Million Song Dataset
    files:
    - source: config.json
      dest: TTS/DiffRhythm/MuQ-large-msd-iter/config.json
    - source: model.safetensors
      dest: TTS/DiffRhythm/MuQ-large-msd-iter/model.safetensors
  - repo_id: FacebookAI/xlm-roberta-base
    description: XLM-RoBERTa Base - Multilingual text encoder (100 languages, 0.3B
      params)
    size_gb: 1.1
    essential: true
    category: diffrhythm
    format: safetensors
    vram_gb: 1
    notes: Multilingual text encoding for 100 languages
    files:
    - source: config.json
      dest: TTS/DiffRhythm/xlm-roberta-base/config.json
    - source: model.safetensors
      dest: TTS/DiffRhythm/xlm-roberta-base/model.safetensors
    - source: sentencepiece.bpe.model
      dest: TTS/DiffRhythm/xlm-roberta-base/sentencepiece.bpe.model
    - source: tokenizer.json
      dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer.json
    - source: tokenizer_config.json
      dest: TTS/DiffRhythm/xlm-roberta-base/tokenizer_config.json
 storage_requirements:
  essential_only:
-    image: 30      # FLUX Schnell + SDXL Base
+    image: 30
-    video: 28      # CogVideoX + SVD
+    video: 28
-    audio: 11      # MusicGen Medium
+    audio: 11
-    support: 11    # All 3 CLIP models
+    support: 11
-    total: 80      # Total essential storage
+    diffrhythm: 10
-
+    total: 90
  all_models:
-    image: 54      # All image models
+    image: 54
-    video: 36      # All video models
+    video: 36
-    audio: 36      # All audio models
+    audio: 36
-    support: 11    # All support models
+    support: 11
-    total: 137     # Total with optional models
+    diffrhythm: 12
-
+    total: 149
 vram_requirements:
  # For 24GB GPU (RTX 4090)
  simultaneous_loadable:
  - name: Image Focus - FLUX FP16
-      models: [FLUX.1 Schnell]
+    models:
    - FLUX.1 Schnell
    vram_used: 23
    remaining: 1
  - name: Image Focus - FLUX FP8 + SDXL
-      models: [FLUX.1 Schnell FP8, SDXL Base]
+    models:
    - FLUX.1 Schnell FP8
    - SDXL Base
    vram_used: 24
    remaining: 0
  - name: Video Generation
-      models: [CogVideoX-5B optimized, SDXL]
+    models:
    - CogVideoX-5B optimized
    - SDXL
    vram_used: 24
    remaining: 0
  - name: Multi-Modal
-      models: [SDXL, MusicGen Medium]
+    models:
    - SDXL
    - MusicGen Medium
    vram_used: 20
    remaining: 4
 # ============================================================================
 # INSTALLATION PROFILES
 # ============================================================================
 installation_profiles:
  minimal:
    description: Minimal setup for testing
-    categories: [support_models]
+    categories:
    - support_models
    storage_gb: 11
    estimated_time: 5-10 minutes
  essential:
    description: Essential models only (~80GB)
-    categories: [image_models, video_models, audio_models, support_models]
+    categories:
    - image_models
    - video_models
    - audio_models
    - support_models
    essential_only: true
    storage_gb: 80
    estimated_time: 1-2 hours
  image_focused:
    description: All image generation models
-    categories: [image_models, support_models]
+    categories:
    - image_models
    - support_models
    storage_gb: 65
    estimated_time: 45-90 minutes
  video_focused:
    description: All video generation models
-    categories: [video_models, image_models, support_models]
+    categories:
    - video_models
    - image_models
    - support_models
    essential_only: true
    storage_gb: 69
    estimated_time: 1-2 hours
  complete:
    description: All models (including optional)
-    categories: [image_models, video_models, audio_models, support_models]
+    categories:
    - image_models
    - video_models
    - audio_models
    - support_models
    storage_gb: 137
    estimated_time: 2-4 hours
 # ============================================================================
 # METADATA
 # ============================================================================
 metadata:
  version: 1.0.0
  last_updated: 2025-11-21
--- a/models_huggingface_vllm.yaml
+++ b/models_huggingface_vllm.yaml
@@ -0,0 +1,126 @@
 # ============================================================================
 # vLLM Model Configuration
 # ============================================================================
 #
 # This configuration file defines all available vLLM models for download.
 # Models are organized by category: text generation and text embeddings.
 #
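 # Each model entry contains:
 #   - repo_id: HuggingFace repository identifier
 #   - description: Human-readable description
 #   - size_gb: Approximate size in gigabytes
 #   - essential: Whether this is an essential model (true/false)
 #   - category: Model category (text_generation/embedding)
 #   - type: Serving type (vllm/vllm_embedding)
 #   - format: Weight file format (e.g. safetensors)
 #   - vram_gb: Approximate VRAM needed in gigabytes
 #   - notes: Free-form remarks about the model
 #   - files: List of source/dest file name pairs for the download
 #
 # Text generation entries additionally set context_length; embedding
 # entries additionally set embedding_dimensions and max_tokens.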
 #
 # ============================================================================
 # Global download settings shared by every model entry below
 settings:
  # Directory the downloaded models are cached in
  cache_dir: '/workspace/huggingface_cache'
  # Number of downloads allowed to run at the same time
  parallel_downloads: 1
  # Retry budget per download
  retry_attempts: 3
  # Per-download timeout, in seconds
  timeout_seconds: 3600
 # Model categories: one list of model entries per category key
 model_categories:
  # ==========================================================================
  # TEXT GENERATION MODELS (vLLM)
  # ==========================================================================
  text_generation_models:
    # NOTE(review): this repo appears to publish sharded weights
    # (model-0000X-of-0000N.safetensors plus an index file) — confirm that a
    # single model.safetensors exists before relying on the files list below.
    - repo_id: Qwen/Qwen2.5-7B-Instruct
      description: Qwen 2.5 7B Instruct - Advanced multilingual reasoning
      size_gb: 14
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 14
      context_length: 32768
      notes: Latest Qwen 2.5 model with enhanced reasoning capabilities
      files:
        - source: 'model.safetensors'
          dest: 'model.safetensors'
    # NOTE(review): same concern as above — the weights of this repo appear to
    # be sharded; verify the source file name.
    - repo_id: meta-llama/Llama-3.1-8B-Instruct
      description: Llama 3.1 8B Instruct - Meta's latest instruction-tuned model
      size_gb: 17
      essential: true
      category: text_generation
      type: vllm
      format: safetensors
      vram_gb: 17
      # 131072 tokens = the "128K" context mentioned in the notes
      context_length: 131072
      notes: Extended 128K context length, excellent for long-form tasks
      files:
        - source: 'model.safetensors'
          dest: 'model.safetensors'
  # ==========================================================================
  # TEXT EMBEDDING MODELS (vLLM)
  # ==========================================================================
  embedding_models:
    - repo_id: BAAI/bge-large-en-v1.5
      description: BGE Large English v1.5 - High-quality embeddings for RAG
      size_gb: 1.3
      essential: true
      category: embedding
      type: vllm_embedding
      format: safetensors
      vram_gb: 3
      # Length of each embedding vector produced by the model
      embedding_dimensions: 1024
      # Maximum input length, in tokens
      max_tokens: 512
      notes: Top-tier MTEB scores, excellent for semantic search and RAG applications
      files:
        - source: 'model.safetensors'
          dest: 'model.safetensors'
 # ============================================================================
 # STORAGE & VRAM SUMMARIES
 # ============================================================================
 # Disk usage per category; values are the sums of the size_gb fields above
 storage_requirements:
  # Qwen 2.5 7B + Llama 3.1 8B
  text_generation: 31
  # BGE Large
  embedding: 1.3
  # Total essential storage
  total: 32.3
 vram_requirements:
  # For 24GB GPU (RTX 4090): vram_used + remaining adds up to 24 in each entry
  simultaneous_loadable:
    - name: Qwen 2.5 7B Only
      models:
        - Qwen 2.5 7B Instruct
      vram_used: 14
      remaining: 10
    - name: Llama 3.1 8B Only
      models:
        - Llama 3.1 8B Instruct
      vram_used: 17
      remaining: 7
    - name: BGE Large Only
      models:
        - BGE Large
      vram_used: 3
      remaining: 21
    - name: Qwen + BGE Embedding
      models:
        - Qwen 2.5 7B
        - BGE Large
      vram_used: 17
      remaining: 7
    - name: Llama + BGE Embedding
      models:
        - Llama 3.1 8B
        - BGE Large
      vram_used: 20
      remaining: 4
 # ============================================================================
 # METADATA
 # ============================================================================
 # File-level metadata describing this configuration
 metadata:
  version: 1.0.0
  # Quoted so that YAML 1.1 loaders keep the value as a string instead of
  # converting it to a date object
  last_updated: '2025-11-25'
  # Minimum tool versions this configuration was written for
  compatible_with:
    - vLLM >= 0.6.0
    - Python >= 3.10
    - HuggingFace Hub >= 0.20.0
  maintainer: Valknar
  # NOTE(review): "yourusername" looks like a template placeholder — confirm
  # the real repository URL
  repository: https://github.com/yourusername/runpod
--- a/supervisord.conf
+++ b/supervisord.conf
@@ -73,6 +73,23 @@ environment=HF_HOME="../huggingface_cache",HF_TOKEN="%(ENV_HF_TOKEN)s"
 priority=201
 stopwaitsecs=30
 # vLLM BGE Embedding Server (Port 8002)
 # Runs vllm/server_embedding.py with the interpreter from the vLLM virtualenv.
 [program:vllm-embedding]
 command=vllm/venv/bin/python vllm/server_embedding.py
 # Relative paths in this section are resolved against this working directory
 directory=.
 # Not launched when supervisord starts; has to be started explicitly
 autostart=false
 # Restart the process when it exits, giving up after 3 failed start attempts
 autorestart=true
 startretries=3
 # Separate stderr/stdout logs, each rotated at 50MB with 10 backups kept
 stderr_logfile=logs/vllm-embedding.err.log
 stdout_logfile=logs/vllm-embedding.out.log
 stdout_logfile_maxbytes=50MB
 stdout_logfile_backups=10
 stderr_logfile_maxbytes=50MB
 stderr_logfile_backups=10
 # HF_TOKEN is forwarded from supervisord's own environment.
 # NOTE(review): HF_HOME is a relative path here, while server_embedding.py
 # passes the absolute /workspace/huggingface_cache as download_dir — confirm
 # both resolve to the same directory.
 environment=HF_HOME="../huggingface_cache",HF_TOKEN="%(ENV_HF_TOKEN)s"
 # Start ordering relative to the other programs (lower values start first)
 priority=202
 # Seconds to wait after the stop signal before the process is killed
 stopwaitsecs=30
 # ComfyUI WebDAV Sync Service
 [program:webdav-sync]
 command=webdav-sync/venv/bin/python webdav-sync/webdav_sync.py
@@ -90,6 +107,10 @@ environment=WEBDAV_URL="%(ENV_WEBDAV_URL)s",WEBDAV_USERNAME="%(ENV_WEBDAV_USERNA
 priority=150
 stopwaitsecs=10
-[group:ai-services]
+[group:comfyui-services]
-programs=comfyui,vllm-qwen,vllm-llama,webdav-sync
+programs=comfyui,webdav-sync
-priority=999
+priority=100
 [group:vllm-services]
 programs=vllm-qwen,vllm-llama,vllm-embedding
 priority=200
--- a/vllm/server_embedding.py
+++ b/vllm/server_embedding.py
@@ -0,0 +1,201 @@
 #!/usr/bin/env python3
 """
 vLLM Embedding Server for BAAI/bge-large-en-v1.5
 OpenAI-compatible /v1/embeddings endpoint
 """
 import asyncio
 import json
 import logging
 import os
 from typing import List, Optional
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel, Field
 from vllm import AsyncLLMEngine, AsyncEngineArgs
 from vllm.utils import random_uuid
 # Logging: timestamped INFO-level records for the whole process
 logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
 )
 logger = logging.getLogger(__name__)
 # ASGI application that the route decorators below attach to
 app = FastAPI(version="1.0.0", title="vLLM Embedding Server")
 # Shared engine handle; stays None until the startup hook assigns it
 engine: Optional[AsyncLLMEngine] = None
 # HuggingFace repo id served by this dedicated BGE embedding server
 model_name: str = "BAAI/bge-large-en-v1.5"
 # Dedicated port for embeddings (read by the __main__ block)
 port = 8002
 # Request/Response models
 class EmbeddingRequest(BaseModel):
    """Request body accepted by POST /v1/embeddings (OpenAI-style schema)."""
    # Model id supplied by the client; defaults to the served BGE model's short name
    model: str = Field("bge-large-en-v1.5")
    # Required: a single string or a list of strings
    input: str | List[str] = Field(..., description="Text input(s) to embed")
    # Requested wire format of the returned vectors
    encoding_format: str = Field("float", description="float or base64")
    # Optional caller-supplied user identifier
    user: Optional[str] = Field(default=None)
@app.on_event("startup")
async def startup_event():
    """Build the module-level vLLM engine when the application starts.

    Assigns the global ``engine``; until this hook has run, the request
    handlers see ``engine`` as ``None``.
    """
    global engine

    logger.info(f"Initializing vLLM embedding engine with model: {model_name}")

    # Engine options collected in one place, then expanded into AsyncEngineArgs
    engine_options = dict(
        model=model_name,
        tensor_parallel_size=1,  # Single GPU
        gpu_memory_utilization=0.50,  # Conservative for embedding model
        dtype="auto",  # Auto-detect dtype
        download_dir="/workspace/huggingface_cache",  # Large disk
        trust_remote_code=True,  # Some embedding models require this
        enforce_eager=True,  # Eager execution only (no CUDA graph capture)
        max_model_len=512,  # BGE max token length
        # task="embed" is intentionally not passed here (vLLM 0.6.3+ embedding mode)
    )

    engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_options))
    logger.info("vLLM embedding engine initialized successfully")
@app.get("/")
async def root():
    """Basic health check: always answers "ok" together with the served model id."""
    payload = {"status": "ok", "model": model_name, "task": "embedding"}
    return payload
@app.get("/health")
async def health():
    """Detailed health check that also reports whether the engine exists yet."""
    # "initializing" is reported for as long as the startup hook has not set the engine
    status = "healthy" if engine else "initializing"
    engine_ready = engine is not None
    return {
        "status": status,
        "model": model_name,
        "ready": engine_ready,
        "task": "embedding"
    }
@app.get("/v1/models")
async def list_models():
    """OpenAI-compatible model listing containing the single served model."""
    # Static descriptor; "root" carries the full HuggingFace repo id
    served_model = {
        "id": "bge-large-en-v1.5",
        "object": "model",
        "created": 1234567890,
        "owned_by": "pivoine-gpu",
        "permission": [],
        "root": model_name,
        "parent": None,
    }
    return {"object": "list", "data": [served_model]}
@app.post("/v1/embeddings")
async def create_embeddings(request: EmbeddingRequest):
    """OpenAI-compatible embeddings endpoint.

    Sends every input text through the engine's pooling (``encode``) path and
    returns one embedding per input, in input order. Token usage is taken from
    the prompt token ids reported by the engine.

    Args:
        request: Parsed request body. ``input`` may be one string or a list of
            strings. ``encoding_format == "base64"`` returns each vector as a
            base64 string of little-endian float32 values; any other value
            returns plain float lists.

    Returns:
        A dict in the OpenAI list shape (``object``, ``data``, ``model``,
        ``usage``), or a ``JSONResponse`` with status 503 when the engine has
        not been initialized yet, or status 500 when the engine fails.

    Note:
        The engine must be running the model as a pooling/embedding model.
        If it is not, ``encode`` fails and the error is reported as a 500
        instead of returning fabricated vectors.
    """
    if not engine:
        return JSONResponse(
            status_code=503,
            content={"error": "Engine not initialized"}
        )

    # Handle both single input and batch inputs
    inputs = [request.input] if isinstance(request.input, str) else request.input

    try:
        # Submit all texts at once so the engine can schedule them together;
        # gather() keeps the results in input order.
        results = await asyncio.gather(*(_embed_text(text) for text in inputs))
    except Exception as e:
        logger.error(f"Error generating embedding: {e}")
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to generate embedding: {str(e)}"}
        )

    as_base64 = request.encoding_format == "base64"
    embeddings = []
    total_tokens = 0
    for idx, (vector, token_count) in enumerate(results):
        embeddings.append({
            "object": "embedding",
            "embedding": _encode_base64(vector) if as_base64 else vector,
            "index": idx,
        })
        total_tokens += token_count

    return {
        "object": "list",
        "data": embeddings,
        "model": request.model,
        "usage": {
            "prompt_tokens": total_tokens,
            "total_tokens": total_tokens,
        }
    }


async def _embed_text(text: str):
    """Pool one text through the engine; return ``(vector, prompt_token_count)``."""
    from vllm import PoolingParams

    final_output = None
    async for output in engine.encode(text, PoolingParams(), random_uuid()):
        final_output = output
    if final_output is None:
        raise RuntimeError("Engine returned no output")

    pooled = final_output.outputs
    # Newer vLLM releases expose the pooled tensor as `.data`; older releases
    # expose a ready-made list as `.embedding`.
    tensor = getattr(pooled, "data", None)
    values = tensor.tolist() if tensor is not None else pooled.embedding
    token_ids = final_output.prompt_token_ids or []
    return [float(v) for v in values], len(token_ids)


def _encode_base64(vector):
    """Serialize a float vector as base64 over little-endian float32 bytes."""
    import base64
    import struct

    packed = struct.pack(f"<{len(vector)}f", *vector)
    return base64.b64encode(packed).decode("ascii")
 # Script entry point: serve the FastAPI app with uvicorn
 if __name__ == "__main__":
    import uvicorn
    # Dedicated embedding server configuration: listen on all interfaces
    host = "0.0.0.0"
    # port already defined at top of file as 8002
    logger.info(f"Starting vLLM embedding server on {host}:{port}")
    # Logged at WARNING level (instead of INFO with a "WARNING:" text prefix)
    # so that level-based log filtering picks the notice up
    logger.warning("This is a placeholder implementation.")
    logger.warning("For production use, vLLM needs --task embed support or use sentence-transformers directly.")
    uvicorn.run(
        app,
        host=host,
        port=port,
        log_level="info",
        access_log=True,
    )