From d74a7cb7cb819882d2b021ca866a73ba70d226ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Kr=C3=BCger?= <valknar@pivoine.art>
Date: Mon, 24 Nov 2025 19:27:18 +0100
Subject: [PATCH] fix: replace custom Pivoine node with direct DiffRhythm patch

- Remove custom PivoineDiffRhythmRun wrapper node
- Add git patch file for ComfyUI_DiffRhythm __init__.py
- Patch adds LlamaConfig fix at import time
- Add arty script 'fix/diffrhythm-patch' to apply patch
- Revert all workflows to use original DiffRhythmRun
- Remove startup_patch.py and revert start.sh

This approach is cleaner and more maintainable than wrapping the node.
The patch directly fixes the tensor dimension mismatch (32 vs 64) in
DiffRhythm's rotary position embeddings by ensuring num_attention_heads
and num_key_value_heads are properly set based on hidden_size.

References:
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
- https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
---
 arty.yml                                      |  53 ++++++---
 comfyui/nodes/__init__.py                     |  20 ----
 comfyui/nodes/pivoine_diffrhythm.py           | 101 ------------------
 .../patches/diffrhythm-llamaconfig-fix.patch  |  56 ++++++++++
 .../diffrhythm-full-length-t2m-v1.json        |   2 +-
 .../diffrhythm-random-generation-v1.json      |   2 +-
 .../diffrhythm-reference-based-v1.json        |   2 +-
 .../diffrhythm-simple-t2m-v1.json             |   2 +-
 8 files changed, 96 insertions(+), 142 deletions(-)
 delete mode 100644 comfyui/nodes/__init__.py
 delete mode 100644 comfyui/nodes/pivoine_diffrhythm.py
 create mode 100644 comfyui/patches/diffrhythm-llamaconfig-fix.patch

diff --git a/arty.yml b/arty.yml
index 2156ee0..aa97a8f 100644
--- a/arty.yml
+++ b/arty.yml
@@ -396,37 +396,56 @@ scripts:
     echo ""
     echo "Category: 🌸Pivoine/Audio"
 
-  fix/diffrhythm-transformers: |
+  fix/diffrhythm-patch: |
     echo "========================================="
-    echo "  Fix DiffRhythm Transformers Version"
+    echo "  Apply DiffRhythm LlamaConfig Patch"
     echo "========================================="
     echo ""
     echo "Issue: Tensor dimension mismatch (32 vs 64) in rotary embeddings"
-    echo "Solution: Downgrade transformers to 4.49.0"
+    echo "Solution: Patch DiffRhythm __init__.py to fix LlamaConfig"
     echo ""
     echo "References:"
     echo "  - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44"
     echo "  - https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48"
     echo ""
 
-    cd /workspace/ComfyUI
-    source venv/bin/activate
+    DIFF_RHYTHM_DIR="/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm"
+    PATCH_FILE="/workspace/ai/comfyui/patches/diffrhythm-llamaconfig-fix.patch"
 
-    echo "Current transformers version:"
-    pip show transformers | grep Version
-    echo ""
+    if [ ! -d "$DIFF_RHYTHM_DIR" ]; then
+      echo "✗ Error: DiffRhythm not found at $DIFF_RHYTHM_DIR"
+      exit 1
+    fi
 
-    echo "Installing transformers==4.49.0..."
-    pip install transformers==4.49.0
+    if [ ! -f "$PATCH_FILE" ]; then
+      echo "✗ Error: Patch file not found at $PATCH_FILE"
+      exit 1
+    fi
 
-    echo ""
-    echo "✓ Transformers downgraded to 4.49.0"
-    echo ""
-    echo "Next steps:"
-    echo "  1. Restart ComfyUI: arty services/comfyui/restart"
-    echo "  2. Test DiffRhythm workflows"
+    cd "$DIFF_RHYTHM_DIR"
 
-    deactivate
+    echo "Checking if patch already applied..."
+    # PatchedLlamaConfig is introduced by the patch, so its presence marks an already-patched file.
+    if grep -q "PatchedLlamaConfig" __init__.py; then
+      echo "✓ Patch already applied!"
+      exit 0
+    fi
+
+    echo "Applying patch..."
+    # Branch on patch's own exit status; a separate `$?` test never runs under `set -e`.
+    if patch -p1 < "$PATCH_FILE"; then
+      echo ""
+      echo "✓ Patch applied successfully!"
+      echo ""
+      echo "Next steps:"
+      echo "  1. Restart ComfyUI: arty services/comfyui/restart"
+      echo "  2. Test DiffRhythm workflows"
+    else
+      echo ""
+      echo "✗ Failed to apply patch"
+      echo "You may need to manually apply the patch or check for conflicts"
+      exit 1
+    fi
 
   setup/comfyui-extensions-deps: |
     echo "========================================="
diff --git a/comfyui/nodes/__init__.py b/comfyui/nodes/__init__.py
deleted file mode 100644
index f28d2d5..0000000
--- a/comfyui/nodes/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-Pivoine Custom ComfyUI Nodes
-Custom node extensions and wrappers for RunPod deployment
-
-Author: valknar@pivoine.art
-"""
-
-from .pivoine_diffrhythm import NODE_CLASS_MAPPINGS as DIFFRHYTHM_MAPPINGS
-from .pivoine_diffrhythm import NODE_DISPLAY_NAME_MAPPINGS as DIFFRHYTHM_DISPLAY
-
-# Combine all node mappings
-NODE_CLASS_MAPPINGS = {
-    **DIFFRHYTHM_MAPPINGS,
-}
-
-NODE_DISPLAY_NAME_MAPPINGS = {
-    **DIFFRHYTHM_DISPLAY,
-}
-
-__all__ = ['NODE_CLASS_MAPPINGS', 'NODE_DISPLAY_NAME_MAPPINGS']
diff --git a/comfyui/nodes/pivoine_diffrhythm.py b/comfyui/nodes/pivoine_diffrhythm.py
deleted file mode 100644
index 7f159fe..0000000
--- a/comfyui/nodes/pivoine_diffrhythm.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-Pivoine DiffRhythm Node
-Custom wrapper for DiffRhythm that fixes LlamaConfig initialization issues
-with transformers 4.49.0+ to prevent tensor dimension mismatches.
-
-Known Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
-num_key_value_heads in LlamaConfig, causing "The size of tensor a (32) must
-match the size of tensor b (64)" error in rotary position embeddings.
-
-This patch globally intercepts LlamaConfig at import time.
-
-Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
-Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
-
-Author: valknar@pivoine.art
-"""
-
-import sys
-sys.path.append('/workspace/ComfyUI/custom_nodes/ComfyUI_DiffRhythm')
-
-# CRITICAL: Patch LlamaConfig BEFORE any DiffRhythm imports
-# This must happen at module import time, not at runtime
-from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
-
-class PatchedLlamaConfig(_OriginalLlamaConfig):
-    """
-    Patched LlamaConfig that automatically adds missing attention head parameters.
-
-    Fixes the tensor dimension mismatch (32 vs 64) in DiffRhythm's rotary
-    position embeddings by ensuring num_attention_heads and num_key_value_heads
-    are properly set based on hidden_size.
-    """
-    def __init__(self, *args, **kwargs):
-        # If hidden_size is provided but num_attention_heads is not, calculate it
-        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
-            hidden_size = kwargs['hidden_size']
-            # Standard Llama architecture: head_dim = 64, so num_heads = hidden_size // 64
-            kwargs['num_attention_heads'] = hidden_size // 64
-
-        # If num_key_value_heads is not provided, use GQA configuration
-        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
-            # For GQA (Grouped Query Attention), typically num_kv_heads = num_heads // 4
-            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
-
-        # Call original __init__ with patched parameters
-        super().__init__(*args, **kwargs)
-
-# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
-import transformers.models.llama
-transformers.models.llama.LlamaConfig = PatchedLlamaConfig
-
-# Also replace in modeling_llama module if it's already imported
-import transformers.models.llama.modeling_llama
-transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
-
-# Now import DiffRhythm modules - they will use our patched LlamaConfig
-import infer_utils
-
-# Monkey-patch decode_audio to force chunked=False
-_original_decode_audio = infer_utils.decode_audio
-
-def patched_decode_audio(latent, vae_model, chunked=True):
-    """Patched version that always uses chunked=False"""
-    return _original_decode_audio(latent, vae_model, chunked=False)
-
-infer_utils.decode_audio = patched_decode_audio
-
-# Import DiffRhythm node
-from DiffRhythmNode import DiffRhythmRun
-
-class PivoineDiffRhythmRun(DiffRhythmRun):
-    """
-    Pivoine version of DiffRhythmRun with enhanced compatibility and error handling.
-
-    Changes from original:
-    - Globally patches LlamaConfig to add missing num_attention_heads and num_key_value_heads
-    - Monkey-patches decode_audio to always use chunked=False for stability
-    - Fixes tensor dimension mismatch in rotary position embeddings (32 vs 64)
-    - Compatible with transformers 4.49.0+
-    - Requires ~12-16GB VRAM, works reliably on RTX 4090
-
-    Technical details:
-    - Sets num_attention_heads = hidden_size // 64 (standard Llama architecture)
-    - Sets num_key_value_heads = num_attention_heads // 4 (GQA configuration)
-    - This ensures head_dim = hidden_size // num_attention_heads = 64 (not 32)
-    - Patch is applied globally at import time, affecting all LlamaConfig instances
-    """
-
-    CATEGORY = "🌸Pivoine/Audio"
-
-    @classmethod
-    def INPUT_TYPES(cls):
-        return super().INPUT_TYPES()
-
-NODE_CLASS_MAPPINGS = {
-    "PivoineDiffRhythmRun": PivoineDiffRhythmRun,
-}
-
-NODE_DISPLAY_NAME_MAPPINGS = {
-    "PivoineDiffRhythmRun": "Pivoine DiffRhythm Run",
-}
diff --git a/comfyui/patches/diffrhythm-llamaconfig-fix.patch b/comfyui/patches/diffrhythm-llamaconfig-fix.patch
new file mode 100644
index 0000000..231c6c0
--- /dev/null
+++ b/comfyui/patches/diffrhythm-llamaconfig-fix.patch
@@ -0,0 +1,56 @@
+diff --git a/__init__.py b/__init__.py
+index 1234567..abcdefg 100644
+--- a/__init__.py
++++ b/__init__.py
+@@ -1,3 +1,51 @@
++"""
++DiffRhythm ComfyUI Node with LlamaConfig Patch
++
++PATCH: Fixes "The size of tensor a (32) must match the size of tensor b (64)" error
++in DiffRhythm's rotary position embeddings by patching LlamaConfig initialization.
++
++Issue: DiffRhythm's DIT model doesn't specify num_attention_heads and
++num_key_value_heads when creating LlamaConfig, causing transformers 4.49.0+
++to incorrectly infer head_dim = 32 instead of 64.
++
++Solution: Patch LlamaConfig globally before importing DiffRhythmNode.
++
++Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/44
++Reference: https://github.com/billwuhao/ComfyUI_DiffRhythm/issues/48
++
++Patch author: valknar@pivoine.art
++"""
++
++# CRITICAL: Patch LlamaConfig BEFORE importing DiffRhythmNode
++from transformers.models.llama import LlamaConfig as _OriginalLlamaConfig
++
++class PatchedLlamaConfig(_OriginalLlamaConfig):
++    """
++    Patched LlamaConfig that automatically adds missing attention head parameters.
++
++    Defaults applied only when the caller omits them (keyword arguments only):
++    - num_attention_heads = max(1, hidden_size // 64), targeting head_dim = 64
++    - num_key_value_heads = max(1, num_attention_heads // 4) (for GQA)
++    Explicitly passed values are forwarded to the original class unchanged.
++    """
++    def __init__(self, *args, **kwargs):
++        # Derive the head count from hidden_size when the caller left it out;
++        # clamp to 1 so hidden_size < 64 cannot produce a zero-head config.
++        if 'hidden_size' in kwargs and 'num_attention_heads' not in kwargs:
++            kwargs['num_attention_heads'] = max(1, kwargs['hidden_size'] // 64)
++
++        # If num_key_value_heads is not provided, use GQA configuration
++        if 'num_attention_heads' in kwargs and 'num_key_value_heads' not in kwargs:
++            kwargs['num_key_value_heads'] = max(1, kwargs['num_attention_heads'] // 4)
++
++        super().__init__(*args, **kwargs)
++
++# Replace LlamaConfig in transformers module BEFORE DiffRhythm imports it
++import transformers.models.llama
++transformers.models.llama.LlamaConfig = PatchedLlamaConfig
++import transformers.models.llama.modeling_llama
++transformers.models.llama.modeling_llama.LlamaConfig = PatchedLlamaConfig
++
+ from .DiffRhythmNode import  NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS
+
+ __all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
diff --git a/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
index 0f47f2b..85db7d4 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-full-length-t2m-v1.json
@@ -4,7 +4,7 @@
   "nodes": [
     {
       "id": 1,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [100, 100],
       "size": [400, 400],
       "flags": {},
diff --git a/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json b/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
index 5ebaa9c..55ba62f 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-random-generation-v1.json
@@ -4,7 +4,7 @@
   "nodes": [
     {
       "id": 1,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [100, 100],
       "size": [400, 400],
       "flags": {},
diff --git a/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
index b4f924a..efe8b93 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-reference-based-v1.json
@@ -27,7 +27,7 @@
     },
     {
       "id": 2,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [500, 100],
       "size": [400, 450],
       "flags": {},
diff --git a/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
index a1b069d..3f422cb 100644
--- a/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
+++ b/comfyui/workflows/text-to-music/diffrhythm-simple-t2m-v1.json
@@ -4,7 +4,7 @@
   "nodes": [
     {
       "id": 1,
-      "type": "PivoineDiffRhythmRun",
+      "type": "DiffRhythmRun",
       "pos": [100, 100],
       "size": [400, 400],
       "flags": {},